Diffstat (limited to 'vnet/vnet')
-rw-r--r--  vnet/vnet/api_errno.h  80
-rw-r--r--  vnet/vnet/buffer.h  303
-rw-r--r--  vnet/vnet/classify/README  180
-rw-r--r--  vnet/vnet/classify/input_acl.c  293
-rw-r--r--  vnet/vnet/classify/input_acl.h  54
-rw-r--r--  vnet/vnet/classify/ip_classify.c  384
-rw-r--r--  vnet/vnet/classify/vnet_classify.c  1895
-rw-r--r--  vnet/vnet/classify/vnet_classify.h  414
-rw-r--r--  vnet/vnet/config.c  322
-rw-r--r--  vnet/vnet/config.h  160
-rw-r--r--  vnet/vnet/devices/dpdk/cli.c  974
-rw-r--r--  vnet/vnet/devices/dpdk/device.c  1483
-rw-r--r--  vnet/vnet/devices/dpdk/dpdk.h  515
-rw-r--r--  vnet/vnet/devices/dpdk/dpdk_priv.h  437
-rw-r--r--  vnet/vnet/devices/dpdk/init.c  1728
-rw-r--r--  vnet/vnet/devices/dpdk/node.c  2010
-rw-r--r--  vnet/vnet/devices/dpdk/threads.c  378
-rw-r--r--  vnet/vnet/devices/dpdk/threads.h  30
-rw-r--r--  vnet/vnet/devices/dpdk/vhost_user.c  1550
-rw-r--r--  vnet/vnet/devices/ssvm/node.c  323
-rw-r--r--  vnet/vnet/devices/ssvm/ssvm_eth.c  475
-rw-r--r--  vnet/vnet/devices/ssvm/ssvm_eth.h  128
-rw-r--r--  vnet/vnet/devices/virtio/vhost-user.c  1957
-rw-r--r--  vnet/vnet/devices/virtio/vhost-user.h  222
-rw-r--r--  vnet/vnet/dhcp/client.c  960
-rw-r--r--  vnet/vnet/dhcp/client.h  118
-rw-r--r--  vnet/vnet/dhcp/packet.h  61
-rw-r--r--  vnet/vnet/dhcp/proxy.h  92
-rw-r--r--  vnet/vnet/dhcp/proxy_error.def  30
-rw-r--r--  vnet/vnet/dhcp/proxy_node.c  1144
-rw-r--r--  vnet/vnet/dhcpv6/packet.h  183
-rw-r--r--  vnet/vnet/dhcpv6/proxy.h  88
-rw-r--r--  vnet/vnet/dhcpv6/proxy_error.def  27
-rw-r--r--  vnet/vnet/dhcpv6/proxy_node.c  1046
-rw-r--r--  vnet/vnet/dpdk_replication.h  107
-rw-r--r--  vnet/vnet/ethernet/arp.c  1853
-rw-r--r--  vnet/vnet/ethernet/arp_packet.h  134
-rw-r--r--  vnet/vnet/ethernet/cli.c  124
-rw-r--r--  vnet/vnet/ethernet/error.def  46
-rw-r--r--  vnet/vnet/ethernet/ethernet.h  450
-rw-r--r--  vnet/vnet/ethernet/format.c  301
-rw-r--r--  vnet/vnet/ethernet/init.c  95
-rw-r--r--  vnet/vnet/ethernet/interface.c  462
-rw-r--r--  vnet/vnet/ethernet/mac_swap.c  380
-rw-r--r--  vnet/vnet/ethernet/node.c  1112
-rw-r--r--  vnet/vnet/ethernet/packet.h  95
-rw-r--r--  vnet/vnet/ethernet/pg.c  177
-rw-r--r--  vnet/vnet/ethernet/types.def  112
-rw-r--r--  vnet/vnet/flow/flow_report.c  266
-rw-r--r--  vnet/vnet/flow/flow_report.h  107
-rw-r--r--  vnet/vnet/flow/flow_report_sample.c  397
-rw-r--r--  vnet/vnet/flow/flow_report_sample.h  26
-rw-r--r--  vnet/vnet/flow/ipfix_info_elements.h  429
-rw-r--r--  vnet/vnet/flow/ipfix_packet.h  188
-rw-r--r--  vnet/vnet/global_funcs.h  24
-rw-r--r--  vnet/vnet/gre/error.def  23
-rw-r--r--  vnet/vnet/gre/gre.c  512
-rw-r--r--  vnet/vnet/gre/gre.h  118
-rw-r--r--  vnet/vnet/gre/interface.c  150
-rw-r--r--  vnet/vnet/gre/node.c  533
-rw-r--r--  vnet/vnet/gre/packet.h  54
-rw-r--r--  vnet/vnet/gre/pg.c  77
-rw-r--r--  vnet/vnet/hdlc/error.def  42
-rw-r--r--  vnet/vnet/hdlc/hdlc.c  250
-rw-r--r--  vnet/vnet/hdlc/hdlc.h  127
-rw-r--r--  vnet/vnet/hdlc/node.c  351
-rw-r--r--  vnet/vnet/hdlc/packet.h  72
-rw-r--r--  vnet/vnet/hdlc/pg.c  105
-rw-r--r--  vnet/vnet/interface.c  1051
-rw-r--r--  vnet/vnet/interface.h  490
-rw-r--r--  vnet/vnet/interface_cli.c  769
-rw-r--r--  vnet/vnet/interface_format.c  333
-rw-r--r--  vnet/vnet/interface_funcs.h  212
-rw-r--r--  vnet/vnet/interface_output.c  1311
-rw-r--r--  vnet/vnet/ip/format.c  119
-rw-r--r--  vnet/vnet/ip/format.h  89
-rw-r--r--  vnet/vnet/ip/icmp4.c  734
-rw-r--r--  vnet/vnet/ip/icmp46_packet.h  392
-rw-r--r--  vnet/vnet/ip/icmp6.c  814
-rw-r--r--  vnet/vnet/ip/icmp6.h  70
-rw-r--r--  vnet/vnet/ip/igmp_packet.h  140
-rw-r--r--  vnet/vnet/ip/ip.h  222
-rw-r--r--  vnet/vnet/ip/ip4.h  409
-rw-r--r--  vnet/vnet/ip/ip46_cli.c  158
-rw-r--r--  vnet/vnet/ip/ip4_error.h  86
-rw-r--r--  vnet/vnet/ip/ip4_format.c  243
-rw-r--r--  vnet/vnet/ip/ip4_forward.c  3564
-rw-r--r--  vnet/vnet/ip/ip4_hop_by_hop.c  320
-rw-r--r--  vnet/vnet/ip/ip4_input.c  423
-rw-r--r--  vnet/vnet/ip/ip4_mtrie.c  561
-rw-r--r--  vnet/vnet/ip/ip4_mtrie.h  161
-rw-r--r--  vnet/vnet/ip/ip4_packet.h  314
-rw-r--r--  vnet/vnet/ip/ip4_pg.c  387
-rw-r--r--  vnet/vnet/ip/ip4_source_check.c  369
-rw-r--r--  vnet/vnet/ip/ip4_test.c  311
-rw-r--r--  vnet/vnet/ip/ip6.h  503
-rw-r--r--  vnet/vnet/ip/ip6_error.h  85
-rw-r--r--  vnet/vnet/ip/ip6_format.c  322
-rw-r--r--  vnet/vnet/ip/ip6_forward.c  2724
-rw-r--r--  vnet/vnet/ip/ip6_hop_by_hop.c  1139
-rw-r--r--  vnet/vnet/ip/ip6_hop_by_hop.h  35
-rw-r--r--  vnet/vnet/ip/ip6_hop_by_hop_packet.h  67
-rw-r--r--  vnet/vnet/ip/ip6_input.c  317
-rw-r--r--  vnet/vnet/ip/ip6_neighbor.c  3146
-rw-r--r--  vnet/vnet/ip/ip6_packet.h  378
-rw-r--r--  vnet/vnet/ip/ip6_pg.c  222
-rw-r--r--  vnet/vnet/ip/ip_checksum.c  184
-rw-r--r--  vnet/vnet/ip/ip_frag.c  449
-rw-r--r--  vnet/vnet/ip/ip_frag.h  81
-rw-r--r--  vnet/vnet/ip/ip_init.c  153
-rw-r--r--  vnet/vnet/ip/ip_input_acl.c  394
-rw-r--r--  vnet/vnet/ip/ip_packet.h  183
-rw-r--r--  vnet/vnet/ip/lookup.c  2271
-rw-r--r--  vnet/vnet/ip/lookup.h  442
-rw-r--r--  vnet/vnet/ip/ports.def  757
-rw-r--r--  vnet/vnet/ip/protocols.def  162
-rw-r--r--  vnet/vnet/ip/tcp.c  2983
-rw-r--r--  vnet/vnet/ip/tcp.h  396
-rw-r--r--  vnet/vnet/ip/tcp_format.c  132
-rw-r--r--  vnet/vnet/ip/tcp_init.c  65
-rw-r--r--  vnet/vnet/ip/tcp_packet.h  118
-rw-r--r--  vnet/vnet/ip/tcp_pg.c  224
-rw-r--r--  vnet/vnet/ip/udp.h  113
-rw-r--r--  vnet/vnet/ip/udp_error.def  20
-rw-r--r--  vnet/vnet/ip/udp_format.c  83
-rw-r--r--  vnet/vnet/ip/udp_init.c  63
-rw-r--r--  vnet/vnet/ip/udp_local.c  508
-rw-r--r--  vnet/vnet/ip/udp_packet.h  56
-rw-r--r--  vnet/vnet/ip/udp_pg.c  233
-rw-r--r--  vnet/vnet/ipsec/esp.h  140
-rw-r--r--  vnet/vnet/ipsec/esp_decrypt.c  424
-rw-r--r--  vnet/vnet/ipsec/esp_encrypt.c  386
-rw-r--r--  vnet/vnet/ipsec/ikev2.c  2142
-rw-r--r--  vnet/vnet/ipsec/ikev2.h  383
-rw-r--r--  vnet/vnet/ipsec/ikev2_cli.c  437
-rw-r--r--  vnet/vnet/ipsec/ikev2_crypto.c  753
-rw-r--r--  vnet/vnet/ipsec/ikev2_format.c  155
-rw-r--r--  vnet/vnet/ipsec/ikev2_payload.c  492
-rw-r--r--  vnet/vnet/ipsec/ikev2_priv.h  282
-rw-r--r--  vnet/vnet/ipsec/ipsec.c  535
-rw-r--r--  vnet/vnet/ipsec/ipsec.h  292
-rw-r--r--  vnet/vnet/ipsec/ipsec_cli.c  710
-rw-r--r--  vnet/vnet/ipsec/ipsec_format.c  133
-rw-r--r--  vnet/vnet/ipsec/ipsec_if.c  199
-rw-r--r--  vnet/vnet/ipsec/ipsec_if_in.c  151
-rw-r--r--  vnet/vnet/ipsec/ipsec_if_out.c  140
-rw-r--r--  vnet/vnet/ipsec/ipsec_input.c  406
-rw-r--r--  vnet/vnet/ipsec/ipsec_output.c  405
-rw-r--r--  vnet/vnet/l2/feat_bitmap.c  166
-rw-r--r--  vnet/vnet/l2/feat_bitmap.h  80
-rw-r--r--  vnet/vnet/l2/l2_bd.c  695
-rw-r--r--  vnet/vnet/l2/l2_bd.h  120
-rw-r--r--  vnet/vnet/l2/l2_bvi.c  35
-rw-r--r--  vnet/vnet/l2/l2_bvi.h  122
-rw-r--r--  vnet/vnet/l2/l2_classify.c  551
-rw-r--r--  vnet/vnet/l2/l2_classify.h  76
-rw-r--r--  vnet/vnet/l2/l2_efp_filter.c  572
-rw-r--r--  vnet/vnet/l2/l2_efp_filter.h  28
-rw-r--r--  vnet/vnet/l2/l2_fib.c  567
-rw-r--r--  vnet/vnet/l2/l2_fib.h  226
-rw-r--r--  vnet/vnet/l2/l2_flood.c  520
-rw-r--r--  vnet/vnet/l2/l2_flood.h  28
-rw-r--r--  vnet/vnet/l2/l2_fwd.c  446
-rw-r--r--  vnet/vnet/l2/l2_fwd.h  29
-rw-r--r--  vnet/vnet/l2/l2_input.c  963
-rw-r--r--  vnet/vnet/l2/l2_input.h  279
-rw-r--r--  vnet/vnet/l2/l2_input_acl.c  427
-rw-r--r--  vnet/vnet/l2/l2_input_vtr.c  314
-rw-r--r--  vnet/vnet/l2/l2_input_vtr.h  43
-rw-r--r--  vnet/vnet/l2/l2_learn.c  504
-rw-r--r--  vnet/vnet/l2/l2_learn.h  47
-rw-r--r--  vnet/vnet/l2/l2_output.c  541
-rw-r--r--  vnet/vnet/l2/l2_output.h  219
-rw-r--r--  vnet/vnet/l2/l2_output_acl.c  335
-rw-r--r--  vnet/vnet/l2/l2_patch.c  432
-rw-r--r--  vnet/vnet/l2/l2_vtr.c  448
-rw-r--r--  vnet/vnet/l2/l2_vtr.h  167
-rw-r--r--  vnet/vnet/l2/l2_xcrw.c  559
-rw-r--r--  vnet/vnet/l2/l2_xcrw.h  78
-rw-r--r--  vnet/vnet/l2tp/decap.c  253
-rw-r--r--  vnet/vnet/l2tp/encap.c  217
-rw-r--r--  vnet/vnet/l2tp/l2tp.c  696
-rw-r--r--  vnet/vnet/l2tp/l2tp.h  131
-rw-r--r--  vnet/vnet/l2tp/packet.h  33
-rw-r--r--  vnet/vnet/l2tp/pg.c  94
-rw-r--r--  vnet/vnet/l3_types.h  50
-rw-r--r--  vnet/vnet/lawful-intercept/lawful_intercept.c  111
-rw-r--r--  vnet/vnet/lawful-intercept/lawful_intercept.h  46
-rw-r--r--  vnet/vnet/lawful-intercept/node.c  272
-rw-r--r--  vnet/vnet/lisp-gpe/decap.c  298
-rw-r--r--  vnet/vnet/lisp-gpe/encap.c  299
-rw-r--r--  vnet/vnet/lisp-gpe/lisp_gpe.c  498
-rw-r--r--  vnet/vnet/lisp-gpe/lisp_gpe.h  137
-rw-r--r--  vnet/vnet/lisp-gpe/lisp_gpe_error.def  16
-rw-r--r--  vnet/vnet/lisp-gpe/lisp_gpe_packet.h  123
-rw-r--r--  vnet/vnet/lisp-gpe/rfc.txt  826
-rw-r--r--  vnet/vnet/llc/llc.c  230
-rw-r--r--  vnet/vnet/llc/llc.h  183
-rw-r--r--  vnet/vnet/llc/node.c  308
-rw-r--r--  vnet/vnet/llc/pg.c  104
-rwxr-xr-x  vnet/vnet/map/examples/gen-rules.py  213
-rwxr-xr-x  vnet/vnet/map/examples/map-test.py  214
-rw-r--r--  vnet/vnet/map/examples/mapalgs.py  327
-rw-r--r--  vnet/vnet/map/examples/mt-test.py  80
-rwxr-xr-x  vnet/vnet/map/gen-rules.py  107
-rw-r--r--  vnet/vnet/map/ip4_map.c  591
-rw-r--r--  vnet/vnet/map/ip4_map_t.c  1092
-rw-r--r--  vnet/vnet/map/ip4_sixrd.c  127
-rw-r--r--  vnet/vnet/map/ip6_map.c  966
-rw-r--r--  vnet/vnet/map/ip6_map_t.c  1141
-rw-r--r--  vnet/vnet/map/ip6_sixrd.c  129
-rw-r--r--  vnet/vnet/map/map.c  1634
-rw-r--r--  vnet/vnet/map/map.h  556
-rw-r--r--  vnet/vnet/map/map_doc.md  69
-rw-r--r--  vnet/vnet/map/sixrd.c  355
-rw-r--r--  vnet/vnet/map/sixrd.h  144
-rw-r--r--  vnet/vnet/mcast/mcast.c  563
-rw-r--r--  vnet/vnet/mcast/mcast.h  50
-rw-r--r--  vnet/vnet/mcast/mcast_test.c  149
-rw-r--r--  vnet/vnet/misc.c  93
-rw-r--r--  vnet/vnet/mpls-gre/error.def  28
-rw-r--r--  vnet/vnet/mpls-gre/interface.c  1930
-rw-r--r--  vnet/vnet/mpls-gre/mpls.c  769
-rw-r--r--  vnet/vnet/mpls-gre/mpls.h  231
-rw-r--r--  vnet/vnet/mpls-gre/node.c  359
-rw-r--r--  vnet/vnet/mpls-gre/packet.h  49
-rw-r--r--  vnet/vnet/mpls-gre/pg.c  71
-rw-r--r--  vnet/vnet/mpls-gre/policy_encap.c  172
-rw-r--r--  vnet/vnet/nsh-gre/decap.c  365
-rw-r--r--  vnet/vnet/nsh-gre/encap.c  303
-rw-r--r--  vnet/vnet/nsh-gre/nsh_gre.c  537
-rw-r--r--  vnet/vnet/nsh-gre/nsh_gre.h  123
-rw-r--r--  vnet/vnet/nsh-gre/nsh_gre_error.def  17
-rw-r--r--  vnet/vnet/nsh-gre/nsh_gre_packet.h  93
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/decap.c  365
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/encap.c  349
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.c  562
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h  152
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe_error.def  16
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/vxlan-gpe-rfc.txt  868
-rw-r--r--  vnet/vnet/nsh-vxlan-gpe/vxlan_gpe_packet.h  74
-rw-r--r--  vnet/vnet/osi/node.c  302
-rw-r--r--  vnet/vnet/osi/osi.c  186
-rw-r--r--  vnet/vnet/osi/osi.h  163
-rw-r--r--  vnet/vnet/osi/pg.c  98
-rw-r--r--  vnet/vnet/pg/cli.c  438
-rw-r--r--  vnet/vnet/pg/edit.c  179
-rw-r--r--  vnet/vnet/pg/edit.h  198
-rw-r--r--  vnet/vnet/pg/example.script  6
-rw-r--r--  vnet/vnet/pg/init.c  78
-rw-r--r--  vnet/vnet/pg/input.c  1745
-rw-r--r--  vnet/vnet/pg/output.c  52
-rw-r--r--  vnet/vnet/pg/pg.h  347
-rw-r--r--  vnet/vnet/pg/stream.c  420
-rw-r--r--  vnet/vnet/pipeline.h  453
-rw-r--r--  vnet/vnet/plugin/p1.c  117
-rw-r--r--  vnet/vnet/plugin/plugin.h  32
-rw-r--r--  vnet/vnet/policer/fix_types.h  31
-rw-r--r--  vnet/vnet/policer/node_funcs.c  451
-rw-r--r--  vnet/vnet/policer/police.h  178
-rw-r--r--  vnet/vnet/policer/policer.c  366
-rw-r--r--  vnet/vnet/policer/policer.h  60
-rw-r--r--  vnet/vnet/policer/xlate.c  1380
-rw-r--r--  vnet/vnet/policer/xlate.h  141
-rw-r--r--  vnet/vnet/ppp/error.def  42
-rw-r--r--  vnet/vnet/ppp/node.c  337
-rw-r--r--  vnet/vnet/ppp/packet.h  189
-rw-r--r--  vnet/vnet/ppp/pg.c  105
-rw-r--r--  vnet/vnet/ppp/ppp.c  250
-rw-r--r--  vnet/vnet/ppp/ppp.h  127
-rw-r--r--  vnet/vnet/replication.c  276
-rw-r--r--  vnet/vnet/replication.h  123
-rw-r--r--  vnet/vnet/rewrite.c  270
-rw-r--r--  vnet/vnet/rewrite.h  281
-rw-r--r--  vnet/vnet/snap/node.c  343
-rw-r--r--  vnet/vnet/snap/pg.c  108
-rw-r--r--  vnet/vnet/snap/snap.c  189
-rw-r--r--  vnet/vnet/snap/snap.h  196
-rw-r--r--  vnet/vnet/sr/rfc_draft_05.txt  1265
-rw-r--r--  vnet/vnet/sr/sr.c  2228
-rw-r--r--  vnet/vnet/sr/sr.h  132
-rw-r--r--  vnet/vnet/sr/sr_error.def  20
-rw-r--r--  vnet/vnet/sr/sr_fix_dst_error.def  17
-rw-r--r--  vnet/vnet/sr/sr_packet.h  227
-rw-r--r--  vnet/vnet/srp/format.c  147
-rw-r--r--  vnet/vnet/srp/interface.c  458
-rw-r--r--  vnet/vnet/srp/node.c  929
-rw-r--r--  vnet/vnet/srp/packet.h  204
-rw-r--r--  vnet/vnet/srp/pg.c  157
-rw-r--r--  vnet/vnet/srp/srp.h  222
-rw-r--r--  vnet/vnet/unix/gdb_funcs.c  117
-rw-r--r--  vnet/vnet/unix/pcap.c  213
-rw-r--r--  vnet/vnet/unix/pcap.h  187
-rw-r--r--  vnet/vnet/unix/pcap2pg.c  155
-rw-r--r--  vnet/vnet/unix/tapcli.c  1200
-rw-r--r--  vnet/vnet/unix/tapcli.h  29
-rw-r--r--  vnet/vnet/unix/tuntap.c  907
-rw-r--r--  vnet/vnet/unix/tuntap.h  37
-rw-r--r--  vnet/vnet/vcgn/cgn_bitmap.h  133
-rw-r--r--  vnet/vnet/vcgn/cgse_defs.h  88
-rw-r--r--  vnet/vnet/vcgn/cnat_bulk_port.c  964
-rw-r--r--  vnet/vnet/vcgn/cnat_bulk_port.h  157
-rw-r--r--  vnet/vnet/vcgn/cnat_bulk_port_defs.h  57
-rw-r--r--  vnet/vnet/vcgn/cnat_cli.h  206
-rw-r--r--  vnet/vnet/vcgn/cnat_cli_handler.c  947
-rw-r--r--  vnet/vnet/vcgn/cnat_common_api.h  22
-rw-r--r--  vnet/vnet/vcgn/cnat_config.c  77
-rw-r--r--  vnet/vnet/vcgn/cnat_config.h  582
-rw-r--r--  vnet/vnet/vcgn/cnat_config_api.h  46
-rw-r--r--  vnet/vnet/vcgn/cnat_db.h  701
-rw-r--r--  vnet/vnet/vcgn/cnat_db_scanner.c  493
-rw-r--r--  vnet/vnet/vcgn/cnat_db_v2.c  3716
-rw-r--r--  vnet/vnet/vcgn/cnat_debug_msg_handler.c  1780
-rw-r--r--  vnet/vnet/vcgn/cnat_global.c  79
-rw-r--r--  vnet/vnet/vcgn/cnat_global.h  87
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_icmp.h  51
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_icmp_error_inside_input.c  476
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_icmp_error_outside_input.c  452
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input.c  404
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input_exception.c  235
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_icmp_query_outside_input.c  381
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input.c  424
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input_exceptions.c  314
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_tcp_outside_input.c  382
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_udp.h  41
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_udp_inside_input.c  508
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_udp_inside_input_exceptions.c  283
-rw-r--r--  vnet/vnet/vcgn/cnat_ipv4_udp_outside_input.c  605
-rw-r--r--  vnet/vnet/vcgn/cnat_log_api.h  114
-rw-r--r--  vnet/vnet/vcgn/cnat_log_common.h  79
-rw-r--r--  vnet/vnet/vcgn/cnat_logging.c  3566
-rw-r--r--  vnet/vnet/vcgn/cnat_logging.h  1091
-rw-r--r--  vnet/vnet/vcgn/cnat_pcp_server.h  398
-rw-r--r--  vnet/vnet/vcgn/cnat_ports.c  1113
-rw-r--r--  vnet/vnet/vcgn/cnat_ports.h  208
-rw-r--r--  vnet/vnet/vcgn/cnat_show.c  807
-rw-r--r--  vnet/vnet/vcgn/cnat_show_api.h  40
-rw-r--r--  vnet/vnet/vcgn/cnat_show_response.h  580
-rw-r--r--  vnet/vnet/vcgn/cnat_syslog.c  1787
-rw-r--r--  vnet/vnet/vcgn/cnat_syslog.h  190
-rw-r--r--  vnet/vnet/vcgn/cnat_util.c  2257
-rw-r--r--  vnet/vnet/vcgn/cnat_v4_ftp_alg.h  133
-rw-r--r--  vnet/vnet/vcgn/cnat_v4_functions.c  364
-rw-r--r--  vnet/vnet/vcgn/cnat_v4_functions.h  342
-rw-r--r--  vnet/vnet/vcgn/cnat_v4_pptp_alg.h  150
-rw-r--r--  vnet/vnet/vcgn/cnat_v4_tcp_in2out_stages.c  679
-rw-r--r--  vnet/vnet/vcgn/cnat_va_db.c  286
-rw-r--r--  vnet/vnet/vcgn/cnat_va_db.h  121
-rw-r--r--  vnet/vnet/vcgn/dslite_db.h  170
-rw-r--r--  vnet/vnet/vcgn/dslite_defs.h  336
-rw-r--r--  vnet/vnet/vcgn/index_list.c  336
-rw-r--r--  vnet/vnet/vcgn/index_list.h  118
-rw-r--r--  vnet/vnet/vcgn/nat64_db.h  480
-rw-r--r--  vnet/vnet/vcgn/nat64_defs.h  576
-rw-r--r--  vnet/vnet/vcgn/nat64_tcp_sm.h  91
-rw-r--r--  vnet/vnet/vcgn/platform_common.h  136
-rw-r--r--  vnet/vnet/vcgn/platform_common_override.h  304
-rw-r--r--  vnet/vnet/vcgn/spp_ctx.h  76
-rw-r--r--  vnet/vnet/vcgn/spp_platform_trace_log.c  991
-rw-r--r--  vnet/vnet/vcgn/spp_platform_trace_log.h  357
-rw-r--r--  vnet/vnet/vcgn/spp_timers.h  139
-rw-r--r--  vnet/vnet/vcgn/tcp_header_definitions.h  1582
-rw-r--r--  vnet/vnet/vcgn/vcgn_classify.c  1419
-rw-r--r--  vnet/vnet/vcgn/vcgn_db.h  117
-rw-r--r--  vnet/vnet/vnet.h  92
-rw-r--r--  vnet/vnet/vxlan/decap.c  429
-rw-r--r--  vnet/vnet/vxlan/encap.c  449
-rw-r--r--  vnet/vnet/vxlan/vxlan.c  436
-rw-r--r--  vnet/vnet/vxlan/vxlan.h  128
-rw-r--r--  vnet/vnet/vxlan/vxlan_error.def  16
-rw-r--r--  vnet/vnet/vxlan/vxlan_packet.h  66
371 files changed, 158491 insertions, 0 deletions
diff --git a/vnet/vnet/api_errno.h b/vnet/vnet/api_errno.h
new file mode 100644
index 00000000000..b4b55354650
--- /dev/null
+++ b/vnet/vnet/api_errno.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_api_errno_h
+#define included_vnet_api_errno_h
+
+#define foreach_vnet_api_error \
+_(UNSPECIFIED, -1, "Unspecified Error") \
+_(INVALID_SW_IF_INDEX, -2, "Invalid sw_if_index") \
+_(NO_SUCH_FIB, -3, "No such FIB / VRF") \
+_(NO_SUCH_INNER_FIB, -4, "No such inner FIB / VRF") \
+_(NO_SUCH_LABEL, -5, "No such label") \
+_(NO_SUCH_ENTRY, -6, "No such entry") \
+_(INVALID_VALUE, -7, "Invalid value") \
+_(INVALID_VALUE_2, -8, "Invalid value #2") \
+_(UNIMPLEMENTED, -9, "Unimplemented") \
+_(INVALID_SW_IF_INDEX_2, -10, "Invalid sw_if_index #2") \
+_(SYSCALL_ERROR_1, -11, "System call error #1") \
+_(SYSCALL_ERROR_2, -12, "System call error #2") \
+_(SYSCALL_ERROR_3, -13, "System call error #3") \
+_(SYSCALL_ERROR_4, -14, "System call error #4") \
+_(SYSCALL_ERROR_5, -15, "System call error #5") \
+_(SYSCALL_ERROR_6, -16, "System call error #6") \
+_(SYSCALL_ERROR_7, -17, "System call error #7") \
+_(SYSCALL_ERROR_8, -18, "System call error #8") \
+_(SYSCALL_ERROR_9, -19, "System call error #9") \
+_(SYSCALL_ERROR_10, -20, "System call error #10") \
+_(FEATURE_DISABLED, -30, "Feature disabled by configuration") \
+_(INVALID_REGISTRATION, -31, "Invalid registration") \
+_(NEXT_HOP_NOT_IN_FIB, -50, "Next hop not in FIB") \
+_(UNKNOWN_DESTINATION, -51, "Unknown destination") \
+_(PREFIX_MATCHES_NEXT_HOP, -52, "Prefix matches next hop") \
+_(NEXT_HOP_NOT_FOUND_MP, -53, "Next hop not found (multipath)") \
+_(NO_MATCHING_INTERFACE, -54, "No matching interface for probe") \
+_(INVALID_VLAN, -55, "Invalid VLAN") \
+_(VLAN_ALREADY_EXISTS, -56, "VLAN subif already exists") \
+_(INVALID_SRC_ADDRESS, -57, "Invalid src address") \
+_(INVALID_DST_ADDRESS, -58, "Invalid dst address") \
+_(ADDRESS_LENGTH_MISMATCH, -59, "Address length mismatch") \
+_(ADDRESS_NOT_FOUND_FOR_INTERFACE, -60, "Address not found for interface") \
+_(ADDRESS_NOT_LINK_LOCAL, -61, "Address not link-local") \
+_(IP6_NOT_ENABLED, -62, "ip6 not enabled") \
+_(ADDRESS_MATCHES_INTERFACE_ADDRESS, -63, "Address matches interface address") \
+_(IN_PROGRESS, 10, "Operation in progress") \
+_(NO_SUCH_NODE, -64, "No such graph node") \
+_(NO_SUCH_NODE2, -65, "No such graph node #2") \
+_(NO_SUCH_TABLE, -66, "No such table") \
+_(NO_SUCH_TABLE2, -67, "No such table #2") \
+_(NO_SUCH_TABLE3, -68, "No such table #3") \
+_(SUBIF_ALREADY_EXISTS, -69, "Subinterface already exists") \
+_(SUBIF_CREATE_FAILED, -70, "Subinterface creation failed") \
+_(INVALID_MEMORY_SIZE, -71, "Invalid memory size requested") \
+_(INVALID_INTERFACE, -72, "Invalid interface") \
+_(INVALID_VLAN_TAG_COUNT, -73, "Invalid number of tags for requested operation") \
+_(INVALID_ARGUMENT, -74, "Invalid argument") \
+_(UNEXPECTED_INTF_STATE, -75, "Unexpected interface state") \
+_(TUNNEL_EXIST, -76, "Tunnel already exists") \
+_(INVALID_DECAP_NEXT, -77, "Invalid decap-next") \
+_(RESPONSE_NOT_READY, -78, "Response not ready") \
+_(NOT_CONNECTED, -79, "Not connected to the data plane")
+
+typedef enum {
+#define _(a,b,c) VNET_API_ERROR_##a = (b),
+ foreach_vnet_api_error
+#undef _
+ VNET_API_N_ERROR,
+} vnet_api_error_t;
+
+#endif /* included_vnet_api_errno_h */
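
The foreach_vnet_api_error list above is a classic X-macro: it is
expanded once, directly below it, to build the vnet_api_error_t enum,
and it can be re-expanded wherever a code-to-text mapping is needed.
A minimal sketch of such a helper - hypothetical, not part of this
commit:

    /* Hypothetical helper: map an error code to its description
       by re-expanding the same X-macro list. */
    static inline const char *
    vnet_api_error_string (vnet_api_error_t e)
    {
      switch (e)
        {
    #define _(a,b,c) case VNET_API_ERROR_##a: return c;
          foreach_vnet_api_error
    #undef _
        default:
          return "unknown vnet API error";
        }
    }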
diff --git a/vnet/vnet/buffer.h b/vnet/vnet/buffer.h
new file mode 100644
index 00000000000..9cbb402bd60
--- /dev/null
+++ b/vnet/vnet/buffer.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * vnet/buffer.h: vnet buffer flags
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_buffer_h
+#define included_vnet_buffer_h
+
+#include <vlib/vlib.h>
+
+/* VLIB buffer flags for ip4/ip6 packets. Set by input interfaces for ip4/ip6
+ tcp/udp packets with hardware computed checksums. */
+#define LOG2_IP_BUFFER_L4_CHECKSUM_COMPUTED LOG2_VLIB_BUFFER_FLAG_USER(1)
+#define LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT LOG2_VLIB_BUFFER_FLAG_USER(2)
+#define IP_BUFFER_L4_CHECKSUM_COMPUTED (1 << LOG2_IP_BUFFER_L4_CHECKSUM_COMPUTED)
+#define IP_BUFFER_L4_CHECKSUM_CORRECT (1 << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT)
+
+#define LOG2_HGSHM_BUFFER_USER_INDEX_VALID LOG2_VLIB_BUFFER_FLAG_USER(3)
+#define VNET_HGSHM_BUFFER_USER_INDEX_VALID (1 << LOG2_HGSHM_BUFFER_USER_INDEX_VALID)
+
+#define foreach_buffer_opaque_union_subtype \
+_(ethernet) \
+_(ip) \
+_(mcast) \
+_(lb) \
+_(dlb) \
+_(swt) \
+_(l2) \
+_(l2t) \
+_(hgshm) \
+_(gre) \
+_(l2_classify) \
+_(io_handoff) \
+_(policer) \
+_(output_features)
+
+/*
+ * vnet stack buffer opaque array overlay structure.
+ * The vnet_buffer_opaque_t *must* be the same size as the
+ * vlib_buffer_t "opaque" structure member, 32 bytes.
+ *
+ * When adding a union type, please add a stanza to
+ * foreach_buffer_opaque_union_subtype (directly above).
+ * Code in vnet_interface_init(...) verifies the size
+ * of the union, and will announce any deviations in an
+ * impossible-to-miss manner.
+ */
+typedef struct {
+ u32 sw_if_index[VLIB_N_RX_TX];
+
+ union {
+ /* Ethernet. */
+ struct {
+ /* Saved value of current header by ethernet-input. */
+ i32 start_of_ethernet_header;
+ } ethernet;
+
+ /* IP4/6 buffer opaque. */
+ struct {
+ /* Adjacency from destination IP address lookup [VLIB_TX].
+ Adjacency from source IP address lookup [VLIB_RX].
+ This gets set to ~0 until source lookup is performed. */
+ u32 adj_index[VLIB_N_RX_TX];
+
+ union {
+ struct {
+ /* Current configuration index. */
+ u32 current_config_index;
+
+ /* Flow hash value for this packet computed from IP src/dst address
+ protocol and ports. */
+ u32 flow_hash;
+
+ /* next protocol */
+ u32 save_protocol;
+ };
+
+ /* Alternate used for local TCP packets. */
+ struct {
+ u32 listener_index;
+
+ u32 established_connection_index;
+
+ u32 mini_connection_index;
+ } tcp;
+ };
+ } ip;
+
+ /* Multicast replication */
+ struct {
+ u32 pad[3];
+ u32 mcast_group_index;
+ u32 mcast_current_index;
+ u32 original_free_list_index;
+ } mcast;
+
+ /* ipv6 shallow-pkt-inspection load-balancer, only valid there */
+ struct {
+ u8 lb_disable;
+ u8 user_to_network;
+ u8 was_injected;
+ u32 bucket_id;
+ } lb;
+ /* ipv4 DPI load-balancer, only valid there */
+ struct {
+ u8 lb_disable;
+ u8 user_to_network;
+ u32 session_index;
+ } dlb;
+
+ /* ip4-in-ip6 softwire termination, only valid there */
+ struct {
+ u8 swt_disable;
+ u32 mapping_index;
+ } swt;
+
+ /* l2 bridging path, only valid there */
+ struct {
+ u32 feature_bitmap;
+ u16 bd_index; // bridge-domain index
+ u8 l2_len; // ethernet header length
+ u8 shg; // split-horizon group
+ } l2;
+
+ /* l2tpv3 softwire encap, only valid there */
+ struct {
+ u32 pad[4]; /* do not overlay w/ ip.adj_index[0,1] */
+ u8 next_index;
+ u32 session_index;
+ } l2t;
+
+ /* hgshm, valid if packet sent through iface */
+ struct {
+ u32 pad[8 -VLIB_N_RX_TX -1]; /* to end of opaque */
+ u32 user_index; /* client id borrowing buffer */
+ } hgshm;
+
+ struct {
+ u32 src, dst;
+ } gre;
+
+ /* L2 classify */
+ struct {
+ u64 pad;
+ u32 opaque_index;
+ u32 table_index;
+ u64 hash;
+ } l2_classify;
+
+ /* IO - worker thread handoff */
+ struct {
+ u32 next_index;
+ } io_handoff;
+
+ /* vnet policer */
+ struct {
+ u32 pad[8 -VLIB_N_RX_TX -1]; /* to end of opaque */
+ u32 index;
+ } policer;
+
+ /* interface output features */
+ struct {
+ u32 ipsec_spd_index;
+ u32 ipsec_sad_index;
+ u32 unused[3];
+ u32 bitmap;
+ } output_features;
+
+ /* vcgn udp inside input, only valid there */
+ struct {
+ /* This part forms context of the packet. The structure should be
+ * exactly same as spp_ctx_t. Also this should be the first
+ * element of this vcgn_uii structure.
+ */
+ /****** BEGIN spp_ctx_t section ***********************/
+ union { /* Roddick specific */
+ u32 roddick_info;
+ struct _tx_pkt_info { /* Used by PI to PI communication for TX */
+ u32 uidb_index:16; /* uidb_index to transmit */
+ u32 packet_type:2; /* 1-IPv4, 2-Ipv6, - 0,3 - Unused */
+ u32 ipv4_defrag:1; /* 0 - Normal, 1 - update first
+ * segment size
+ * (set by 6rd defrag node)
+ */
+
+ u32 dst_ip_port_idx:4;/* Index to dst_ip_port_table */
+ u32 from_node:4;
+ u32 calc_chksum:1;
+ u32 reserved:4;
+ } tx;
+ struct _rx_pkt_info { /* Used by PD / PI communication */
+ u32 uidb_index:16; /* uidb_index received in packet */
+ u32 packet_type:2; /* 1-IPv4, 2-Ipv6, - 0,3 - Unused */
+ u32 icmp_type:1; /* 0-ICMP query type, 1-ICMP error type */
+ u32 protocol_type:2; /* 1-TCP, 2-UDP, 3-ICMP, 0 - Unused */
+ u32 ipv4_defrag:1; /* 0 - Normal, 1 - update first
+ * segment size
+ * (set by 6rd defrag node)
+ */
+
+ u32 direction:1; /* 0-Outside, 1-Inside */
+ u32 frag:1; /*IP fragment-1, Otherwise-0*/
+ u32 option:1; /* 0-No IP option (v4) present, non-fragHdr
+ * option hdr present (v6)
+ */
+ u32 df_bit:1; /* IPv4 DF bit copied here */
+ u32 reserved1:6;
+ } rx;
+ } ru;
+ /****** END spp_ctx_t section ***********************/
+
+ union {
+ struct {
+ u32 ipv4;
+ u16 port;
+ u16 vrf; //bit0-13:i/f, bit14-15:protocol
+ } k;
+
+ u64 key64;
+ } key;
+
+ u32 bucket;
+
+ u16 ovrf; /* Exit interface */
+ u8 frag_pkt;
+ u8 vcgn_unused1;
+ } vcgn_uii;
+
+ /* MAP */
+ struct {
+ u16 mtu;
+ } map;
+
+ /* MAP-T */
+ struct {
+ u32 map_domain_index;
+ struct {
+ u32 saddr, daddr;
+ u16 frag_offset; //Fragmentation header offset
+ u16 l4_offset; //L4 header overall offset
+ u8 l4_protocol; //The final protocol number
+ } v6; //Used by ip6_map_t only
+ u16 checksum_offset; //L4 checksum overall offset
+ u16 mtu; //Exit MTU
+ } map_t;
+
+ /* IP Fragmentation */
+ struct {
+ u16 header_offset;
+ u16 mtu;
+ u8 next_index;
+ u8 flags; //See ip_frag.h
+ } ip_frag;
+
+ u32 unused[6];
+ };
+} vnet_buffer_opaque_t;
+
+#define vnet_buffer(b) ((vnet_buffer_opaque_t *) (b)->opaque)
+
+/* Full cache line (64 bytes) of additional space */
+typedef struct {
+  union {
+    u8 unused[64]; /* reserve the full cache line; no subtypes defined yet */
+  };
+} vnet_buffer_opaque2_t;
+
+
+
+#endif /* included_vnet_buffer_h */
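
The vnet_buffer() macro above is how graph nodes reach this metadata:
it overlays vnet_buffer_opaque_t on a buffer's 32-byte opaque area. A
minimal usage sketch, with illustrative variable names:

    vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);

    /* sw_if_index[] is shared by every subtype of the union */
    u32 rx_sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];

    /* l2 metadata is only valid on the bridging path */
    vnet_buffer (b)->l2.bd_index = bd_index;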
diff --git a/vnet/vnet/classify/README b/vnet/vnet/classify/README
new file mode 100644
index 00000000000..1ef5ab5ac34
--- /dev/null
+++ b/vnet/vnet/classify/README
@@ -0,0 +1,180 @@
+=== vnet classifier theory of operation ===
+
+The vnet classifier trades simplicity against perf / scale
+characteristics. At a certain level, it's a dumb robot: given an
+incoming packet, it searches an ordered list of (mask, match)
+tables. If the classifier finds a matching entry, it takes the
+indicated action. If not, it takes a last-resort action.
+
+We use 128-bit (16-octet) SSE vector operations to match or hash 16
+octets at a time. For hardware backward compatibility, the code does
+not [currently] use 256-bit (32-octet) vector instructions.
+
+Effective use of the classifier centers around building table lists
+which "hit" as soon as practicable. In many cases, established
+sessions hit in the first table. In this mode of operation, the
+classifier easily processes multiple MPPS / core - even with millions
+of sessions in the database. Searching 357 tables on a regular basis
+will neatly solve the halting problem.
+
+==== Basic operation ====
+
+The classifier mask-and-match operation proceeds as follows. Given a
+starting classifier table index, lay hands on the indicated mask
+vector. When building tables, we arrange for the mask to obey
+16-octet vector alignment.
+
+We know that the first octet of packet data starts on a cache-line
+boundary. Further, it's reasonably likely that folks won't want to use
+the generalized classifier on the L2 header; preferring to decode the
+Ethertype manually. That scheme makes it easy to select among ip4 /
+ip6 / MPLS, etc. classifier table sets.
+
+A no-vlan-tag L2 header is 14 octets long. A typical ipv4 header
+begins with the octets 0x4500: version=4, header_length=5, DSCP=0,
+ECN=0. If one doesn't intend to classify on (DSCP, ECN) - the typical
+case - we program the classifier to skip the first 16-octet vector.
+
+To classify untagged ipv4 packets on source address, we program the
+classifier to skip one vector, and mask-and-match one vector.
+
+The basic mask-and-match operation looks like this:
+
+ switch (t->match_n_vectors)
+ {
+ case 1:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ break;
+
+ case 2:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ result |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+ break;
+
+ <etc>
+ }
+
+ result_mask = u32x4_zero_byte_mask (result);
+ if (result_mask == 0xffff)
+ return (v);
+
+Net of setup, it costs a couple of clock cycles to mask-and-match 16
+octets.
+
+At the risk of belaboring an obvious point, the control-plane
+'''must''' pay attention to detail. When skipping one (or more)
+vectors, masks and matches must reflect that decision. See
+.../vnet/vnet/classify/vnet_classify.c:unformat_classify_[mask|match]. Note
+that vec_validate (xxx, 13) creates a 14-element vector.
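+
+As a concrete illustration of the skip-1/match-1 example above: the
+single match vector covers packet octets 16..31, so the ip4 source
+address - packet octet 26, i.e. the 14-octet L2 header plus a
+12-octet offset into the ip4 header - lands at octets 10..13 of the
+mask. A sketch:
+
+  u8 mask[16] = { 0 };
+
+  /* ip4 source address: octets 10..13 of the match vector */
+  memset (mask + 10, 0xff, 4);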
+
+==== Creating a classifier table ====
+
+To create a new classifier table via the control-plane API, send a
+"classify_add_del_table" message. The underlying action routine,
+vnet_classify_add_del_table(...), is located in
+.../vnet/vnet/classify/vnet_classify.c, and has the following
+prototype:
+
+ int vnet_classify_add_del_table (vnet_classify_main_t * cm,
+ u8 * mask,
+ u32 nbuckets,
+ u32 memory_size,
+ u32 skip,
+ u32 match,
+ u32 next_table_index,
+ u32 miss_next_index,
+ u32 * table_index,
+ int is_add)
+
+Pass cm = &vnet_classify_main if calling this routine directly. Mask,
+skip(_n_vectors) and match(_n_vectors) are as described above. Mask
+need not be aligned, but it must be match*16 octets in length. To
+avoid having your head explode, be absolutely certain that '''only'''
+the bits you intend to match on are set.
+
+The classifier uses thread-safe, no-reader-locking-required
+bounded-index extensible hashing. Nbuckets is the [fixed] size of the
+hash bucket vector. The algorithm works in constant time regardless of
+hash collisions, but wastes space when the bucket array is too
+small. A good rule of thumb: let nbuckets = approximate number of
+entries expected.
+
+At a significant cost in complexity, it would be possible to resize the
+bucket array dynamically. We have no plans to implement that function.
+
+Each classifier table has its own clib mheap memory allocation
+arena. To pick the memory_size parameter, note that each classifier
+table entry needs 16*(1 + match_n_vectors) bytes. Within reason, aim a
+bit high. Clib mheap memory uses o/s level virtual memory - not wired
+or hugetlb memory - so it's best not to scrimp on size.
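+
+For example, a table sized for 1M single-vector sessions needs about
+1M * 16*(1+1) = 32 MB of entry storage; rounding the arena up to 64
+MB leaves comfortable headroom.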
+
+The "next_table_index" parameter is as described: the pool index in
+vnet_classify_main.tables of the next table to search. Code ~0 to
+indicate the end of the table list. 0 is a valid table index!
+
+We often create classification tables in reverse order -
+last-table-searched to first-table-searched - so we can easily set
+this parameter. Of course, one can manually adjust the data structure
+after-the-fact.
+
+Specific classifier client nodes - for example,
+.../vnet/vnet/classify/ip_classify.c - interpret the "miss_next_index"
+parameter as a vpp graph-node next index. When packet classification
+fails to produce a match, ip_classify_inline sends packets to the
+indicated disposition. A classifier application might program this
+parameter to send packets which don't match an existing session to a
+"first-sign-of-life, create-new-session" node.
+
+Finally, the is_add parameter indicates whether to add or delete the
+indicated table. The delete case implicitly terminates all sessions
+with extreme prejudice, by freeing the specified clib mheap.
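+
+Putting the pieces together, a minimal sketch of creating the
+skip-1/match-1 ipv4-source-address table described above; the
+numeric parameters are illustrative, not prescriptive:
+
+  u32 table_index = ~0;
+  int rv;
+
+  rv = vnet_classify_add_del_table
+    (&vnet_classify_main, mask /* 16 octets, as sketched earlier */,
+     64 * 1024 /* nbuckets ~= expected number of sessions */,
+     16 << 20 /* 16 MB mheap, per the sizing advice above */,
+     1 /* skip_n_vectors */, 1 /* match_n_vectors */,
+     ~0 /* next_table_index: end of the table list */,
+     0 /* miss_next_index: meaning is client-node specific */,
+     &table_index, 1 /* is_add */);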
+
+==== Creating a classifier session ====
+
+To create a new classifier session via the control-plane API, send a
+"classify_add_del_session" message. The underlying action routine,
+vnet_classify_add_del_session(...), is located in
+.../vnet/vnet/classify/vnet_classify.c, and has the following
+prototype:
+
+int vnet_classify_add_del_session (vnet_classify_main_t * cm,
+ u32 table_index,
+ u8 * match,
+ u32 hit_next_index,
+ u32 opaque_index,
+ i32 advance,
+ int is_add)
+
+Pass cm = &vnet_classify_main if calling this routine directly. Table
+index specifies the table which receives the new session / contains
+the session to delete depending on is_add.
+
+Match is the key for the indicated session. It need not be aligned,
+but it must be table->match_n_vectors*16 octets in length. As a
+courtesy, vnet_classify_add_del_session applies the table's mask to
+the stored key-value. In this way, one can create a session by passing
+unmasked (packet_data + offset) as the "match" parameter, and end up
+with unconfusing session keys.
+
+Specific classifier client nodes - for example,
+.../vnet/vnet/classify/ip_classify.c - interpret the per-session
+hit_next_index parameter as a vpp graph-node next index. When packet
+classification produces a match, ip_classify_inline sends packets to
+the indicated disposition.
+
+ip4/6_classify place the per-session opaque_index parameter into
+vnet_buffer(b)->l2_classify.opaque_index; a slight misnomer, but it
+lets classifier applications send session-hit packets to
+specific graph nodes, with useful values in buffer metadata. Depending
+on the required semantics, we send known-session traffic to a certain
+node, with e.g. a session pool index in buffer metadata. It's totally
+up to the control-plane and the specific use-case.
+
+Finally, nodes such as ip4/6-classify apply the advance parameter as a
+[signed!] argument to vlib_buffer_advance(...); to "consume" a
+networking layer. Example: if we classify incoming tunneled IP packets
+by (inner) source/dest address and source/dest port, we might choose
+to decapsulate and reencapsulate the inner packet. In such a case,
+program the advance parameter to perform the tunnel decapsulation, and
+program next_index to send traffic to a node which uses
+e.g. opaque_index to output traffic on a specific tunnel interface.
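+
+To round out the example, a matching sketch of adding a session to
+the table created earlier; hit_next_index and opaque_index are
+illustrative placeholders, and the match data is passed unmasked as
+described above:
+
+  rv = vnet_classify_add_del_session
+    (&vnet_classify_main, table_index,
+     packet_data /* unmasked data at the classify offset */,
+     hit_next_index /* client-node next index on a hit */,
+     opaque_index /* delivered in buffer metadata on a hit */,
+     0 /* advance */, 1 /* is_add */);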
diff --git a/vnet/vnet/classify/input_acl.c b/vnet/vnet/classify/input_acl.c
new file mode 100644
index 00000000000..2c533d1170c
--- /dev/null
+++ b/vnet/vnet/classify/input_acl.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/ip/ip.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/classify/input_acl.h>
+
+input_acl_main_t input_acl_main;
+
+static int
+vnet_inacl_ip_feature_enable (vlib_main_t * vm,
+ input_acl_main_t *am,
+ u32 sw_if_index,
+ input_acl_table_id_t tid,
+ int feature_enable)
+{
+
+ if (tid == INPUT_ACL_TABLE_L2)
+ {
+ l2input_intf_bitmap_enable (sw_if_index, L2INPUT_FEAT_ACL,
+ feature_enable);
+ }
+ else
+ { /* IP[46] */
+ ip_lookup_main_t * lm;
+ ip_config_main_t * ipcm;
+ ip4_rx_feature_type_t ftype;
+ u32 ci;
+
+ if (tid == INPUT_ACL_TABLE_IP4)
+ {
+ lm = &ip4_main.lookup_main;
+ ftype = IP4_RX_FEATURE_CHECK_ACCESS;
+ }
+ else
+ {
+ lm = &ip6_main.lookup_main;
+ ftype = IP6_RX_FEATURE_CHECK_ACCESS;
+ }
+
+ ipcm = &lm->rx_config_mains[VNET_UNICAST];
+
+ ci = ipcm->config_index_by_sw_if_index[sw_if_index];
+ ci = ((feature_enable)
+ ? vnet_config_add_feature
+ : vnet_config_del_feature)
+        (vm, &ipcm->config_main, ci, ftype,
+ /* config data */ 0,
+ /* # bytes of config data */ 0);
+
+ ipcm->config_index_by_sw_if_index[sw_if_index] = ci;
+ am->vnet_config_main[tid] = &ipcm->config_main;
+ }
+
+ return 0;
+}
+
+int vnet_set_input_acl_intfc (vlib_main_t * vm, u32 sw_if_index,
+ u32 ip4_table_index,
+ u32 ip6_table_index,
+ u32 l2_table_index, u32 is_add)
+{
+ input_acl_main_t * am = &input_acl_main;
+ vnet_classify_main_t * vcm = am->vnet_classify_main;
+ u32 acl[INPUT_ACL_N_TABLES] = {ip4_table_index, ip6_table_index,
+ l2_table_index};
+ u32 ti;
+
+ /* Assume that we've validated sw_if_index in the API layer */
+
+ for (ti = 0; ti < INPUT_ACL_N_TABLES; ti++)
+ {
+ if (acl[ti] == ~0)
+ continue;
+
+ if (pool_is_free_index (vcm->tables, acl[ti]))
+ return VNET_API_ERROR_NO_SUCH_TABLE;
+
+ vec_validate_init_empty
+ (am->classify_table_index_by_sw_if_index[ti], sw_if_index, ~0);
+
+ /* Reject any DEL operation with wrong sw_if_index */
+ if (!is_add &&
+ (acl[ti] != am->classify_table_index_by_sw_if_index[ti][sw_if_index]))
+ {
+ clib_warning ("Non-existent intf_idx=%d with table_index=%d for delete",
+ sw_if_index, acl[ti]);
+ return VNET_API_ERROR_NO_SUCH_TABLE;
+ }
+
+      /* Return OK on ADD operation if feature is already enabled */
+ if (is_add &&
+ am->classify_table_index_by_sw_if_index[ti][sw_if_index] != ~0)
+ return 0;
+
+ vnet_inacl_ip_feature_enable (vm, am, sw_if_index, ti, is_add);
+
+ if (is_add)
+ am->classify_table_index_by_sw_if_index[ti][sw_if_index] = acl[ti];
+ else
+ am->classify_table_index_by_sw_if_index[ti][sw_if_index] = ~0;
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+set_input_acl_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 sw_if_index = ~0;
+ u32 ip4_table_index = ~0;
+ u32 ip6_table_index = ~0;
+ u32 l2_table_index = ~0;
+ u32 is_add = 1;
+ u32 idx_cnt = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
+ vnm, &sw_if_index))
+ ;
+ else if (unformat (input, "ip4-table %d", &ip4_table_index))
+ idx_cnt++;
+ else if (unformat (input, "ip6-table %d", &ip6_table_index))
+ idx_cnt++;
+ else if (unformat (input, "l2-table %d", &l2_table_index))
+ idx_cnt++;
+ else if (unformat (input, "del"))
+ is_add = 0;
+ else
+ break;
+ }
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "Interface must be specified.");
+
+ if (!idx_cnt)
+ return clib_error_return (0, "Table index should be specified.");
+
+ if (idx_cnt > 1)
+ return clib_error_return (0, "Only one table index per API is allowed.");
+
+ rv = vnet_set_input_acl_intfc (vm, sw_if_index, ip4_table_index,
+ ip6_table_index, l2_table_index, is_add);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_MATCHING_INTERFACE:
+ return clib_error_return (0, "No such interface");
+
+    case VNET_API_ERROR_NO_SUCH_TABLE:
+      return clib_error_return (0, "No such classifier table");
+
+    default:
+      return clib_error_return (0, "vnet_set_input_acl_intfc returned %d", rv);
+    }
+ return 0;
+}
+
+/*
+ * Configure an interface to enable/disable the input ACL feature:
+ *  intfc - interface on which to configure the input ACL
+ *  ip4-table <index> [del] - enable/disable IP4 input ACL
+ *  ip6-table <index> [del] - enable/disable IP6 input ACL
+ *  l2-table <index> [del] - enable/disable Layer2 input ACL
+ *
+ * Note: Only one table index per call is allowed.
+ *
+ */
+VLIB_CLI_COMMAND (set_input_acl_command, static) = {
+ .path = "set interface input acl",
+ .short_help =
+ "set interface input acl intfc <int> [ip4-table <index>]\n"
+ " [ip6-table <index>] [l2-table <index>] [del]",
+ .function = set_input_acl_command_fn,
+};
+
+clib_error_t *input_acl_init (vlib_main_t *vm)
+{
+ input_acl_main_t * am = &input_acl_main;
+ clib_error_t * error = 0;
+
+ if ((error = vlib_call_init_function (vm, ip_inacl_init)))
+ return error;
+
+ am->vlib_main = vm;
+ am->vnet_main = vnet_get_main();
+ am->vnet_classify_main = &vnet_classify_main;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (input_acl_init);
+
+uword unformat_acl_type (unformat_input_t * input, va_list * args)
+{
+ u32 * acl_type = va_arg (*args, u32 *);
+ u32 tid = INPUT_ACL_N_TABLES;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "ip4"))
+ tid = INPUT_ACL_TABLE_IP4;
+ else if (unformat (input, "ip6"))
+ tid = INPUT_ACL_TABLE_IP6;
+ else if (unformat (input, "l2"))
+ tid = INPUT_ACL_TABLE_L2;
+ else
+ break;
+ }
+
+ *acl_type = tid;
+ return 1;
+}
+
+u8 * format_vnet_inacl_info (u8 * s, va_list * va)
+{
+ input_acl_main_t * am = va_arg (*va, input_acl_main_t *);
+ int sw_if_idx = va_arg (*va, int);
+ u32 tid = va_arg (*va, u32);
+
+ if (tid == ~0)
+ {
+ s = format (s, "%10s%20s\t\t%s", "Intfc idx", "Classify table",
+ "Interface name");
+ return s;
+ }
+
+ s = format (s, "%10d%20d\t\t%U", sw_if_idx, tid,
+ format_vnet_sw_if_index_name, am->vnet_main, sw_if_idx);
+
+ return s;
+}
+
+static clib_error_t *
+show_inacl_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ input_acl_main_t * am = &input_acl_main;
+ u32 type = INPUT_ACL_N_TABLES;
+ int i;
+ u32 * vec_tbl;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "type %U", unformat_acl_type, &type))
+ ;
+ else
+ break;
+ }
+
+ if (type == INPUT_ACL_N_TABLES)
+ return clib_error_return (0, "Invalid input ACL table type.");
+
+ vec_tbl = am->classify_table_index_by_sw_if_index[type];
+
+ if (vec_len(vec_tbl))
+ vlib_cli_output (vm, "%U", format_vnet_inacl_info, am, ~0 /* hdr */, ~0);
+ else
+ vlib_cli_output (vm, "No input ACL tables configured");
+
+ for (i = 0; i < vec_len (vec_tbl); i++)
+ {
+ if (vec_elt(vec_tbl, i) == ~0)
+ continue;
+
+ vlib_cli_output (vm, "%U", format_vnet_inacl_info,
+ am, i, vec_elt(vec_tbl, i));
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_inacl_command, static) = {
+ .path = "show inacl",
+ .short_help = "show inacl type [ip4|ip6|l2]",
+ .function = show_inacl_command_fn,
+};
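
For completeness, a minimal sketch of enabling an ip4 input ACL from
code via vnet_set_input_acl_intfc(); the indices are illustrative,
and passing ~0 leaves that table type untouched:

    int rv = vnet_set_input_acl_intfc
      (vm, sw_if_index,
       ip4_table_index /* a previously created classify table */,
       ~0 /* ip6_table_index: unchanged */,
       ~0 /* l2_table_index: unchanged */,
       1 /* is_add */);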
diff --git a/vnet/vnet/classify/input_acl.h b/vnet/vnet/classify/input_acl.h
new file mode 100644
index 00000000000..7ffc189f053
--- /dev/null
+++ b/vnet/vnet/classify/input_acl.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_vnet_input_acl_h__
+#define __included_vnet_input_acl_h__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/classify/vnet_classify.h>
+
+typedef enum {
+ INPUT_ACL_TABLE_IP4,
+ INPUT_ACL_TABLE_IP6,
+ INPUT_ACL_TABLE_L2,
+ INPUT_ACL_N_TABLES,
+} input_acl_table_id_t;
+
+typedef enum {
+ ACL_NEXT_INDEX_DENY,
+ ACL_NEXT_INDEX_N_NEXT,
+} acl_next_index_t;
+
+typedef struct {
+
+ /* classifier table vectors */
+ u32 * classify_table_index_by_sw_if_index [INPUT_ACL_N_TABLES];
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+ vnet_classify_main_t * vnet_classify_main;
+ vnet_config_main_t * vnet_config_main [INPUT_ACL_N_TABLES];
+} input_acl_main_t;
+
+extern input_acl_main_t input_acl_main;
+
+int vnet_set_input_acl_intfc (vlib_main_t * vm, u32 sw_if_index,
+ u32 ip4_table_index,
+ u32 ip6_table_index,
+ u32 l2_table_index, u32 is_add);
+
+#endif /* __included_vnet_input_acl_h__ */
diff --git a/vnet/vnet/classify/ip_classify.c b/vnet/vnet/classify/ip_classify.c
new file mode 100644
index 00000000000..c922608547c
--- /dev/null
+++ b/vnet/vnet/classify/ip_classify.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h> /* for ethernet_header_t */
+#include <vnet/classify/vnet_classify.h>
+
+typedef struct {
+ u32 next_index;
+ u32 table_index;
+ u32 entry_index;
+} ip_classify_trace_t;
+
+/* packet trace format function */
+static u8 * format_ip_classify_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip_classify_trace_t * t = va_arg (*args, ip_classify_trace_t *);
+
+ s = format (s, "IP_CLASSIFY: next_index %d, table %d, entry %d",
+ t->next_index, t->table_index, t->entry_index);
+ return s;
+}
+
+vlib_node_registration_t ip4_classify_node;
+vlib_node_registration_t ip6_classify_node;
+
+#define foreach_ip_classify_error \
+_(MISS, "Classify misses") \
+_(HIT, "Classify hits") \
+_(CHAIN_HIT, "Classify hits after chain walk")
+
+typedef enum {
+#define _(sym,str) IP_CLASSIFY_ERROR_##sym,
+ foreach_ip_classify_error
+#undef _
+ IP_CLASSIFY_N_ERROR,
+} ip_classify_error_t;
+
+static char * ip_classify_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip_classify_error
+#undef _
+};
+
+static inline uword
+ip_classify_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame, int is_ip4)
+{
+ u32 n_left_from, * from, * to_next;
+ ip_lookup_next_t next_index;
+ vnet_classify_main_t * vcm = &vnet_classify_main;
+ ip_lookup_main_t * lm;
+ f64 now = vlib_time_now (vm);
+ u32 hits = 0;
+ u32 misses = 0;
+ u32 chain_hits = 0;
+
+ if (is_ip4)
+ lm = &ip4_main.lookup_main;
+ else
+ lm = &ip6_main.lookup_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ /* First pass: compute hashes */
+
+ while (n_left_from > 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ u32 bi0, bi1;
+ u8 * h0, * h1;
+ u32 adj_index0, adj_index1;
+ ip_adjacency_t * adj0, * adj1;
+ u32 table_index0, table_index1;
+ vnet_classify_table_t * t0, * t1;
+
+ /* prefetch next iteration */
+ {
+ vlib_buffer_t * p1, * p2;
+
+ p1 = vlib_get_buffer (vm, from[1]);
+ p2 = vlib_get_buffer (vm, from[2]);
+
+ vlib_prefetch_buffer_header (p1, STORE);
+ CLIB_PREFETCH (p1->data, CLIB_CACHE_LINE_BYTES, STORE);
+ vlib_prefetch_buffer_header (p2, STORE);
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+
+ bi1 = from[1];
+ b1 = vlib_get_buffer (vm, bi1);
+ h1 = b1->data;
+
+ adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ table_index0 = adj0->classify_table_index;
+
+ adj_index1 = vnet_buffer (b1)->ip.adj_index[VLIB_TX];
+ adj1 = ip_get_adjacency (lm, adj_index1);
+ table_index1 = adj1->classify_table_index;
+
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ t1 = pool_elt_at_index (vcm->tables, table_index1);
+
+ vnet_buffer(b0)->l2_classify.hash =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+
+ vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash);
+
+ vnet_buffer(b1)->l2_classify.hash =
+ vnet_classify_hash_packet (t1, (u8 *) h1);
+
+ vnet_classify_prefetch_bucket (t1, vnet_buffer(b1)->l2_classify.hash);
+
+ vnet_buffer(b0)->l2_classify.table_index = table_index0;
+
+ vnet_buffer(b1)->l2_classify.table_index = table_index1;
+
+ from += 2;
+ n_left_from -= 2;
+ }
+
+ while (n_left_from > 0)
+ {
+ vlib_buffer_t * b0;
+ u32 bi0;
+ u8 * h0;
+ u32 adj_index0;
+ ip_adjacency_t * adj0;
+ u32 table_index0;
+ vnet_classify_table_t * t0;
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+
+ adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ table_index0 = adj0->classify_table_index;
+
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+ vnet_buffer(b0)->l2_classify.hash =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+
+ vnet_buffer(b0)->l2_classify.table_index = table_index0;
+ vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash);
+
+ from++;
+ n_left_from--;
+ }
+
+ next_index = node->cached_next_index;
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ /* Not enough load/store slots to dual loop... */
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = IP_LOOKUP_NEXT_MISS;
+ u32 table_index0;
+ vnet_classify_table_t * t0;
+ vnet_classify_entry_t * e0;
+ u64 hash0;
+ u8 * h0;
+
+ /* Stride 3 seems to work best */
+ if (PREDICT_TRUE (n_left_from > 3))
+ {
+ vlib_buffer_t * p1 = vlib_get_buffer(vm, from[3]);
+ vnet_classify_table_t * tp1;
+ u32 table_index1;
+ u64 phash1;
+
+ table_index1 = vnet_buffer(p1)->l2_classify.table_index;
+
+ if (PREDICT_TRUE (table_index1 != ~0))
+ {
+ tp1 = pool_elt_at_index (vcm->tables, table_index1);
+ phash1 = vnet_buffer(p1)->l2_classify.hash;
+ vnet_classify_prefetch_entry (tp1, phash1);
+ }
+ }
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+ table_index0 = vnet_buffer(b0)->l2_classify.table_index;
+ e0 = 0;
+ t0 = 0;
+ vnet_buffer(b0)->l2_classify.opaque_index = ~0;
+
+ if (PREDICT_TRUE(table_index0 != ~0))
+ {
+ hash0 = vnet_buffer(b0)->l2_classify.hash;
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ e0 = vnet_classify_find_entry (t0, (u8 *) h0, hash0,
+ now);
+ if (e0)
+ {
+ vnet_buffer(b0)->l2_classify.opaque_index
+ = e0->opaque_index;
+ vlib_buffer_advance (b0, e0->advance);
+ next0 = (e0->next_index < IP_LOOKUP_N_NEXT)?
+ e0->next_index:next0;
+ hits++;
+ }
+ else
+ {
+ while (1)
+ {
+ if (t0->next_table_index != ~0)
+ t0 = pool_elt_at_index (vcm->tables,
+ t0->next_table_index);
+ else
+ {
+ next0 = (t0->miss_next_index < IP_LOOKUP_N_NEXT)?
+ t0->miss_next_index:next0;
+ misses++;
+ break;
+ }
+
+ hash0 = vnet_classify_hash_packet (t0, (u8 *) h0);
+ e0 = vnet_classify_find_entry
+ (t0, (u8 *) h0, hash0, now);
+ if (e0)
+ {
+ vnet_buffer(b0)->l2_classify.opaque_index
+ = e0->opaque_index;
+ vlib_buffer_advance (b0, e0->advance);
+ next0 = (e0->next_index < IP_LOOKUP_N_NEXT)?
+ e0->next_index:next0;
+ hits++;
+ chain_hits++;
+ break;
+ }
+ }
+ }
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ip_classify_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ t->table_index = t0 ? t0 - vcm->tables : ~0;
+ t->entry_index = e0 ? e0 - t0->entries : ~0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, node->node_index,
+ IP_CLASSIFY_ERROR_MISS,
+ misses);
+ vlib_node_increment_counter (vm, node->node_index,
+ IP_CLASSIFY_ERROR_HIT,
+ hits);
+ vlib_node_increment_counter (vm, node->node_index,
+ IP_CLASSIFY_ERROR_CHAIN_HIT,
+ chain_hits);
+ return frame->n_vectors;
+}
+
+static uword
+ip4_classify (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip_classify_inline (vm, node, frame, 1 /* is_ip4 */);
+}
+
+
+VLIB_REGISTER_NODE (ip4_classify_node) = {
+ .function = ip4_classify,
+ .name = "ip4-classify",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip_classify_trace,
+ .n_errors = ARRAY_LEN(ip_classify_error_strings),
+ .error_strings = ip_classify_error_strings,
+
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify", /* probably not... */
+ [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+ },
+};
+
+static uword
+ip6_classify (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip_classify_inline (vm, node, frame, 0 /* is_ip4 */);
+}
+
+
+VLIB_REGISTER_NODE (ip6_classify_node) = {
+ .function = ip6_classify,
+ .name = "ip6-classify",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip_classify_trace,
+ .n_errors = ARRAY_LEN(ip_classify_error_strings),
+ .error_strings = ip_classify_error_strings,
+
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip6-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip6-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip6-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip6-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify", /* probably not... */
+ [IP_LOOKUP_NEXT_MAP] = "ip6-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop",
+ },
+};
+
+static clib_error_t *
+ip_classify_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip_classify_init);
diff --git a/vnet/vnet/classify/vnet_classify.c b/vnet/vnet/classify/vnet_classify.c
new file mode 100644
index 00000000000..43acb024033
--- /dev/null
+++ b/vnet/vnet/classify/vnet_classify.c
@@ -0,0 +1,1895 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/classify/input_acl.h>
+#include <vnet/ip/ip.h>
+#include <vnet/api_errno.h> /* for API error numbers */
+#include <vnet/l2/l2_classify.h> /* for L2_CLASSIFY_NEXT_xxx */
+
+#if VALIDATION_SCAFFOLDING
+/* Validation scaffolding */
+void mv (vnet_classify_table_t * t)
+{
+ void * oldheap;
+
+ oldheap = clib_mem_set_heap (t->mheap);
+ clib_mem_validate();
+ clib_mem_set_heap (oldheap);
+}
+
+void rogue (vnet_classify_table_t * t)
+{
+ int i, j, k;
+ vnet_classify_entry_t * v, * save_v;
+ u32 active_elements = 0;
+ vnet_classify_bucket_t * b;
+
+ for (i = 0; i < t->nbuckets; i++)
+ {
+ b = &t->buckets [i];
+ if (b->offset == 0)
+ continue;
+ save_v = vnet_classify_get_entry (t, b->offset);
+ for (j = 0; j < (1<<b->log2_pages); j++)
+ {
+ for (k = 0; k < t->entries_per_page; k++)
+ {
+ v = vnet_classify_entry_at_index
+ (t, save_v, j*t->entries_per_page + k);
+
+ if (vnet_classify_entry_is_busy (v))
+ active_elements++;
+ }
+ }
+ }
+
+ if (active_elements != t->active_elements)
+ clib_warning ("found %u expected %u elts", active_elements,
+ t->active_elements);
+}
+#else
+void mv (vnet_classify_table_t * t) { }
+void rogue (vnet_classify_table_t * t) { }
+#endif
+
+vnet_classify_table_t *
+vnet_classify_new_table (vnet_classify_main_t *cm,
+ u8 * mask, u32 nbuckets, u32 memory_size,
+ u32 skip_n_vectors,
+ u32 match_n_vectors)
+{
+ vnet_classify_table_t * t;
+ void * oldheap;
+
+ nbuckets = 1 << (max_log2 (nbuckets));
+
+ pool_get_aligned (cm->tables, t, CLIB_CACHE_LINE_BYTES);
+ memset(t, 0, sizeof (*t));
+
+ vec_validate_aligned (t->mask, match_n_vectors - 1, sizeof(u32x4));
+ memcpy (t->mask, mask, match_n_vectors * sizeof (u32x4));
+
+ t->next_table_index = ~0;
+ t->nbuckets = nbuckets;
+ t->log2_nbuckets = max_log2 (nbuckets);
+ t->match_n_vectors = match_n_vectors;
+ t->skip_n_vectors = skip_n_vectors;
+ t->entries_per_page = 2;
+
+ t->mheap = mheap_alloc (0 /* use VM */, memory_size);
+
+ vec_validate_aligned (t->buckets, nbuckets - 1, CLIB_CACHE_LINE_BYTES);
+ oldheap = clib_mem_set_heap (t->mheap);
+
+ t->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+
+ clib_mem_set_heap (oldheap);
+ return (t);
+}
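Sizing notes: nbuckets is rounded up to a power of two so that bucket
selection reduces to a mask, and each page holds entries_per_page (2)
entries. A hedged sketch of direct table creation, with an illustrative
one-vector mask:

    /* Sketch: match one u32x4 with no skip; mask contents illustrative. */
    u8 mask[sizeof (u32x4)] = { 0 };
    memset (mask, 0xff, 4);            /* first 4 octets significant */
    vnet_classify_table_t *t =
      vnet_classify_new_table (&vnet_classify_main, mask,
                               32 /* nbuckets, a power of 2 */,
                               1 << 20 /* memory_size */,
                               0 /* skip_n_vectors */,
                               1 /* match_n_vectors */);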
+
+void vnet_classify_delete_table_index (vnet_classify_main_t *cm,
+ u32 table_index)
+{
+ vnet_classify_table_t * t;
+
+ /* Tolerate multiple frees, up to a point */
+ if (pool_is_free_index (cm->tables, table_index))
+ return;
+
+ t = pool_elt_at_index (cm->tables, table_index);
+ if (t->next_table_index != ~0)
+ vnet_classify_delete_table_index (cm, t->next_table_index);
+
+ vec_free (t->mask);
+ vec_free (t->buckets);
+ mheap_free (t->mheap);
+
+ pool_put (cm->tables, t);
+}
+
+static vnet_classify_entry_t *
+vnet_classify_entry_alloc (vnet_classify_table_t * t, u32 log2_pages)
+{
+ vnet_classify_entry_t * rv = 0;
+#define _(size) \
+ vnet_classify_entry_##size##_t * rv##size = 0;
+ foreach_size_in_u32x4;
+#undef _
+
+ void * oldheap;
+
+ ASSERT (t->writer_lock[0]);
+ if (log2_pages >= vec_len (t->freelists) || t->freelists [log2_pages] == 0)
+ {
+ oldheap = clib_mem_set_heap (t->mheap);
+
+ vec_validate (t->freelists, log2_pages);
+
+ switch(t->match_n_vectors)
+ {
+ /* Euchre the vector allocator into allocating the right sizes */
+#define _(size) \
+ case size: \
+ vec_validate_aligned \
+ (rv##size, ((1<<log2_pages)*t->entries_per_page) - 1, \
+ CLIB_CACHE_LINE_BYTES); \
+ rv = (vnet_classify_entry_t *) rv##size; \
+ break;
+ foreach_size_in_u32x4;
+#undef _
+
+ default:
+ abort();
+ }
+
+ clib_mem_set_heap (oldheap);
+ goto initialize;
+ }
+ rv = t->freelists[log2_pages];
+ t->freelists[log2_pages] = rv->next_free;
+
+initialize:
+ ASSERT(rv);
+ ASSERT (vec_len(rv) == (1<<log2_pages)*t->entries_per_page);
+
+ switch (t->match_n_vectors)
+ {
+#define _(size) \
+ case size: \
+ if(vec_len(rv)) \
+ memset (rv, 0xff, sizeof (*rv##size) * vec_len(rv)); \
+ break;
+ foreach_size_in_u32x4;
+#undef _
+
+ default:
+ abort();
+ }
+
+ return rv;
+}
+
+static void
+vnet_classify_entry_free (vnet_classify_table_t * t,
+ vnet_classify_entry_t * v)
+{
+ u32 free_list_index;
+
+ ASSERT (t->writer_lock[0]);
+
+ free_list_index = min_log2(vec_len(v)/t->entries_per_page);
+
+ ASSERT(vec_len (t->freelists) > free_list_index);
+
+ v->next_free = t->freelists[free_list_index];
+ t->freelists[free_list_index] = v;
+}
+
+static inline void make_working_copy
+(vnet_classify_table_t * t, vnet_classify_bucket_t * b)
+{
+ vnet_classify_entry_t * v;
+ vnet_classify_bucket_t working_bucket __attribute__((aligned (8)));
+ void * oldheap;
+ vnet_classify_entry_t * working_copy;
+#define _(size) \
+ vnet_classify_entry_##size##_t * working_copy##size = 0;
+ foreach_size_in_u32x4;
+#undef _
+ u32 cpu_number = os_get_cpu_number();
+
+ if (cpu_number >= vec_len (t->working_copies))
+ {
+ oldheap = clib_mem_set_heap (t->mheap);
+ vec_validate (t->working_copies, cpu_number);
+ clib_mem_set_heap (oldheap);
+ }
+
+ /*
+ * working_copies are per-cpu so that near-simultaneous
+ * updates from multiple threads will not result in sporadic, spurious
+ * lookup failures.
+ */
+ working_copy = t->working_copies[cpu_number];
+
+ t->saved_bucket.as_u64 = b->as_u64;
+ oldheap = clib_mem_set_heap (t->mheap);
+
+ if ((1<<b->log2_pages)*t->entries_per_page > vec_len (working_copy))
+ {
+ switch(t->match_n_vectors)
+ {
+ /* Euchre the vector allocator into allocating the right sizes */
+#define _(size) \
+ case size: \
+ working_copy##size = (void *) working_copy; \
+ vec_validate_aligned \
+ (working_copy##size, \
+ ((1<<b->log2_pages)*t->entries_per_page) - 1, \
+ CLIB_CACHE_LINE_BYTES); \
+ working_copy = (void *) working_copy##size; \
+ break;
+ foreach_size_in_u32x4;
+#undef _
+
+ default:
+ abort();
+ }
+ t->working_copies[cpu_number] = working_copy;
+ }
+
+ _vec_len(working_copy) = (1<<b->log2_pages)*t->entries_per_page;
+ clib_mem_set_heap (oldheap);
+
+ v = vnet_classify_get_entry (t, b->offset);
+
+ switch(t->match_n_vectors)
+ {
+#define _(size) \
+ case size: \
+ memcpy (working_copy, v, \
+ sizeof (vnet_classify_entry_##size##_t) \
+ * (1<<b->log2_pages) \
+ * (t->entries_per_page)); \
+ break;
+ foreach_size_in_u32x4 ;
+#undef _
+
+ default:
+ abort();
+ }
+
+ working_bucket.as_u64 = b->as_u64;
+ working_bucket.offset = vnet_classify_get_offset (t, working_copy);
+ CLIB_MEMORY_BARRIER();
+ b->as_u64 = working_bucket.as_u64;
+ t->working_copies[cpu_number] = working_copy;
+}
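This working-copy scheme is what keeps readers lock-free: a bucket's
offset and log2_pages share a single u64 (see vnet_classify_bucket_t in
the header below), so repointing the bucket is one 64-bit store and a
reader always observes a consistent (offset, log2_pages) pair. The
reader side, as a sketch assuming naturally aligned u64 loads are atomic
on the target CPU:

    vnet_classify_bucket_t snap;
    snap.as_u64 = b->as_u64;    /* single 64-bit read, no lock */
    vnet_classify_entry_t *base = vnet_classify_get_entry (t, snap.offset);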
+
+static vnet_classify_entry_t *
+split_and_rehash (vnet_classify_table_t * t,
+ vnet_classify_entry_t * old_values,
+ u32 new_log2_pages)
+{
+ vnet_classify_entry_t * new_values, * v, * new_v;
+ int i, j, k;
+
+ new_values = vnet_classify_entry_alloc (t, new_log2_pages);
+
+ for (i = 0; i < (vec_len (old_values)/t->entries_per_page); i++)
+ {
+ u64 new_hash;
+
+ for (j = 0; j < t->entries_per_page; j++)
+ {
+ v = vnet_classify_entry_at_index
+ (t, old_values, i * t->entries_per_page + j);
+
+ if (vnet_classify_entry_is_busy (v))
+ {
+ /* Hack so we can use the packet hash routine */
+ u8 * key_minus_skip;
+ key_minus_skip = (u8 *) v->key;
+ key_minus_skip -= t->skip_n_vectors * sizeof (u32x4);
+
+ new_hash = vnet_classify_hash_packet (t, key_minus_skip);
+ new_hash >>= t->log2_nbuckets;
+ new_hash &= (1<<new_log2_pages) - 1;
+
+ for (k = 0; k < t->entries_per_page; k++)
+ {
+ new_v = vnet_classify_entry_at_index (t, new_values,
+ new_hash + k);
+
+ if (vnet_classify_entry_is_free (new_v))
+ {
+ memcpy (new_v, v, sizeof (vnet_classify_entry_t)
+ + (t->match_n_vectors * sizeof (u32x4)));
+ new_v->flags &= ~(VNET_CLASSIFY_ENTRY_FREE);
+ goto doublebreak;
+ }
+ }
+ /* Crap. Tell caller to try again */
+ vnet_classify_entry_free (t, new_values);
+ return 0;
+ }
+ doublebreak:
+ ;
+ }
+ }
+ return new_values;
+}
+
+int vnet_classify_add_del (vnet_classify_table_t * t,
+ vnet_classify_entry_t * add_v,
+ int is_add)
+{
+ u32 bucket_index;
+ vnet_classify_bucket_t * b, tmp_b;
+ vnet_classify_entry_t * v, * new_v, * save_new_v, * working_copy, * save_v;
+ u32 value_index;
+ int rv = 0;
+ int i;
+ u64 hash, new_hash;
+ u32 new_log2_pages;
+ u32 cpu_number = os_get_cpu_number();
+ u8 * key_minus_skip;
+
+ ASSERT ((add_v->flags & VNET_CLASSIFY_ENTRY_FREE) == 0);
+
+ key_minus_skip = (u8 *) add_v->key;
+ key_minus_skip -= t->skip_n_vectors * sizeof (u32x4);
+
+ hash = vnet_classify_hash_packet (t, key_minus_skip);
+
+ bucket_index = hash & (t->nbuckets-1);
+ b = &t->buckets[bucket_index];
+
+ hash >>= t->log2_nbuckets;
+
+ while (__sync_lock_test_and_set (t->writer_lock, 1))
+ ;
+
+ /* First elt in the bucket? */
+ if (b->offset == 0)
+ {
+ if (is_add == 0)
+ {
+ rv = -1;
+ goto unlock;
+ }
+
+ v = vnet_classify_entry_alloc (t, 0 /* new_log2_pages */);
+ memcpy (v, add_v, sizeof (vnet_classify_entry_t) +
+ t->match_n_vectors * sizeof (u32x4));
+ v->flags &= ~(VNET_CLASSIFY_ENTRY_FREE);
+
+ tmp_b.as_u64 = 0;
+ tmp_b.offset = vnet_classify_get_offset (t, v);
+
+ b->as_u64 = tmp_b.as_u64;
+ t->active_elements ++;
+
+ goto unlock;
+ }
+
+ make_working_copy (t, b);
+
+ save_v = vnet_classify_get_entry (t, t->saved_bucket.offset);
+ value_index = hash & ((1<<t->saved_bucket.log2_pages)-1);
+
+ if (is_add)
+ {
+ /*
+ * For obvious (in hindsight) reasons, see if we're supposed to
+ * replace an existing key, then look for an empty slot.
+ */
+
+ for (i = 0; i < t->entries_per_page; i++)
+ {
+ v = vnet_classify_entry_at_index (t, save_v, value_index + i);
+
+ if (!memcmp (v->key, add_v->key, t->match_n_vectors * sizeof (u32x4)))
+ {
+ memcpy (v, add_v, sizeof (vnet_classify_entry_t) +
+ t->match_n_vectors * sizeof(u32x4));
+ v->flags &= ~(VNET_CLASSIFY_ENTRY_FREE);
+
+ CLIB_MEMORY_BARRIER();
+ /* Restore the previous (k,v) pairs */
+ b->as_u64 = t->saved_bucket.as_u64;
+ goto unlock;
+ }
+ }
+ for (i = 0; i < t->entries_per_page; i++)
+ {
+ v = vnet_classify_entry_at_index (t, save_v, value_index + i);
+
+ if (vnet_classify_entry_is_free (v))
+ {
+ memcpy (v, add_v, sizeof (vnet_classify_entry_t) +
+ t->match_n_vectors * sizeof(u32x4));
+ v->flags &= ~(VNET_CLASSIFY_ENTRY_FREE);
+ CLIB_MEMORY_BARRIER();
+ b->as_u64 = t->saved_bucket.as_u64;
+ t->active_elements ++;
+ goto unlock;
+ }
+ }
+ /* no room at the inn... split case... */
+ }
+ else
+ {
+ for (i = 0; i < t->entries_per_page; i++)
+ {
+ v = vnet_classify_entry_at_index (t, save_v, value_index + i);
+
+ if (!memcmp (v->key, add_v->key, t->match_n_vectors * sizeof (u32x4)))
+ {
+ memset (v, 0xff, sizeof (vnet_classify_entry_t) +
+ t->match_n_vectors * sizeof(u32x4));
+ v->flags |= VNET_CLASSIFY_ENTRY_FREE;
+ CLIB_MEMORY_BARRIER();
+ b->as_u64 = t->saved_bucket.as_u64;
+ t->active_elements --;
+ goto unlock;
+ }
+ }
+ rv = -3;
+ b->as_u64 = t->saved_bucket.as_u64;
+ goto unlock;
+ }
+
+ new_log2_pages = t->saved_bucket.log2_pages + 1;
+
+ expand_again:
+ working_copy = t->working_copies[cpu_number];
+ new_v = split_and_rehash (t, working_copy, new_log2_pages);
+
+ if (new_v == 0)
+ {
+ new_log2_pages++;
+ goto expand_again;
+ }
+
+ /* Try to add the new entry */
+ save_new_v = new_v;
+
+ key_minus_skip = (u8 *) add_v->key;
+ key_minus_skip -= t->skip_n_vectors * sizeof (u32x4);
+
+ new_hash = vnet_classify_hash_packet_inline (t, key_minus_skip);
+ new_hash >>= t->log2_nbuckets;
+ new_hash &= (1<<min_log2((vec_len(new_v)/t->entries_per_page))) - 1;
+
+ for (i = 0; i < t->entries_per_page; i++)
+ {
+ new_v = vnet_classify_entry_at_index (t, save_new_v, new_hash + i);
+
+ if (vnet_classify_entry_is_free (new_v))
+ {
+ memcpy (new_v, add_v, sizeof (vnet_classify_entry_t) +
+ t->match_n_vectors * sizeof(u32x4));
+ new_v->flags &= ~(VNET_CLASSIFY_ENTRY_FREE);
+ goto expand_ok;
+ }
+ }
+ /* Crap. Try again */
+ new_log2_pages++;
+ vnet_classify_entry_free (t, save_new_v);
+ goto expand_again;
+
+ expand_ok:
+ tmp_b.log2_pages = min_log2 (vec_len (save_new_v)/t->entries_per_page);
+ tmp_b.offset = vnet_classify_get_offset (t, save_new_v);
+ CLIB_MEMORY_BARRIER();
+ b->as_u64 = tmp_b.as_u64;
+ t->active_elements ++;
+ v = vnet_classify_get_entry (t, t->saved_bucket.offset);
+ vnet_classify_entry_free (t, v);
+
+ unlock:
+ CLIB_MEMORY_BARRIER();
+ t->writer_lock[0] = 0;
+
+ return rv;
+}
+
+typedef CLIB_PACKED(struct {
+ ethernet_header_t eh;
+ ip4_header_t ip;
+}) classify_data_or_mask_t;
+
+u64 vnet_classify_hash_packet (vnet_classify_table_t * t, u8 * h)
+{
+ return vnet_classify_hash_packet_inline (t, h);
+}
+
+vnet_classify_entry_t *
+vnet_classify_find_entry (vnet_classify_table_t * t,
+ u8 * h, u64 hash, f64 now)
+{
+ return vnet_classify_find_entry_inline (t, h, hash, now);
+}
+
+static u8 * format_classify_entry (u8 * s, va_list * args)
+{
+ vnet_classify_table_t * t = va_arg (*args, vnet_classify_table_t *);
+ vnet_classify_entry_t * e = va_arg (*args, vnet_classify_entry_t *);
+
+ s = format
+ (s, "[%u]: next_index %d advance %d opaque %d\n",
+ vnet_classify_get_offset (t, e), e->next_index, e->advance,
+ e->opaque_index);
+
+ s = format (s, " k: %U\n", format_hex_bytes, e->key,
+ t->match_n_vectors * sizeof(u32x4));
+
+ if (vnet_classify_entry_is_busy (e))
+ s = format (s, " hits %lld, last_heard %.2f\n",
+ e->hits, e->last_heard);
+ else
+ s = format (s, " entry is free\n");
+ return s;
+}
+
+u8 * format_classify_table (u8 * s, va_list * args)
+{
+ vnet_classify_table_t * t = va_arg (*args, vnet_classify_table_t *);
+ int verbose = va_arg (*args, int);
+ vnet_classify_bucket_t * b;
+ vnet_classify_entry_t * v, * save_v;
+ int i, j, k;
+ u64 active_elements = 0;
+
+ for (i = 0; i < t->nbuckets; i++)
+ {
+ b = &t->buckets [i];
+ if (b->offset == 0)
+ {
+ if (verbose > 1)
+ s = format (s, "[%d]: empty\n", i);
+ continue;
+ }
+
+ if (verbose)
+ {
+ s = format (s, "[%d]: heap offset %d, len %d\n", i,
+ b->offset, (1<<b->log2_pages));
+ }
+
+ save_v = vnet_classify_get_entry (t, b->offset);
+ for (j = 0; j < (1<<b->log2_pages); j++)
+ {
+ for (k = 0; k < t->entries_per_page; k++)
+ {
+
+ v = vnet_classify_entry_at_index (t, save_v,
+ j*t->entries_per_page + k);
+
+ if (vnet_classify_entry_is_free (v))
+ {
+ if (verbose > 1)
+ s = format (s, " %d: empty\n",
+ j * t->entries_per_page + k);
+ continue;
+ }
+ if (verbose)
+ {
+ s = format (s, " %d: %U\n",
+ j * t->entries_per_page + k,
+ format_classify_entry, t, v);
+ }
+ active_elements++;
+ }
+ }
+ }
+
+ s = format (s, " %lld active elements\n", active_elements);
+ s = format (s, " %d free lists\n", vec_len (t->freelists));
+ return s;
+}
+
+int vnet_classify_add_del_table (vnet_classify_main_t * cm,
+ u8 * mask,
+ u32 nbuckets,
+ u32 memory_size,
+ u32 skip,
+ u32 match,
+ u32 next_table_index,
+ u32 miss_next_index,
+ u32 * table_index,
+ int is_add)
+{
+ vnet_classify_table_t * t;
+
+ if (is_add)
+ {
+ *table_index = ~0;
+ if (memory_size == 0)
+ return VNET_API_ERROR_INVALID_MEMORY_SIZE;
+
+ if (nbuckets == 0)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ t = vnet_classify_new_table (cm, mask, nbuckets, memory_size,
+ skip, match);
+ t->next_table_index = next_table_index;
+ t->miss_next_index = miss_next_index;
+ *table_index = t - cm->tables;
+ return 0;
+ }
+
+ vnet_classify_delete_table_index (cm, *table_index);
+ return 0;
+}
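Because next_table_index must exist before it can be referenced, chained
tables are built back to front; the chain itself is walked by consumers
such as the input ACL path. A hedged sketch with placeholder masks and
sizes:

    /* Sketch: a miss in t1 falls through to t2; a t2 miss drops. */
    u32 t1_index, t2_index;
    vnet_classify_add_del_table (cm, mask2, 32, 2 << 20, 0 /* skip */,
                                 1 /* match */, ~0 /* no next table */,
                                 IP_LOOKUP_NEXT_DROP, &t2_index, 1);
    vnet_classify_add_del_table (cm, mask1, 32, 2 << 20, 0, 1,
                                 t2_index /* next_table_index */,
                                 IP_LOOKUP_NEXT_DROP, &t1_index, 1);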
+
+#define foreach_ip4_proto_field \
+_(src_address) \
+_(dst_address) \
+_(tos) \
+_(length) \
+_(fragment_id) \
+_(ttl) \
+_(protocol) \
+_(checksum)
+
+uword unformat_ip4_mask (unformat_input_t * input, va_list * args)
+{
+ u8 ** maskp = va_arg (*args, u8 **);
+ u8 * mask = 0;
+ u8 found_something = 0;
+ ip4_header_t * ip;
+
+#define _(a) u8 a=0;
+ foreach_ip4_proto_field;
+#undef _
+ u8 version = 0;
+ u8 hdr_length = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "version"))
+ version = 1;
+ else if (unformat (input, "hdr_length"))
+ hdr_length = 1;
+ else if (unformat (input, "src"))
+ src_address = 1;
+ else if (unformat (input, "dst"))
+ dst_address = 1;
+ else if (unformat (input, "proto"))
+ protocol = 1;
+
+#define _(a) else if (unformat (input, #a)) a=1;
+ foreach_ip4_proto_field
+#undef _
+ else
+ break;
+ }
+
+  found_something = version + hdr_length;
+
+#define _(a) found_something += a;
+ foreach_ip4_proto_field;
+#undef _
+
+ if (found_something == 0)
+ return 0;
+
+ vec_validate (mask, sizeof (*ip) - 1);
+
+ ip = (ip4_header_t *) mask;
+
+#define _(a) if (a) memset (&ip->a, 0xff, sizeof (ip->a));
+ foreach_ip4_proto_field;
+#undef _
+
+ ip->ip_version_and_header_length = 0;
+
+ if (version)
+ ip->ip_version_and_header_length |= 0xF0;
+
+ if (hdr_length)
+ ip->ip_version_and_header_length |= 0x0F;
+
+ *maskp = mask;
+ return 1;
+}
+
+#define foreach_ip6_proto_field \
+_(src_address) \
+_(dst_address) \
+_(payload_length) \
+_(hop_limit) \
+_(protocol)
+
+uword unformat_ip6_mask (unformat_input_t * input, va_list * args)
+{
+ u8 ** maskp = va_arg (*args, u8 **);
+ u8 * mask = 0;
+ u8 found_something = 0;
+ ip6_header_t * ip;
+ u32 ip_version_traffic_class_and_flow_label;
+
+#define _(a) u8 a=0;
+ foreach_ip6_proto_field;
+#undef _
+ u8 version = 0;
+ u8 traffic_class = 0;
+ u8 flow_label = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "version"))
+ version = 1;
+ else if (unformat (input, "traffic-class"))
+ traffic_class = 1;
+ else if (unformat (input, "flow-label"))
+ flow_label = 1;
+ else if (unformat (input, "src"))
+ src_address = 1;
+ else if (unformat (input, "dst"))
+ dst_address = 1;
+ else if (unformat (input, "proto"))
+ protocol = 1;
+
+#define _(a) else if (unformat (input, #a)) a=1;
+ foreach_ip6_proto_field
+#undef _
+ else
+ break;
+ }
+
+  found_something = version + traffic_class + flow_label;
+
+#define _(a) found_something += a;
+ foreach_ip6_proto_field;
+#undef _
+
+ if (found_something == 0)
+ return 0;
+
+ vec_validate (mask, sizeof (*ip) - 1);
+
+ ip = (ip6_header_t *) mask;
+
+#define _(a) if (a) memset (&ip->a, 0xff, sizeof (ip->a));
+ foreach_ip6_proto_field;
+#undef _
+
+ ip_version_traffic_class_and_flow_label = 0;
+
+ if (version)
+ ip_version_traffic_class_and_flow_label |= 0xF0000000;
+
+ if (traffic_class)
+ ip_version_traffic_class_and_flow_label |= 0x0FF00000;
+
+ if (flow_label)
+ ip_version_traffic_class_and_flow_label |= 0x000FFFFF;
+
+ ip->ip_version_traffic_class_and_flow_label =
+ clib_host_to_net_u32 (ip_version_traffic_class_and_flow_label);
+
+ *maskp = mask;
+ return 1;
+}
+
+uword unformat_l3_mask (unformat_input_t * input, va_list * args)
+{
+ u8 ** maskp = va_arg (*args, u8 **);
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "ip4 %U", unformat_ip4_mask, maskp))
+ return 1;
+ else if (unformat (input, "ip6 %U", unformat_ip6_mask, maskp))
+ return 1;
+ else
+ break;
+ }
+ return 0;
+}
+
+uword unformat_l2_mask (unformat_input_t * input, va_list * args)
+{
+ u8 ** maskp = va_arg (*args, u8 **);
+ u8 * mask = 0;
+ u8 src = 0;
+ u8 dst = 0;
+ u8 proto = 0;
+ u8 tag1 = 0;
+ u8 tag2 = 0;
+ u8 ignore_tag1 = 0;
+ u8 ignore_tag2 = 0;
+ u8 cos1 = 0;
+ u8 cos2 = 0;
+ u8 dot1q = 0;
+ u8 dot1ad = 0;
+ int len = 14;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "src"))
+ src = 1;
+ else if (unformat (input, "dst"))
+ dst = 1;
+ else if (unformat (input, "proto"))
+ proto = 1;
+ else if (unformat (input, "tag1"))
+ tag1 = 1;
+ else if (unformat (input, "tag2"))
+ tag2 = 1;
+ else if (unformat (input, "ignore-tag1"))
+ ignore_tag1 = 1;
+ else if (unformat (input, "ignore-tag2"))
+ ignore_tag2 = 1;
+ else if (unformat (input, "cos1"))
+ cos1 = 1;
+ else if (unformat (input, "cos2"))
+ cos2 = 1;
+ else if (unformat (input, "dot1q"))
+ dot1q = 1;
+ else if (unformat (input, "dot1ad"))
+ dot1ad = 1;
+ else
+ break;
+ }
+ if ((src + dst + proto + tag1 + tag2 + dot1q + dot1ad +
+ ignore_tag1 + ignore_tag2 + cos1 + cos2) == 0)
+ return 0;
+
+ if (tag1 || ignore_tag1 || cos1 || dot1q)
+ len = 18;
+ if (tag2 || ignore_tag2 || cos2 || dot1ad)
+ len = 22;
+
+ vec_validate (mask, len-1);
+
+ if (dst)
+ memset (mask, 0xff, 6);
+
+ if (src)
+ memset (mask + 6, 0xff, 6);
+
+ if (tag2 || dot1ad)
+ {
+ /* inner vlan tag */
+ if (tag2)
+ {
+ mask[19] = 0xff;
+ mask[18] = 0x0f;
+ }
+ if (cos2)
+ mask[18] |= 0xe0;
+ if (proto)
+ mask[21] = mask [20] = 0xff;
+ if (tag1)
+ {
+ mask [15] = 0xff;
+ mask [14] = 0x0f;
+ }
+ if (cos1)
+ mask[14] |= 0xe0;
+ *maskp = mask;
+ return 1;
+ }
+  if (tag1 || dot1q)
+ {
+ if (tag1)
+ {
+ mask [15] = 0xff;
+ mask [14] = 0x0f;
+ }
+ if (cos1)
+ mask[14] |= 0xe0;
+ if (proto)
+ mask[16] = mask [17] = 0xff;
+ *maskp = mask;
+ return 1;
+ }
+ if (cos2)
+ mask[18] |= 0xe0;
+ if (cos1)
+ mask[14] |= 0xe0;
+ if (proto)
+ mask[12] = mask [13] = 0xff;
+
+ *maskp = mask;
+ return 1;
+}
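For reference, the octet offsets assumed by the mask construction above
(and by unformat_l2_match below):

    /* L2 header layout, offsets in octets:
     *   0..5  dst MAC            6..11  src MAC
     *   untagged:       ethertype at 12..13
     *   single tagged:  tag1 TCI at 14..15, ethertype at 16..17
     *   double tagged:  tag2 TCI at 18..19, ethertype at 20..21
     * mask[14] = 0x0f, mask[15] = 0xff cover tag1's 12-bit VLAN ID;
     * the 0xe0 bits of mask[14] cover its 3 CoS (PCP) bits. */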
+
+uword unformat_classify_mask (unformat_input_t * input, va_list * args)
+{
+ vnet_classify_main_t * CLIB_UNUSED(cm)
+ = va_arg (*args, vnet_classify_main_t *);
+ u8 ** maskp = va_arg (*args, u8 **);
+ u32 * skipp = va_arg (*args, u32 *);
+ u32 * matchp = va_arg (*args, u32 *);
+ u32 match;
+ u8 * mask = 0;
+ u8 * l2 = 0;
+ u8 * l3 = 0;
+ int i;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "hex %U", unformat_hex_string, &mask))
+ ;
+ else if (unformat (input, "l2 %U", unformat_l2_mask, &l2))
+ ;
+ else if (unformat (input, "l3 %U", unformat_l3_mask, &l3))
+ ;
+ else
+ break;
+ }
+
+ if (mask || l2 || l3)
+ {
+ if (l2 || l3)
+ {
+ /* "With a free Ethernet header in every package" */
+ if (l2 == 0)
+ vec_validate (l2, 13);
+ mask = l2;
+ vec_append (mask, l3);
+ vec_free (l3);
+ }
+
+ /* Scan forward looking for the first significant mask octet */
+ for (i = 0; i < vec_len (mask); i++)
+ if (mask[i])
+ break;
+
+ /* compute (skip, match) params */
+ *skipp = i / sizeof(u32x4);
+ vec_delete (mask, *skipp * sizeof(u32x4), 0);
+
+ /* Pad mask to an even multiple of the vector size */
+ while (vec_len (mask) % sizeof (u32x4))
+ vec_add1 (mask, 0);
+
+ match = vec_len (mask) / sizeof (u32x4);
+
+ for (i = match*sizeof(u32x4); i > 0; i-= sizeof(u32x4))
+ {
+ u64 *tmp = (u64 *)(mask + (i-sizeof(u32x4)));
+ if (*tmp || *(tmp+1))
+ break;
+ match--;
+ }
+ if (match == 0)
+ clib_warning ("BUG: match 0");
+
+ _vec_len (mask) = match * sizeof(u32x4);
+
+ *matchp = match;
+ *maskp = mask;
+
+ return 1;
+ }
+
+ return 0;
+}
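A worked example of the (skip, match) computation: "mask l3 ip4 src"
yields 14 octets of zero Ethernet header plus a 20-octet IP mask whose
only nonzero octets are the source address at offsets 26..29, so:

    /* skip  = 26 / sizeof (u32x4) = 1   (first 16 octets deleted)
     * the remaining 18 octets pad to 32; the trailing all-zero u32x4
     * is trimmed by the backward scan, leaving match = 1.
     * Net effect: skip one u32x4, compare one u32x4 per packet. */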
+
+#define foreach_l2_next \
+_(drop, DROP) \
+_(ethernet, ETHERNET_INPUT) \
+_(ip4, IP4_INPUT) \
+_(ip6, IP6_INPUT) \
+_(li, LI)
+
+uword unformat_l2_next_index (unformat_input_t * input, va_list * args)
+{
+ u32 * miss_next_indexp = va_arg (*args, u32 *);
+ u32 next_index = 0;
+ u32 tmp;
+
+#define _(n,N) \
+ if (unformat (input, #n)) { next_index = L2_CLASSIFY_NEXT_##N; goto out;}
+ foreach_l2_next;
+#undef _
+
+ if (unformat (input, "%d", &tmp))
+ {
+ next_index = tmp;
+ goto out;
+ }
+
+ return 0;
+
+ out:
+ *miss_next_indexp = next_index;
+ return 1;
+}
+
+#define foreach_ip_next \
+_(miss, MISS) \
+_(drop, DROP) \
+_(local, LOCAL) \
+_(rewrite, REWRITE)
+
+uword unformat_ip_next_index (unformat_input_t * input, va_list * args)
+{
+ u32 * miss_next_indexp = va_arg (*args, u32 *);
+ u32 next_index = 0;
+ u32 tmp;
+
+#define _(n,N) \
+ if (unformat (input, #n)) { next_index = IP_LOOKUP_NEXT_##N; goto out;}
+ foreach_ip_next;
+#undef _
+
+ if (unformat (input, "%d", &tmp))
+ {
+ next_index = tmp;
+ goto out;
+ }
+
+ return 0;
+
+ out:
+ *miss_next_indexp = next_index;
+ return 1;
+}
+
+#define foreach_acl_next \
+_(deny, DENY)
+
+uword unformat_acl_next_index (unformat_input_t * input, va_list * args)
+{
+ u32 * miss_next_indexp = va_arg (*args, u32 *);
+ u32 next_index = 0;
+ u32 tmp;
+
+#define _(n,N) \
+ if (unformat (input, #n)) { next_index = ACL_NEXT_INDEX_##N; goto out;}
+ foreach_acl_next;
+#undef _
+
+ if (unformat (input, "permit"))
+ {
+ next_index = ~0;
+ goto out;
+ }
+ else if (unformat (input, "%d", &tmp))
+ {
+ next_index = tmp;
+ goto out;
+ }
+
+ return 0;
+
+ out:
+ *miss_next_indexp = next_index;
+ return 1;
+}
+
+static clib_error_t *
+classify_table_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 nbuckets = 2;
+ u32 skip = ~0;
+ u32 match = ~0;
+ int is_add = 1;
+ u32 table_index = ~0;
+ u32 next_table_index = ~0;
+ u32 miss_next_index = ~0;
+ u32 memory_size = 2<<20;
+ u32 tmp;
+
+ u8 * mask = 0;
+ vnet_classify_main_t * cm = &vnet_classify_main;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "del"))
+ is_add = 0;
+ else if (unformat (input, "buckets %d", &nbuckets))
+ ;
+ else if (unformat (input, "skip %d", &skip))
+ ;
+ else if (unformat (input, "match %d", &match))
+ ;
+ else if (unformat (input, "table %d", &table_index))
+ ;
+ else if (unformat (input, "mask %U", unformat_classify_mask,
+ cm, &mask, &skip, &match))
+ ;
+ else if (unformat (input, "memory-size %uM", &tmp))
+ memory_size = tmp<<20;
+ else if (unformat (input, "memory-size %uG", &tmp))
+ memory_size = tmp<<30;
+ else if (unformat (input, "next-table %d", &next_table_index))
+ ;
+ else if (unformat (input, "miss-next %U", unformat_ip_next_index,
+ &miss_next_index))
+ ;
+ else if (unformat (input, "l2-miss-next %U", unformat_l2_next_index,
+ &miss_next_index))
+ ;
+ else if (unformat (input, "acl-miss-next %U", unformat_acl_next_index,
+ &miss_next_index))
+ ;
+
+ else
+ break;
+ }
+
+ if (is_add && mask == 0)
+ return clib_error_return (0, "Mask required");
+
+ if (is_add && skip == ~0)
+ return clib_error_return (0, "skip count required");
+
+ if (is_add && match == ~0)
+ return clib_error_return (0, "match count required");
+
+ if (!is_add && table_index == ~0)
+ return clib_error_return (0, "table index required for delete");
+
+ rv = vnet_classify_add_del_table (cm, mask, nbuckets, memory_size,
+ skip, match, next_table_index, miss_next_index,
+ &table_index, is_add);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ default:
+ return clib_error_return (0, "vnet_classify_add_del_table returned %d",
+ rv);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (classify_table, static) = {
+ .path = "classify table",
+ .short_help =
+ "classify table [miss-next|l2-miss_next|acl-miss-next <next_index>]"
+ "\n mask <mask-value> buckets <nn> [skip <n>] [match <n>] [del]",
+ .function = classify_table_command_fn,
+};
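For example, the worked mask above corresponds to (values illustrative):

    classify table mask l3 ip4 src buckets 16 miss-next local
    classify table del table 0

The second form shows deletion, which requires an explicit table index.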
+
+static u8 * format_vnet_classify_table (u8 * s, va_list * args)
+{
+ vnet_classify_main_t * cm = va_arg (*args, vnet_classify_main_t *);
+ int verbose = va_arg (*args, int);
+ u32 index = va_arg (*args, u32);
+ vnet_classify_table_t * t;
+
+ if (index == ~0)
+ {
+ s = format (s, "%10s%10s%10s%10s", "TableIdx", "Sessions", "NextTbl",
+ "NextNode", verbose ? "Details" : "");
+ return s;
+ }
+
+ t = pool_elt_at_index (cm->tables, index);
+ s = format (s, "%10u%10d%10d%10d", index, t->active_elements,
+ t->next_table_index, t->miss_next_index);
+
+ s = format (s, "\n Heap: %U", format_mheap, t->mheap, 0 /*verbose*/);
+
+ s = format (s, "\n nbuckets %d, skip %d match %d",
+ t->nbuckets, t->skip_n_vectors, t->match_n_vectors);
+ s = format (s, "\n mask %U", format_hex_bytes, t->mask,
+ t->match_n_vectors * sizeof (u32x4));
+
+ if (verbose == 0)
+ return s;
+
+ s = format (s, "\n%U", format_classify_table, t, verbose);
+
+ return s;
+}
+
+static clib_error_t *
+show_classify_tables_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_classify_main_t * cm = &vnet_classify_main;
+ vnet_classify_table_t * t;
+ u32 match_index = ~0;
+ u32 * indices = 0;
+ int verbose = 0;
+ int i;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "index %d", &match_index))
+ ;
+ else if (unformat (input, "verbose %d", &verbose))
+ ;
+ else if (unformat (input, "verbose"))
+ verbose = 1;
+ else
+ break;
+ }
+
+ pool_foreach (t, cm->tables,
+ ({
+ if (match_index == ~0 || (match_index == t - cm->tables))
+ vec_add1 (indices, t - cm->tables);
+ }));
+
+ if (vec_len(indices))
+ {
+ vlib_cli_output (vm, "%U", format_vnet_classify_table, cm, verbose,
+ ~0 /* hdr */);
+ for (i = 0; i < vec_len (indices); i++)
+ vlib_cli_output (vm, "%U", format_vnet_classify_table, cm,
+ verbose, indices[i]);
+ }
+ else
+ vlib_cli_output (vm, "No classifier tables configured");
+
+ vec_free (indices);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_classify_table_command, static) = {
+ .path = "show classify tables",
+ .short_help = "show classify tables [index <nn>]",
+ .function = show_classify_tables_command_fn,
+};
+
+uword unformat_ip4_match (unformat_input_t * input, va_list * args)
+{
+ u8 ** matchp = va_arg (*args, u8 **);
+ u8 * match = 0;
+ ip4_header_t * ip;
+ int version = 0;
+ u32 version_val;
+ int hdr_length = 0;
+ u32 hdr_length_val;
+ int src = 0, dst = 0;
+ ip4_address_t src_val, dst_val;
+ int proto = 0;
+ u32 proto_val;
+ int tos = 0;
+ u32 tos_val;
+ int length = 0;
+ u32 length_val;
+ int fragment_id = 0;
+ u32 fragment_id_val;
+ int ttl = 0;
+ int ttl_val;
+ int checksum = 0;
+ u32 checksum_val;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "version %d", &version_val))
+ version = 1;
+ else if (unformat (input, "hdr_length %d", &hdr_length_val))
+ hdr_length = 1;
+ else if (unformat (input, "src %U", unformat_ip4_address, &src_val))
+ src = 1;
+ else if (unformat (input, "dst %U", unformat_ip4_address, &dst_val))
+ dst = 1;
+ else if (unformat (input, "proto %d", &proto_val))
+ proto = 1;
+ else if (unformat (input, "tos %d", &tos_val))
+ tos = 1;
+ else if (unformat (input, "length %d", &length_val))
+ length = 1;
+ else if (unformat (input, "fragment_id %d", &fragment_id_val))
+ fragment_id = 1;
+ else if (unformat (input, "ttl %d", &ttl_val))
+ ttl = 1;
+ else if (unformat (input, "checksum %d", &checksum_val))
+ checksum = 1;
+ else
+ break;
+ }
+
+ if (version + hdr_length + src + dst + proto + tos + length + fragment_id
+ + ttl + checksum == 0)
+ return 0;
+
+ /*
+ * Aligned because we use the real comparison functions
+ */
+ vec_validate_aligned (match, sizeof (*ip) - 1, sizeof(u32x4));
+
+ ip = (ip4_header_t *) match;
+
+ /* These are realistically matched in practice */
+ if (src)
+ ip->src_address.as_u32 = src_val.as_u32;
+
+ if (dst)
+ ip->dst_address.as_u32 = dst_val.as_u32;
+
+ if (proto)
+ ip->protocol = proto_val;
+
+ /* These are not, but they're included for completeness */
+ if (version)
+ ip->ip_version_and_header_length |= (version_val & 0xF)<<4;
+
+ if (hdr_length)
+ ip->ip_version_and_header_length |= (hdr_length_val & 0xF);
+
+ if (tos)
+ ip->tos = tos_val;
+
+ if (length)
+ ip->length = length_val;
+
+ if (ttl)
+ ip->ttl = ttl_val;
+
+ if (checksum)
+ ip->checksum = checksum_val;
+
+ *matchp = match;
+ return 1;
+}
+
+uword unformat_ip6_match (unformat_input_t * input, va_list * args)
+{
+ u8 ** matchp = va_arg (*args, u8 **);
+ u8 * match = 0;
+ ip6_header_t * ip;
+ int version = 0;
+ u32 version_val;
+ u8 traffic_class = 0;
+ u32 traffic_class_val;
+ u8 flow_label = 0;
+  u32 flow_label_val;
+ int src = 0, dst = 0;
+ ip6_address_t src_val, dst_val;
+ int proto = 0;
+ u32 proto_val;
+ int payload_length = 0;
+ u32 payload_length_val;
+ int hop_limit = 0;
+ int hop_limit_val;
+ u32 ip_version_traffic_class_and_flow_label;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "version %d", &version_val))
+ version = 1;
+ else if (unformat (input, "traffic_class %d", &traffic_class_val))
+ traffic_class = 1;
+ else if (unformat (input, "flow_label %d", &flow_label_val))
+ flow_label = 1;
+ else if (unformat (input, "src %U", unformat_ip6_address, &src_val))
+ src = 1;
+ else if (unformat (input, "dst %U", unformat_ip6_address, &dst_val))
+ dst = 1;
+ else if (unformat (input, "proto %d", &proto_val))
+ proto = 1;
+ else if (unformat (input, "payload_length %d", &payload_length_val))
+ payload_length = 1;
+ else if (unformat (input, "hop_limit %d", &hop_limit_val))
+ hop_limit = 1;
+ else
+ break;
+ }
+
+ if (version + traffic_class + flow_label + src + dst + proto +
+ payload_length + hop_limit == 0)
+ return 0;
+
+ /*
+ * Aligned because we use the real comparison functions
+ */
+ vec_validate_aligned (match, sizeof (*ip) - 1, sizeof(u32x4));
+
+ ip = (ip6_header_t *) match;
+
+ if (src)
+ memcpy (&ip->src_address, &src_val, sizeof (ip->src_address));
+
+ if (dst)
+ memcpy (&ip->dst_address, &dst_val, sizeof (ip->dst_address));
+
+ if (proto)
+ ip->protocol = proto_val;
+
+ ip_version_traffic_class_and_flow_label = 0;
+
+ if (version)
+ ip_version_traffic_class_and_flow_label |= (version_val & 0xF) << 28;
+
+ if (traffic_class)
+ ip_version_traffic_class_and_flow_label |= (traffic_class_val & 0xFF) << 20;
+
+ if (flow_label)
+ ip_version_traffic_class_and_flow_label |= (flow_label_val & 0xFFFFF);
+
+ ip->ip_version_traffic_class_and_flow_label =
+ clib_host_to_net_u32 (ip_version_traffic_class_and_flow_label);
+
+ if (payload_length)
+ ip->payload_length = clib_host_to_net_u16 (payload_length_val);
+
+ if (hop_limit)
+ ip->hop_limit = hop_limit_val;
+
+ *matchp = match;
+ return 1;
+}
+
+uword unformat_l3_match (unformat_input_t * input, va_list * args)
+{
+ u8 ** matchp = va_arg (*args, u8 **);
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "ip4 %U", unformat_ip4_match, matchp))
+ return 1;
+ else if (unformat (input, "ip6 %U", unformat_ip6_match, matchp))
+ return 1;
+ /* $$$$ add mpls */
+ else
+ break;
+ }
+ return 0;
+}
+
+uword unformat_vlan_tag (unformat_input_t * input, va_list * args)
+{
+ u8 * tagp = va_arg (*args, u8 *);
+ u32 tag;
+
+ if (unformat(input, "%d", &tag))
+ {
+ tagp[0] = (tag>>8) & 0x0F;
+ tagp[1] = tag & 0xFF;
+ return 1;
+ }
+
+ return 0;
+}
+
+uword unformat_l2_match (unformat_input_t * input, va_list * args)
+{
+ u8 ** matchp = va_arg (*args, u8 **);
+ u8 * match = 0;
+ u8 src = 0;
+ u8 src_val[6];
+ u8 dst = 0;
+ u8 dst_val[6];
+ u8 proto = 0;
+ u16 proto_val;
+ u8 tag1 = 0;
+ u8 tag1_val [2];
+ u8 tag2 = 0;
+ u8 tag2_val [2];
+ int len = 14;
+ u8 ignore_tag1 = 0;
+ u8 ignore_tag2 = 0;
+ u8 cos1 = 0;
+ u8 cos2 = 0;
+ u32 cos1_val = 0;
+ u32 cos2_val = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "src %U", unformat_ethernet_address, &src_val))
+ src = 1;
+ else if (unformat (input, "dst %U", unformat_ethernet_address, &dst_val))
+ dst = 1;
+ else if (unformat (input, "proto %U",
+ unformat_ethernet_type_host_byte_order, &proto_val))
+ proto = 1;
+ else if (unformat (input, "tag1 %U", unformat_vlan_tag, tag1_val))
+ tag1 = 1;
+ else if (unformat (input, "tag2 %U", unformat_vlan_tag, tag2_val))
+ tag2 = 1;
+ else if (unformat (input, "ignore-tag1"))
+ ignore_tag1 = 1;
+ else if (unformat (input, "ignore-tag2"))
+ ignore_tag2 = 1;
+ else if (unformat (input, "cos1 %d", &cos1_val))
+ cos1 = 1;
+ else if (unformat (input, "cos2 %d", &cos2_val))
+ cos2 = 1;
+ else
+ break;
+ }
+ if ((src + dst + proto + tag1 + tag2 +
+ ignore_tag1 + ignore_tag2 + cos1 + cos2) == 0)
+ return 0;
+
+ if (tag1 || ignore_tag1 || cos1)
+ len = 18;
+ if (tag2 || ignore_tag2 || cos2)
+ len = 22;
+
+ vec_validate_aligned (match, len-1, sizeof(u32x4));
+
+ if (dst)
+ memcpy (match, dst_val, 6);
+
+ if (src)
+ memcpy (match + 6, src_val, 6);
+
+ if (tag2)
+ {
+ /* inner vlan tag */
+ match[19] = tag2_val[1];
+ match[18] = tag2_val[0];
+ if (cos2)
+ match [18] |= (cos2_val & 0x7) << 5;
+ if (proto)
+ {
+ match[21] = proto_val & 0xff;
+ match[20] = proto_val >> 8;
+ }
+ if (tag1)
+ {
+ match [15] = tag1_val[1];
+ match [14] = tag1_val[0];
+ }
+ if (cos1)
+ match [14] |= (cos1_val & 0x7) << 5;
+ *matchp = match;
+ return 1;
+ }
+ if (tag1)
+ {
+ match [15] = tag1_val[1];
+ match [14] = tag1_val[0];
+ if (proto)
+ {
+ match[17] = proto_val & 0xff;
+ match[16] = proto_val >> 8;
+ }
+ if (cos1)
+ match [14] |= (cos1_val & 0x7) << 5;
+
+ *matchp = match;
+ return 1;
+ }
+ if (cos2)
+ match [18] |= (cos2_val & 0x7) << 5;
+ if (cos1)
+ match [14] |= (cos1_val & 0x7) << 5;
+ if (proto)
+ {
+ match[13] = proto_val & 0xff;
+ match[12] = proto_val >> 8;
+ }
+
+ *matchp = match;
+ return 1;
+}
+
+
+uword unformat_classify_match (unformat_input_t * input, va_list * args)
+{
+ vnet_classify_main_t * cm = va_arg (*args, vnet_classify_main_t *);
+ u8 ** matchp = va_arg (*args, u8 **);
+ u32 table_index = va_arg (*args, u32);
+ vnet_classify_table_t * t;
+
+ u8 * match = 0;
+ u8 * l2 = 0;
+ u8 * l3 = 0;
+
+ if (pool_is_free_index (cm->tables, table_index))
+ return 0;
+
+ t = pool_elt_at_index (cm->tables, table_index);
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "hex %U", unformat_hex_string, &match))
+ ;
+ else if (unformat (input, "l2 %U", unformat_l2_match, &l2))
+ ;
+ else if (unformat (input, "l3 %U", unformat_l3_match, &l3))
+ ;
+ else
+ break;
+ }
+
+ if (match || l2 || l3)
+ {
+ if (l2 || l3)
+ {
+ /* "Win a free Ethernet header in every packet" */
+ if (l2 == 0)
+ vec_validate_aligned (l2, 13, sizeof(u32x4));
+ match = l2;
+ vec_append_aligned (match, l3, sizeof(u32x4));
+ vec_free (l3);
+ }
+
+ /* Make sure the vector is big enough even if key is all 0's */
+ vec_validate_aligned
+ (match, ((t->match_n_vectors + t->skip_n_vectors) * sizeof(u32x4)) - 1,
+ sizeof(u32x4));
+
+      /* Set size, including skipped vectors */
+ _vec_len (match) = (t->match_n_vectors+t->skip_n_vectors) * sizeof(u32x4);
+
+ *matchp = match;
+
+ return 1;
+ }
+
+ return 0;
+}
+
+int vnet_classify_add_del_session (vnet_classify_main_t * cm,
+ u32 table_index,
+ u8 * match,
+ u32 hit_next_index,
+ u32 opaque_index,
+ i32 advance,
+ int is_add)
+{
+ vnet_classify_table_t * t;
+ vnet_classify_entry_5_t _max_e __attribute__((aligned (16)));
+ vnet_classify_entry_t * e;
+ int i, rv;
+
+ if (pool_is_free_index (cm->tables, table_index))
+ return VNET_API_ERROR_NO_SUCH_TABLE;
+
+ t = pool_elt_at_index (cm->tables, table_index);
+
+ e = (vnet_classify_entry_t *)&_max_e;
+ e->next_index = hit_next_index;
+ e->opaque_index = opaque_index;
+ e->advance = advance;
+ e->hits = 0;
+ e->last_heard = 0;
+ e->flags = 0;
+
+ /* Copy key data, honoring skip_n_vectors */
+ memcpy (&e->key, match + t->skip_n_vectors * sizeof (u32x4),
+ t->match_n_vectors * sizeof (u32x4));
+
+  /* Clear don't-care bits; this matters when sessions are created dynamically */
+ for (i = 0; i < t->match_n_vectors; i++)
+ e->key[i] &= t->mask[i];
+
+ rv = vnet_classify_add_del (t, e, is_add);
+ if (rv)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ return 0;
+}
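Note the effect of the masking step above: callers may hand in a fully
populated header, and only the bits the table masks survive into the
stored key. A hedged sketch, assuming cm and a table ti whose mask
covers only the IPv4 source address:

    /* ttl is set here but masked off at insert time, so the session
     * matches on src address alone. Values are illustrative. */
    u8 pkt[2 * sizeof (u32x4)] = { 0 };
    ip4_header_t *h = (ip4_header_t *) pkt;
    h->src_address.as_u32 = clib_host_to_net_u32 (0x0a000001); /* 10.0.0.1 */
    h->ttl = 64;
    vnet_classify_add_del_session (cm, ti, pkt, IP_LOOKUP_NEXT_DROP,
                                   42 /* opaque_index */, 0 /* advance */,
                                   1 /* is_add */);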
+
+static clib_error_t *
+classify_session_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_classify_main_t * cm = &vnet_classify_main;
+ int is_add = 1;
+ u32 table_index = ~0;
+ u32 hit_next_index = ~0;
+ u32 opaque_index = ~0;
+ u8 * match = 0;
+ i32 advance = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "del"))
+ is_add = 0;
+ else if (unformat (input, "hit-next %U", unformat_ip_next_index,
+ &hit_next_index))
+ ;
+ else if (unformat (input, "l2-hit-next %U", unformat_l2_next_index,
+ &hit_next_index))
+ ;
+ else if (unformat (input, "acl-hit-next %U", unformat_acl_next_index,
+ &hit_next_index))
+ ;
+ else if (unformat (input, "opaque-index %d", &opaque_index))
+ ;
+ else if (unformat (input, "match %U", unformat_classify_match,
+ cm, &match, table_index))
+ ;
+ else if (unformat (input, "advance %d", &advance))
+ ;
+ else if (unformat (input, "table-index %d", &table_index))
+ ;
+ else
+ break;
+ }
+
+ if (table_index == ~0)
+ return clib_error_return (0, "Table index required");
+
+ if (is_add && match == 0)
+ return clib_error_return (0, "Match value required");
+
+ rv = vnet_classify_add_del_session (cm, table_index, match,
+ hit_next_index,
+ opaque_index, advance, is_add);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+
+ default:
+ return clib_error_return (0, "vnet_classify_add_del_session returned %d",
+ rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (classify_session_command, static) = {
+ .path = "classify session",
+ .short_help =
+ "classify session [hit-next|l2-hit-next|acl-hit-next <next_index>]"
+ "\n table-index <nn> match [hex] [l2] [l3 ip4]",
+ .function = classify_session_command_fn,
+};
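For example (values illustrative; note that table-index is consumed by
unformat_classify_match at parse time, so it must appear before match on
the command line):

    classify session hit-next drop table-index 0 match l3 ip4 src 10.0.0.1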
+
+#define TEST_CODE 1
+
+#if TEST_CODE > 0
+static clib_error_t *
+test_classify_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 buckets = 2;
+ u32 sessions = 10;
+ int i, rv;
+ vnet_classify_table_t * t = 0;
+ classify_data_or_mask_t * mask;
+ classify_data_or_mask_t * data;
+ u8 *mp = 0, *dp = 0;
+ vnet_classify_main_t * cm = &vnet_classify_main;
+ vnet_classify_entry_t * e;
+ int is_add = 1;
+ u32 tmp;
+ u32 table_index = ~0;
+ ip4_address_t src;
+ u32 deleted = 0;
+ u32 memory_size = 64<<20;
+
+ /* Default starting address 1.0.0.10 */
+ src.as_u32 = clib_net_to_host_u32 (0x0100000A);
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "sessions %d", &sessions))
+ ;
+ else if (unformat (input, "src %U", unformat_ip4_address, &src))
+ ;
+ else if (unformat (input, "buckets %d", &buckets))
+ ;
+ else if (unformat (input, "memory-size %uM", &tmp))
+ memory_size = tmp<<20;
+ else if (unformat (input, "memory-size %uG", &tmp))
+ memory_size = tmp<<30;
+ else if (unformat (input, "del"))
+ is_add = 0;
+ else if (unformat (input, "table %d", &table_index))
+ ;
+ else
+ break;
+ }
+
+ vec_validate_aligned (mp, 3 * sizeof(u32x4), sizeof(u32x4));
+ vec_validate_aligned (dp, 3 * sizeof(u32x4), sizeof(u32x4));
+
+ mask = (classify_data_or_mask_t *) mp;
+ data = (classify_data_or_mask_t *) dp;
+
+ data->ip.src_address.as_u32 = src.as_u32;
+
+ /* Mask on src address */
+ memset (&mask->ip.src_address, 0xff, 4);
+
+ buckets = 1<<max_log2(buckets);
+
+ if (table_index != ~0)
+ {
+ if (pool_is_free_index (cm->tables, table_index))
+ {
+ vlib_cli_output (vm, "No such table %d", table_index);
+ goto out;
+ }
+ t = pool_elt_at_index (cm->tables, table_index);
+ }
+
+ if (is_add)
+ {
+ if (t == 0)
+ {
+ t = vnet_classify_new_table (cm, (u8 *)mask, buckets,
+ memory_size,
+ 0 /* skip */,
+ 3 /* vectors to match */);
+ t->miss_next_index = IP_LOOKUP_NEXT_LOCAL;
+ vlib_cli_output (vm, "Create table %d", t - cm->tables);
+ }
+
+ vlib_cli_output (vm, "Add %d sessions to %d buckets...",
+ sessions, buckets);
+
+ for (i = 0; i < sessions; i++)
+ {
+ rv = vnet_classify_add_del_session (cm, t - cm->tables, (u8 *) data,
+ IP_LOOKUP_NEXT_DROP,
+ i+100 /* opaque_index */,
+ 0 /* advance */,
+ 1 /* is_add */);
+
+ if (rv != 0)
+ clib_warning ("add: returned %d", rv);
+
+ tmp = clib_net_to_host_u32 (data->ip.src_address.as_u32) + 1;
+ data->ip.src_address.as_u32 = clib_net_to_host_u32 (tmp);
+ }
+ goto out;
+ }
+
+ if (t == 0)
+ {
+ vlib_cli_output (vm, "Must specify table index to delete sessions");
+ goto out;
+ }
+
+ vlib_cli_output (vm, "Try to delete %d sessions...", sessions);
+
+ for (i = 0; i < sessions; i++)
+ {
+ u8 * key_minus_skip;
+ u64 hash;
+
+ hash = vnet_classify_hash_packet (t, (u8 *) data);
+
+ e = vnet_classify_find_entry (t, (u8 *) data, hash, 0 /* time_now */);
+ /* Previous delete, perhaps... */
+ if (e == 0)
+ continue;
+ ASSERT (e->opaque_index == (i+100));
+
+ key_minus_skip = (u8 *)e->key;
+ key_minus_skip -= t->skip_n_vectors * sizeof (u32x4);
+
+ rv = vnet_classify_add_del_session (cm, t - cm->tables, key_minus_skip,
+ IP_LOOKUP_NEXT_DROP,
+ i+100 /* opaque_index */,
+ 0 /* advance */,
+ 0 /* is_add */);
+ if (rv != 0)
+ clib_warning ("del: returned %d", rv);
+
+ tmp = clib_net_to_host_u32 (data->ip.src_address.as_u32) + 1;
+ data->ip.src_address.as_u32 = clib_net_to_host_u32 (tmp);
+ deleted++;
+ }
+
+ vlib_cli_output (vm, "Deleted %d sessions...", deleted);
+
+ out:
+ vec_free (mp);
+ vec_free (dp);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_classify_command, static) = {
+ .path = "test classify",
+ .short_help =
+ "test classify [src <ip>] [sessions <nn>] [buckets <nn>] [table <nn>] [del]",
+ .function = test_classify_command_fn,
+};
+#endif /* TEST_CODE */
diff --git a/vnet/vnet/classify/vnet_classify.h b/vnet/vnet/classify/vnet_classify.h
new file mode 100644
index 00000000000..03271ad2e06
--- /dev/null
+++ b/vnet/vnet/classify/vnet_classify.h
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_vnet_classify_h__
+#define __included_vnet_classify_h__
+
+#include <stdarg.h>
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/api_errno.h> /* for API error numbers */
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/xxhash.h>
+
+vlib_node_registration_t ip4_classify_node;
+vlib_node_registration_t ip6_classify_node;
+
+#define CLASSIFY_TRACE 0
+
+struct _vnet_classify_main;
+typedef struct _vnet_classify_main vnet_classify_main_t;
+
+#define foreach_size_in_u32x4 \
+_(1) \
+_(2) \
+_(3) \
+_(4) \
+_(5)
+
+typedef CLIB_PACKED(struct _vnet_classify_entry {
+ /* Graph node next index */
+ u32 next_index;
+
+  /* put into vnet_buffer(b)->l2_classify.opaque_index */
+ union {
+ struct {
+ u32 opaque_index;
+ /* advance on hit, note it's a signed quantity... */
+ i32 advance;
+ };
+ u64 opaque_count;
+ };
+
+ /* Really only need 1 bit */
+ u32 flags;
+#define VNET_CLASSIFY_ENTRY_FREE (1<<0)
+
+ /* Hit counter, last heard time */
+ union {
+ u64 hits;
+ struct _vnet_classify_entry * next_free;
+ };
+
+ f64 last_heard;
+
+ /* Must be aligned to a 16-octet boundary */
+ u32x4 key[0];
+}) vnet_classify_entry_t;
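The fixed part of the packed entry works out to 32 octets, so entry size
is a simple function of the table's match width:

    /* next_index (4) + opaque/advance union (8) + flags (4)
     * + hits/next_free union (8) + last_heard (8) = 32 octets.
     * Entry size = 32 + match_n_vectors * sizeof (u32x4); e.g.
     * match_n_vectors == 2 gives 64-octet entries, with key[]
     * starting on the required 16-octet boundary. */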
+
+static inline int vnet_classify_entry_is_free (vnet_classify_entry_t * e)
+{
+ return e->flags & VNET_CLASSIFY_ENTRY_FREE;
+}
+
+static inline int vnet_classify_entry_is_busy (vnet_classify_entry_t * e)
+{
+ return ((e->flags & VNET_CLASSIFY_ENTRY_FREE) == 0);
+}
+
+/* Need these to con the vector allocator */
+#define _(size) \
+typedef CLIB_PACKED(struct { \
+ u32 pad0[4]; \
+ u64 pad1[2]; \
+ u32x4 key[size]; \
+}) vnet_classify_entry_##size##_t;
+foreach_size_in_u32x4;
+#undef _
+
+typedef struct {
+ union {
+ struct {
+ u32 offset;
+ u8 pad[3];
+ u8 log2_pages;
+ };
+ u64 as_u64;
+ };
+} vnet_classify_bucket_t;
+
+typedef struct {
+ /* Mask to apply after skipping N vectors */
+ u32x4 *mask;
+ /* Buckets and entries */
+ vnet_classify_bucket_t * buckets;
+ vnet_classify_entry_t * entries;
+
+ /* Config parameters */
+ u32 match_n_vectors;
+ u32 skip_n_vectors;
+ u32 nbuckets;
+ u32 log2_nbuckets;
+ int entries_per_page;
+ u32 active_elements;
+ /* Index of next table to try */
+ u32 next_table_index;
+
+  /* Miss next index, used if next_table_index == ~0 */
+ u32 miss_next_index;
+
+ /* Per-bucket working copies, one per thread */
+ vnet_classify_entry_t ** working_copies;
+ vnet_classify_bucket_t saved_bucket;
+
+ /* Free entry freelists */
+ vnet_classify_entry_t **freelists;
+
+ u8 * name;
+
+ /* Private allocation arena, protected by the writer lock */
+ void * mheap;
+
+ /* Writer (only) lock for this table */
+ volatile u32 * writer_lock;
+
+} vnet_classify_table_t;
+
+struct _vnet_classify_main {
+ /* Table pool */
+ vnet_classify_table_t * tables;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+};
+
+vnet_classify_main_t vnet_classify_main;
+
+u8 * format_classify_table (u8 * s, va_list * args);
+
+u64 vnet_classify_hash_packet (vnet_classify_table_t * t, u8 * h);
+
+static inline u64
+vnet_classify_hash_packet_inline (vnet_classify_table_t * t,
+ u8 * h)
+{
+ u32x4 *data, *mask;
+
+ union {
+ u32x4 as_u32x4;
+ u64 as_u64[2];
+ } xor_sum __attribute__((aligned(sizeof(u32x4))));
+
+ ASSERT(t);
+
+ data = (u32x4 *)h;
+ mask = t->mask;
+
+ ASSERT ((((u64)h) & 0xf) == 0);
+
+ xor_sum.as_u32x4 = data[0 + t->skip_n_vectors] & mask[0];
+
+ switch (t->match_n_vectors)
+ {
+ case 5:
+ xor_sum.as_u32x4 ^= data[4 + t->skip_n_vectors] & mask[4];
+ /* FALLTHROUGH */
+ case 4:
+ xor_sum.as_u32x4 ^= data[3 + t->skip_n_vectors] & mask[3];
+ /* FALLTHROUGH */
+ case 3:
+ xor_sum.as_u32x4 ^= data[2 + t->skip_n_vectors] & mask[2];
+ /* FALLTHROUGH */
+ case 2:
+ xor_sum.as_u32x4 ^= data[1 + t->skip_n_vectors] & mask[1];
+ /* FALLTHROUGH */
+ case 1:
+ break;
+
+ default:
+ abort();
+ }
+
+ return clib_xxhash (xor_sum.as_u64[0] ^ xor_sum.as_u64[1]);
+}
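In other words, the hash is a mask-and-XOR fold of up to five vectors
into one u32x4, then into one u64 for xxhash. A scalar sketch of the
match_n_vectors == 1 case:

    u32x4 masked = ((u32x4 *) h)[t->skip_n_vectors] & t->mask[0];
    u64 *d = (u64 *) &masked;
    u64 hash = clib_xxhash (d[0] ^ d[1]);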
+
+static inline void
+vnet_classify_prefetch_bucket (vnet_classify_table_t * t, u64 hash)
+{
+ u32 bucket_index;
+
+ ASSERT (is_pow2(t->nbuckets));
+
+ bucket_index = hash & (t->nbuckets - 1);
+
+ CLIB_PREFETCH(&t->buckets[bucket_index], CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static inline vnet_classify_entry_t *
+vnet_classify_get_entry (vnet_classify_table_t * t, uword offset)
+{
+ u8 * hp = t->mheap;
+ u8 * vp = hp + offset;
+
+ return (void *) vp;
+}
+
+static inline uword vnet_classify_get_offset (vnet_classify_table_t * t,
+ vnet_classify_entry_t * v)
+{
+ u8 * hp, * vp;
+
+ hp = (u8 *) t->mheap;
+ vp = (u8 *) v;
+
+ ASSERT((vp - hp) < 0x100000000ULL);
+ return vp - hp;
+}
+
+static inline vnet_classify_entry_t *
+vnet_classify_entry_at_index (vnet_classify_table_t * t,
+ vnet_classify_entry_t * e,
+ u32 index)
+{
+ u8 * eu8;
+
+ eu8 = (u8 *)e;
+
+ eu8 += index * (sizeof (vnet_classify_entry_t) +
+ (t->match_n_vectors * sizeof (u32x4)));
+
+ return (vnet_classify_entry_t *) eu8;
+}
+
+static inline void
+vnet_classify_prefetch_entry (vnet_classify_table_t * t,
+ u64 hash)
+{
+ u32 bucket_index;
+ u32 value_index;
+ vnet_classify_bucket_t * b;
+ vnet_classify_entry_t * e;
+
+ bucket_index = hash & (t->nbuckets - 1);
+
+ b = &t->buckets[bucket_index];
+
+ if (b->offset == 0)
+ return;
+
+ hash >>= t->log2_nbuckets;
+
+ e = vnet_classify_get_entry (t, b->offset);
+ value_index = hash & ((1<<b->log2_pages)-1);
+
+ e = vnet_classify_entry_at_index (t, e, value_index);
+
+ CLIB_PREFETCH(e, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+vnet_classify_entry_t *
+vnet_classify_find_entry (vnet_classify_table_t * t,
+ u8 * h, u64 hash, f64 now);
+
+static inline vnet_classify_entry_t *
+vnet_classify_find_entry_inline (vnet_classify_table_t * t,
+ u8 * h, u64 hash, f64 now)
+{
+ vnet_classify_entry_t * v;
+ u32x4 * mask, * data, *data_start, * key;
+ u32x4 result __attribute__((aligned(sizeof(u32x4))));
+ vnet_classify_bucket_t * b;
+ u32 value_index;
+ u32 result_mask;
+ u32 bucket_index;
+ int i;
+
+ ASSERT ((((u64)h) & 0xf) == 0);
+
+ data_start = (u32x4 *) h;
+
+ bucket_index = hash & (t->nbuckets-1);
+ b = &t->buckets[bucket_index];
+
+ if (b->offset == 0)
+ return 0;
+
+ hash >>= t->log2_nbuckets;
+
+ v = vnet_classify_get_entry (t, b->offset);
+ value_index = hash & ((1<<b->log2_pages)-1);
+
+ v = vnet_classify_entry_at_index (t, v, value_index);
+
+ for (i = 0; i < t->entries_per_page; i++)
+ {
+ mask = t->mask;
+ data = data_start;
+ key = v->key;
+
+ switch (t->match_n_vectors)
+ {
+ case 1:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ break;
+
+ case 2:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ result |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+ break;
+
+ case 3:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ result |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+ result |= (data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
+ break;
+
+ case 4:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ result |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+ result |= (data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
+ result |= (data[3 + t->skip_n_vectors] & mask[3]) ^ key[3];
+ break;
+
+ case 5:
+ result = (data[0 + t->skip_n_vectors] & mask[0]) ^ key[0];
+ result |= (data[1 + t->skip_n_vectors] & mask[1]) ^ key[1];
+ result |= (data[2 + t->skip_n_vectors] & mask[2]) ^ key[2];
+ result |= (data[3 + t->skip_n_vectors] & mask[3]) ^ key[3];
+ result |= (data[4 + t->skip_n_vectors] & mask[4]) ^ key[4];
+ break;
+
+ default:
+ abort();
+ }
+
+ result_mask = u32x4_zero_byte_mask (result);
+ if (result_mask == 0xffff)
+ {
+ if (PREDICT_TRUE(now))
+ {
+ v->hits++;
+ v->last_heard = now;
+ }
+ return (v);
+ }
+ v = vnet_classify_entry_at_index (t, v, 1);
+ }
+ return 0;
+}
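Putting the reader-side pieces together, a chain walk over linked tables
looks roughly like this (a sketch of what consumers such as the input
ACL path do; h must be 16-octet aligned, as asserted above, and
table_index, now, and cm are assumed in scope):

    u32 next_index = ~0;
    while (table_index != ~0)
      {
        vnet_classify_table_t *t =
          pool_elt_at_index (cm->tables, table_index);
        u64 hash = vnet_classify_hash_packet_inline (t, h);
        vnet_classify_entry_t *e =
          vnet_classify_find_entry_inline (t, h, hash, now);
        if (e)
          {
            next_index = e->next_index;    /* hit */
            break;
          }
        next_index = t->miss_next_index;   /* provisional miss result */
        table_index = t->next_table_index;
      }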
+
+vnet_classify_table_t *
+vnet_classify_new_table (vnet_classify_main_t *cm,
+ u8 * mask, u32 nbuckets, u32 memory_size,
+ u32 skip_n_vectors,
+ u32 match_n_vectors);
+
+int vnet_classify_add_del_session (vnet_classify_main_t * cm,
+ u32 table_index,
+ u8 * match,
+ u32 hit_next_index,
+ u32 opaque_index,
+ i32 advance,
+ int is_add);
+
+int vnet_classify_add_del_table (vnet_classify_main_t * cm,
+ u8 * mask,
+ u32 nbuckets,
+ u32 memory_size,
+ u32 skip,
+ u32 match,
+ u32 next_table_index,
+ u32 miss_next_index,
+ u32 * table_index,
+ int is_add);
+
+unformat_function_t unformat_ip4_mask;
+unformat_function_t unformat_ip6_mask;
+unformat_function_t unformat_l3_mask;
+unformat_function_t unformat_l2_mask;
+unformat_function_t unformat_classify_mask;
+unformat_function_t unformat_l2_next_index;
+unformat_function_t unformat_ip_next_index;
+unformat_function_t unformat_ip4_match;
+unformat_function_t unformat_ip6_match;
+unformat_function_t unformat_l3_match;
+unformat_function_t unformat_vlan_tag;
+unformat_function_t unformat_l2_match;
+unformat_function_t unformat_classify_match;
+
+#endif /* __included_vnet_classify_h__ */
diff --git a/vnet/vnet/config.c b/vnet/vnet/config.c
new file mode 100644
index 00000000000..74c4caa847f
--- /dev/null
+++ b/vnet/vnet/config.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * config.c: feature configuration
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+
+static vnet_config_feature_t *
+duplicate_feature_vector (vnet_config_feature_t * feature_vector)
+{
+ vnet_config_feature_t * result, * f;
+
+ result = vec_dup (feature_vector);
+ vec_foreach (f, result)
+ f->feature_config = vec_dup (f->feature_config);
+
+ return result;
+}
+
+static void
+free_feature_vector (vnet_config_feature_t * feature_vector)
+{
+ vnet_config_feature_t * f;
+
+ vec_foreach (f, feature_vector)
+ vnet_config_feature_free (f);
+ vec_free (feature_vector);
+}
+
+static u32
+add_next (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ u32 last_node_index,
+ u32 this_node_index)
+{
+ u32 i, ni = ~0;
+
+ if (last_node_index != ~0)
+ return vlib_node_add_next (vm, last_node_index, this_node_index);
+
+ for (i = 0; i < vec_len (cm->start_node_indices); i++)
+ {
+ u32 tmp;
+ tmp = vlib_node_add_next (vm, cm->start_node_indices[i], this_node_index);
+ if (ni == ~0)
+ ni = tmp;
+      /* All start nodes must agree on the next index to this node. */
+ ASSERT (ni == tmp);
+ }
+
+ return ni;
+}
+
+static vnet_config_t *
+find_config_with_features (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ vnet_config_feature_t * feature_vector)
+{
+ u32 last_node_index = ~0;
+ vnet_config_feature_t * f;
+ u32 * config_string;
+ uword * p;
+ vnet_config_t * c;
+
+ config_string = cm->config_string_temp;
+ cm->config_string_temp = 0;
+ if (config_string)
+ _vec_len (config_string) = 0;
+
+ vec_foreach (f, feature_vector)
+ {
+ /* Connect node graph. */
+ f->next_index = add_next (vm, cm, last_node_index, f->node_index);
+ last_node_index = f->node_index;
+
+ /* Store next index in config string. */
+ vec_add1 (config_string, f->next_index);
+
+ /* Store feature config. */
+ vec_add (config_string, f->feature_config, vec_len (f->feature_config));
+ }
+
+ /* Terminate config string with next for end node. */
+ if (last_node_index == ~0 || last_node_index != cm->end_node_index)
+ {
+ u32 next_index = add_next (vm, cm, last_node_index, cm->end_node_index);
+ vec_add1 (config_string, next_index);
+ }
+
+ /* See if config string is unique. */
+ p = hash_get_mem (cm->config_string_hash, config_string);
+ if (p)
+ {
+ /* Not unique. Share existing config. */
+ cm->config_string_temp = config_string; /* we'll use it again later. */
+ free_feature_vector (feature_vector);
+ c = pool_elt_at_index (cm->config_pool, p[0]);
+ }
+ else
+ {
+ u32 * d;
+
+ pool_get (cm->config_pool, c);
+ c->index = c - cm->config_pool;
+ c->features = feature_vector;
+ c->config_string_vector = config_string;
+
+ /* Allocate copy of config string in heap.
+ VLIB buffers will maintain pointers to heap as they read out
+ configuration data. */
+ c->config_string_heap_index
+ = heap_alloc (cm->config_string_heap, vec_len (config_string) + 1,
+ c->config_string_heap_handle);
+
+ /* First element in heap points back to pool index. */
+ d = vec_elt_at_index (cm->config_string_heap, c->config_string_heap_index);
+ d[0] = c->index;
+ memcpy (d + 1, config_string, vec_bytes (config_string));
+ hash_set_mem (cm->config_string_hash, config_string, c->index);
+
+ c->reference_count = 0; /* will be incremented by caller. */
+ }
+
+ return c;
+}
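+
+/* Illustrative layout note: for a config with features A and B, the heap
+   copy built above looks like
+
+     [pool index][next to A][A config u32s...][next to B][B config u32s...][next to end]
+
+   The caller-visible heap index points one word past [pool index], so
+   vnet_get_config_heap()'s p[-1] recovers the owning vnet_config_t. The
+   start node consumes [next to A] with zero data bytes; each feature node
+   then reads its own config words plus the next index that follows them. */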
+
+void vnet_config_init (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ char * start_node_names[],
+ int n_start_node_names,
+ char * feature_node_names[],
+ int n_feature_node_names)
+{
+ vlib_node_t * n;
+ u32 i;
+
+ memset (cm, 0, sizeof (cm[0]));
+
+ cm->config_string_hash = hash_create_vec (0, STRUCT_SIZE_OF (vnet_config_t, config_string_vector[0]), sizeof (uword));
+
+ ASSERT (n_start_node_names >= 1);
+ ASSERT (n_feature_node_names >= 1);
+
+ vec_resize (cm->start_node_indices, n_start_node_names);
+ for (i = 0; i < n_start_node_names; i++)
+ {
+ n = vlib_get_node_by_name (vm, (u8 *) start_node_names[i]);
+ /* Given node name must exist. */
+ ASSERT (n != 0);
+ cm->start_node_indices[i] = n->index;
+ }
+
+ vec_resize (cm->node_index_by_feature_index, n_feature_node_names);
+ for (i = 0; i < n_feature_node_names; i++)
+ {
+ if (! feature_node_names[i])
+ cm->node_index_by_feature_index[i] = ~0;
+ else
+ {
+ n = vlib_get_node_by_name (vm, (u8 *) feature_node_names[i]);
+	  /* The named node may live in a plug-in library that is not loaded,
+	     so tolerate its absence. */
+ if (n)
+ {
+ if (i + 1 == n_feature_node_names)
+ cm->end_node_index = n->index;
+ cm->node_index_by_feature_index[i] = n->index;
+ }
+ else cm->node_index_by_feature_index[i] = ~0;
+ }
+ }
+}
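+
+/* Illustrative call (hypothetical names): an IPv4 input feature might do
+
+     static char * start_nodes[] = { "ip4-input", "ip4-input-no-checksum" };
+     static char * feature_nodes[] = { "ip4-inacl", "ip4-lookup" };
+     vnet_config_init (vm, cm, start_nodes, ARRAY_LEN (start_nodes),
+                       feature_nodes, ARRAY_LEN (feature_nodes));
+
+   The last feature node ("ip4-lookup" here) becomes the end node which
+   terminates every config string. */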
+
+static void
+remove_reference (vnet_config_main_t * cm, vnet_config_t * c)
+{
+ ASSERT (c->reference_count > 0);
+ c->reference_count -= 1;
+ if (c->reference_count == 0)
+ {
+ hash_unset (cm->config_string_hash, c->config_string_vector);
+ vnet_config_free (cm, c);
+ pool_put (cm->config_pool, c);
+ }
+}
+
+always_inline u32 *
+vnet_get_config_heap (vnet_config_main_t * cm, u32 ci)
+{ return heap_elt_at_index (cm->config_string_heap, ci); }
+
+u32 vnet_config_add_feature (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ u32 config_string_heap_index,
+ u32 feature_index,
+ void * feature_config,
+ u32 n_feature_config_bytes)
+{
+ vnet_config_t * old, * new;
+ vnet_config_feature_t * new_features, * f;
+ u32 n_feature_config_u32s;
+ u32 node_index = vec_elt (cm->node_index_by_feature_index, feature_index);
+
+ if (node_index == ~0) // feature node does not exist
+ return config_string_heap_index; // return original config index
+
+ if (config_string_heap_index == ~0)
+ {
+ old = 0;
+ new_features = 0;
+ }
+ else
+ {
+ u32 * p = vnet_get_config_heap (cm, config_string_heap_index);
+ old = pool_elt_at_index (cm->config_pool, p[-1]);
+ new_features = old->features;
+ if (new_features)
+ new_features = duplicate_feature_vector (new_features);
+ }
+
+ vec_add2 (new_features, f, 1);
+ f->feature_index = feature_index;
+ f->node_index = node_index;
+
+ n_feature_config_u32s = round_pow2 (n_feature_config_bytes, sizeof (f->feature_config[0])) / sizeof (f->feature_config[0]);
+ vec_add (f->feature_config, feature_config, n_feature_config_u32s);
+
+ /* Sort (prioritize) features. */
+ if (vec_len (new_features) > 1)
+ vec_sort (new_features, f1, f2, (int) f1->feature_index - f2->feature_index);
+
+ if (old)
+ remove_reference (cm, old);
+
+ new = find_config_with_features (vm, cm, new_features);
+ new->reference_count += 1;
+
+  /* Return the heap index just past the pool-index word; reading p[-1]
+     at that index recovers the pool index this config string came from. */
+ return new->config_string_heap_index + 1;
+}
+
+u32 vnet_config_del_feature (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ u32 config_string_heap_index,
+ u32 feature_index,
+ void * feature_config,
+ u32 n_feature_config_bytes)
+{
+ vnet_config_t * old, * new;
+ vnet_config_feature_t * new_features, * f;
+ u32 n_feature_config_u32s;
+
+ {
+ u32 * p = vnet_get_config_heap (cm, config_string_heap_index);
+
+ old = pool_elt_at_index (cm->config_pool, p[-1]);
+ }
+
+ n_feature_config_u32s = round_pow2 (n_feature_config_bytes, sizeof (f->feature_config[0])) / sizeof (f->feature_config[0]);
+
+ /* Find feature with same index and opaque data. */
+ vec_foreach (f, old->features)
+ {
+ if (f->feature_index == feature_index
+ && vec_len (f->feature_config) == n_feature_config_u32s
+ && (n_feature_config_u32s == 0
+ || ! memcmp (f->feature_config, feature_config, n_feature_config_bytes)))
+ break;
+ }
+
+ /* Feature not found. */
+ if (f >= vec_end (old->features))
+ return config_string_heap_index; // return original config index
+
+ new_features = duplicate_feature_vector (old->features);
+ f = new_features + (f - old->features);
+ vnet_config_feature_free (f);
+ vec_delete (new_features, 1, f - new_features);
+
+  /* Remove old from the config pool now: if find_config_with_features()
+     below adds a new config (because no existing config has matching
+     features), the pool may expand and move in memory, invalidating the
+     old pointer. */
+ remove_reference (cm, old);
+ new = find_config_with_features (vm, cm, new_features);
+ new->reference_count += 1;
+
+ return new->config_string_heap_index + 1;
+}
diff --git a/vnet/vnet/config.h b/vnet/vnet/config.h
new file mode 100644
index 00000000000..1ace30fef88
--- /dev/null
+++ b/vnet/vnet/config.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * config.h: feature configuration
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_config_h
+#define included_vnet_config_h
+
+#include <vlib/vlib.h>
+#include <vppinfra/heap.h>
+
+typedef struct {
+ /* Features are prioritized by index. Smaller indices get
+ performed first. */
+ u32 feature_index;
+
+ /* VLIB node which performs feature. */
+ u32 node_index;
+
+ /* Next index relative to previous node or main node. */
+ u32 next_index;
+
+ /* Opaque per feature configuration data. */
+ u32 * feature_config;
+} vnet_config_feature_t;
+
+always_inline void
+vnet_config_feature_free (vnet_config_feature_t * f)
+{ vec_free (f->feature_config); }
+
+typedef struct {
+ /* Sorted vector of features for this configuration. */
+ vnet_config_feature_t * features;
+
+ /* Config string as vector for hashing. */
+ u32 * config_string_vector;
+
+  /* Heap offset and handle of the config string copy (all next indices
+     and feature data). */
+ u32 config_string_heap_index, config_string_heap_handle;
+
+ /* Index in main pool. */
+ u32 index;
+
+ /* Number of interfaces/traffic classes that reference this config. */
+ u32 reference_count;
+} vnet_config_t;
+
+typedef struct {
+ /* Pool of configs. Index 0 is always null config and is never deleted. */
+ vnet_config_t * config_pool;
+
+ /* Hash table mapping vector config string to config pool index. */
+ uword * config_string_hash;
+
+ /* Global heap of configuration data. */
+ u32 * config_string_heap;
+
+  /* Node indices which start feature processing, and the node which ends it. */
+ u32 * start_node_indices, end_node_index;
+
+ /* Interior feature processing nodes (not including start and end nodes). */
+ u32 * node_index_by_feature_index;
+
+ /* Temporary vector for holding config strings. Used to avoid continually
+ allocating vectors. */
+ u32 * config_string_temp;
+} vnet_config_main_t;
+
+always_inline void
+vnet_config_free (vnet_config_main_t * cm, vnet_config_t * c)
+{
+ vnet_config_feature_t * f;
+ vec_foreach (f, c->features)
+ vnet_config_feature_free (f);
+ vec_free (c->features);
+ heap_dealloc (cm->config_string_heap, c->config_string_heap_handle);
+ vec_free (c->config_string_vector);
+}
+
+always_inline void *
+vnet_get_config_data (vnet_config_main_t * cm,
+ u32 * config_index,
+ u32 * next_index,
+ u32 n_data_bytes)
+{
+ u32 i, n, * d;
+
+ i = *config_index;
+
+ d = heap_elt_at_index (cm->config_string_heap, i);
+
+ n = round_pow2 (n_data_bytes, sizeof (d[0])) / sizeof (d[0]);
+
+  /* The u32 immediately following this feature's data is the next index. */
+ *next_index = d[n];
+
+ /* Advance config index to next config. */
+ *config_index = (i + n + 1);
+
+ /* Return config data to user for this feature. */
+ return (void *) d;
+}
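+
+/* Usage sketch (illustrative; my_config_t is hypothetical): a feature node
+   typically pops its config like this, assuming the buffer carries a
+   current config index the way the ip input features do:
+
+     u32 next0, ci0 = vnet_buffer (b0)->ip.current_config_index;
+     my_config_t * cfg0 = vnet_get_config_data (&cm, &ci0, &next0,
+                                                sizeof (cfg0[0]));
+     vnet_buffer (b0)->ip.current_config_index = ci0;
+
+   Each call returns this feature's opaque data and advances the index to
+   the next feature in the chain; next0 is the arc to take. */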
+
+void vnet_config_init (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ char * start_node_names[],
+ int n_start_node_names,
+ char * feature_node_names[],
+ int n_feature_node_names);
+
+/* Calls to add/delete features from configurations. */
+u32 vnet_config_add_feature (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ u32 config_id,
+ u32 feature_index,
+ void * feature_config,
+ u32 n_feature_config_bytes);
+
+u32 vnet_config_del_feature (vlib_main_t * vm,
+ vnet_config_main_t * cm,
+ u32 config_id,
+ u32 feature_index,
+ void * feature_config,
+ u32 n_feature_config_bytes);
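+
+/* Configuration-time sketch (illustrative; my_cfg_t and MY_FEATURE_INDEX
+   are hypothetical):
+
+     my_cfg_t cfg = { ... };
+     ci = vnet_config_add_feature (vm, cm, ci, MY_FEATURE_INDEX,
+                                   &cfg, sizeof (cfg));
+
+   The caller stores the returned config index per interface; deleting
+   later passes the same feature index and config bytes so the matching
+   feature entry can be found and removed. */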
+
+#endif /* included_vnet_config_h */
diff --git a/vnet/vnet/devices/dpdk/cli.c b/vnet/vnet/devices/dpdk/cli.c
new file mode 100644
index 00000000000..c27dbfabfc0
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/cli.c
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls-gre/packet.h>
+
+#include "dpdk_priv.h"
+
+frame_queue_trace_t *frame_queue_traces;
+
+static clib_error_t *
+pcap_trace_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u8 * filename;
+ u32 max;
+ int matched = 0;
+ clib_error_t * error = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "on"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ if (dm->pcap_filename == 0)
+ dm->pcap_filename = format (0, "/tmp/vpe.pcap%c", 0);
+
+ memset (&dm->pcap_main, 0, sizeof (dm->pcap_main));
+ dm->pcap_main.file_name = (char *) dm->pcap_filename;
+ dm->pcap_main.n_packets_to_capture = 100;
+ if (dm->pcap_pkts_to_capture)
+ dm->pcap_main.n_packets_to_capture = dm->pcap_pkts_to_capture;
+
+ dm->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet;
+ dm->tx_pcap_enable = 1;
+ vlib_cli_output (vm, "pcap tx capture on...");
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already on...");
+ }
+ matched = 1;
+ }
+ else if (unformat (input, "off"))
+ {
+ if (dm->tx_pcap_enable)
+ {
+ vlib_cli_output (vm, "captured %d pkts...",
+ dm->pcap_main.n_packets_captured+1);
+ if (dm->pcap_main.n_packets_captured)
+ {
+ dm->pcap_main.n_packets_to_capture =
+ dm->pcap_main.n_packets_captured;
+ error = pcap_write (&dm->pcap_main);
+ if (error)
+ clib_error_report (error);
+ else
+ vlib_cli_output (vm, "saved to %s...", dm->pcap_filename);
+ }
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap tx capture already off...");
+ }
+
+ dm->tx_pcap_enable = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "max %d", &max))
+ {
+ dm->pcap_pkts_to_capture = max;
+ matched = 1;
+ }
+
+ else if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, dm->vnet_main,
+ &dm->pcap_sw_if_index))
+ matched = 1;
+ else if (unformat (input, "intfc any"))
+ {
+ dm->pcap_sw_if_index = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "file %s", &filename))
+ {
+ u8 * chroot_filename;
+ /* Brain-police user path input */
+ if (strstr((char *)filename, "..") || index((char *)filename, '/'))
+ {
+ vlib_cli_output (vm, "illegal characters in filename '%s'",
+ filename);
+ continue;
+ }
+
+ chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+ vec_free (filename);
+
+	  if (dm->pcap_filename)
+	    vec_free (dm->pcap_filename);
+	  dm->pcap_filename = chroot_filename;
+ matched = 1;
+ }
+ else if (unformat (input, "status"))
+ {
+ if (dm->tx_pcap_enable == 0)
+ {
+ vlib_cli_output (vm, "pcap tx capture is off...");
+ continue;
+ }
+
+ vlib_cli_output (vm, "pcap tx capture: %d of %d pkts...",
+ dm->pcap_main.n_packets_captured,
+ dm->pcap_main.n_packets_to_capture);
+ matched = 1;
+ }
+
+ else
+ break;
+ }
+
+ if (matched == 0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (pcap_trace_command, static) = {
+ .path = "pcap tx trace",
+ .short_help =
+ "pcap tx trace on off max <nn> intfc <intfc> file <name> status",
+ .function = pcap_trace_command_fn,
+};
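+
+/* Example session (illustrative interface name):
+
+     pcap tx trace max 1000 intfc GigabitEthernet2/0/0 file vpe.pcap
+     pcap tx trace on
+     ... send traffic ...
+     pcap tx trace off
+
+   The capture is written under /tmp (here /tmp/vpe.pcap) when tracing is
+   turned off, provided packets were captured. */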
+
+
+static clib_error_t *
+show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ struct rte_mempool * rmp;
+ int i;
+
+ for(i = 0; i < vec_len(vm->buffer_main->pktmbuf_pools); i++)
+ {
+ rmp = vm->buffer_main->pktmbuf_pools[i];
+ if (rmp)
+ {
+ unsigned count = rte_mempool_count(rmp);
+ unsigned free_count = rte_mempool_free_count(rmp);
+
+ vlib_cli_output(vm, "name=\"%s\" available = %7d allocated = %7d total = %7d\n",
+ rmp->name, (u32)count, (u32)free_count,
+ (u32)(count+free_count));
+ }
+ else
+ {
+ vlib_cli_output(vm, "rte_mempool is NULL (!)\n");
+ }
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (cmd_show_dpdk_buffer,static) = {
+ .path = "show dpdk buffer",
+ .short_help = "show dpdk buffer state",
+ .function = show_dpdk_buffer,
+};
+
+static clib_error_t *
+test_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ static u32 * allocated_buffers;
+ u32 n_alloc = 0;
+ u32 n_free = 0;
+ u32 first, actual_alloc;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "allocate %d", &n_alloc))
+ ;
+ else if (unformat (input, "free %d", &n_free))
+ ;
+ else
+ break;
+ }
+
+ if (n_free)
+ {
+ if (vec_len (allocated_buffers) < n_free)
+ return clib_error_return (0, "Can't free %d, only %d allocated",
+ n_free, vec_len (allocated_buffers));
+
+ first = vec_len(allocated_buffers) - n_free;
+ vlib_buffer_free (vm, allocated_buffers + first, n_free);
+ _vec_len (allocated_buffers) = first;
+ }
+ if (n_alloc)
+ {
+ first = vec_len (allocated_buffers);
+ vec_validate (allocated_buffers,
+ vec_len (allocated_buffers) + n_alloc - 1);
+
+ actual_alloc = vlib_buffer_alloc (vm, allocated_buffers + first,
+ n_alloc);
+ _vec_len (allocated_buffers) = first + actual_alloc;
+
+ if (actual_alloc < n_alloc)
+ vlib_cli_output (vm, "WARNING: only allocated %d buffers",
+ actual_alloc);
+ }
+
+ vlib_cli_output (vm, "Currently %d buffers allocated",
+ vec_len (allocated_buffers));
+
+ if (allocated_buffers && vec_len(allocated_buffers) == 0)
+ vec_free(allocated_buffers);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (cmd_test_dpdk_buffer,static) = {
+ .path = "test dpdk buffer",
+ .short_help = "test dpdk buffer [allocate <nn>][free <nn>]",
+ .function = test_dpdk_buffer,
+};
+
+static void
+show_dpdk_device_stats (vlib_main_t * vm, dpdk_device_t * xd)
+{
+ vlib_cli_output(vm,
+ "device_index %d\n"
+ " last_burst_sz %d\n"
+ " max_burst_sz %d\n"
+ " full_frames_cnt %u\n"
+ " consec_full_frames_cnt %u\n"
+ " congestion_cnt %d\n"
+ " last_poll_time %llu\n"
+ " max_poll_delay %llu\n"
+ " discard_cnt %u\n"
+ " total_packet_cnt %u\n",
+ xd->device_index,
+ xd->efd_agent.last_burst_sz,
+ xd->efd_agent.max_burst_sz,
+ xd->efd_agent.full_frames_cnt,
+ xd->efd_agent.consec_full_frames_cnt,
+ xd->efd_agent.congestion_cnt,
+ xd->efd_agent.last_poll_time,
+ xd->efd_agent.max_poll_delay,
+ xd->efd_agent.discard_cnt,
+ xd->efd_agent.total_packet_cnt);
+
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ 0 /* queue_id */);
+ vlib_cli_output(vm,
+ " device_queue_sz %u\n",
+ device_queue_sz);
+}
+
+
+/*
+ * Trigger threads to grab frame queue trace data
+ */
+static clib_error_t *
+trace_frame_queue (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ frame_queue_trace_t *fqt;
+ u32 num_fq;
+ u32 fqix;
+ u32 enable = 0;
+
+ if (unformat(input, "on")) {
+ enable = 1;
+ } else if (unformat(input, "off")) {
+ enable = 0;
+ } else {
+ return clib_error_return(0, "expecting on or off");
+ }
+
+ num_fq = vec_len(vlib_frame_queues);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No frame queues exist\n");
+ return error;
+ }
+
+ // Allocate storage for trace if necessary
+ vec_validate_aligned(frame_queue_traces, num_fq-1, CLIB_CACHE_LINE_BYTES);
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ fqt = &frame_queue_traces[fqix];
+
+ memset(fqt->n_vectors, 0xff, sizeof(fqt->n_vectors));
+ fqt->written = 0;
+ vlib_frame_queues[fqix]->trace = enable;
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_trace_frame_queue,static) = {
+ .path = "trace frame-queue",
+ .short_help = "trace frame-queue (on|off)",
+ .function = trace_frame_queue,
+};
+
+
+/*
+ * Display frame queue trace data gathered by threads.
+ */
+static clib_error_t *
+show_frame_queue (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ frame_queue_trace_t *fqt;
+ u32 num_fq;
+ u32 fqix;
+
+ num_fq = vec_len(frame_queue_traces);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No trace data for frame queues\n");
+ return error;
+ }
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ fqt = &frame_queue_traces[fqix];
+
+ vlib_cli_output(vm, "Thread %d %v\n", fqix, vlib_worker_threads[fqix].name);
+
+ if (fqt->written == 0)
+ {
+ vlib_cli_output(vm, " no trace data\n");
+ continue;
+ }
+
+ vlib_cli_output(vm, " vector-threshold %d ring size %d in use %d\n",
+ fqt->threshold, fqt->nelts, fqt->n_in_use);
+ vlib_cli_output(vm, " head %12d head_hint %12d tail %12d\n",
+ fqt->head, fqt->head_hint, fqt->tail);
+ vlib_cli_output(vm, " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n",
+ fqt->n_vectors[0], fqt->n_vectors[1], fqt->n_vectors[2], fqt->n_vectors[3],
+ fqt->n_vectors[4], fqt->n_vectors[5], fqt->n_vectors[6], fqt->n_vectors[7],
+ fqt->n_vectors[8], fqt->n_vectors[9], fqt->n_vectors[10], fqt->n_vectors[11],
+ fqt->n_vectors[12], fqt->n_vectors[13], fqt->n_vectors[14], fqt->n_vectors[15]);
+
+ if (fqt->nelts > 16)
+ {
+ vlib_cli_output(vm, " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n",
+ fqt->n_vectors[16], fqt->n_vectors[17], fqt->n_vectors[18], fqt->n_vectors[19],
+ fqt->n_vectors[20], fqt->n_vectors[21], fqt->n_vectors[22], fqt->n_vectors[23],
+ fqt->n_vectors[24], fqt->n_vectors[25], fqt->n_vectors[26], fqt->n_vectors[27],
+ fqt->n_vectors[28], fqt->n_vectors[29], fqt->n_vectors[30], fqt->n_vectors[31]);
+ }
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_show_frame_queue,static) = {
+ .path = "show frame-queue",
+ .short_help = "show frame-queue trace",
+ .function = show_frame_queue,
+};
+
+
+/*
+ * Modify the number of elements on the frame_queues
+ */
+static clib_error_t *
+test_frame_queue_nelts (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ u32 num_fq;
+ u32 fqix;
+ u32 nelts = 0;
+
+ unformat(input, "%d", &nelts);
+ if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32)) {
+ return clib_error_return(0, "expecting 4,8,16,32");
+ }
+
+ num_fq = vec_len(vlib_frame_queues);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No frame queues exist\n");
+ return error;
+ }
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ vlib_frame_queues[fqix]->nelts = nelts;
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_test_frame_queue_nelts,static) = {
+ .path = "test frame-queue nelts",
+ .short_help = "test frame-queue nelts (4,8,16,32)",
+ .function = test_frame_queue_nelts,
+};
+
+
+/*
+ * Modify the max number of packets pulled off the frame queues
+ */
+static clib_error_t *
+test_frame_queue_threshold (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ clib_error_t * error = NULL;
+ u32 num_fq;
+ u32 fqix;
+ u32 threshold = 0;
+
+    if (! unformat(input, "%d", &threshold)) {
+        vlib_cli_output(vm, "expecting threshold value\n");
+        return error;
+    }
+
+ if (threshold == 0)
+ threshold = ~0;
+
+ num_fq = vec_len(vlib_frame_queues);
+ if (num_fq == 0)
+ {
+ vlib_cli_output(vm, "No frame queues exist\n");
+ return error;
+ }
+
+ for (fqix=0; fqix<num_fq; fqix++) {
+ vlib_frame_queues[fqix]->vector_threshold = threshold;
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_test_frame_queue_threshold,static) = {
+ .path = "test frame-queue threshold",
+ .short_help = "test frame-queue threshold N (0=no limit)",
+ .function = test_frame_queue_threshold,
+};
+
+static void
+show_efd_config (vlib_main_t * vm)
+{
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ dpdk_main_t * dm = &dpdk_main;
+
+ vlib_cli_output(vm,
+ "dpdk: (0x%04x) enabled:%d monitor:%d drop_all:%d\n"
+ " dpdk_queue_hi_thresh %d\n"
+ " consec_full_frames_hi_thresh %d\n"
+ "---------\n"
+ "worker: (0x%04x) enabled:%d monitor:%d\n"
+ " worker_queue_hi_thresh %d\n",
+ dm->efd.enabled,
+ ((dm->efd.enabled & DPDK_EFD_DISCARD_ENABLED) ? 1:0),
+ ((dm->efd.enabled & DPDK_EFD_MONITOR_ENABLED) ? 1:0),
+ ((dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED) ? 1:0),
+ dm->efd.queue_hi_thresh,
+ dm->efd.consec_full_frames_hi_thresh,
+ tm->efd.enabled,
+ ((tm->efd.enabled & VLIB_EFD_DISCARD_ENABLED) ? 1:0),
+                    ((tm->efd.enabled & VLIB_EFD_MONITOR_ENABLED) ? 1:0),
+ tm->efd.queue_hi_thresh);
+ vlib_cli_output(vm,
+ "---------\n"
+ "ip_prec_bitmap 0x%02x\n"
+ "mpls_exp_bitmap 0x%02x\n"
+ "vlan_cos_bitmap 0x%02x\n",
+ tm->efd.ip_prec_bitmap,
+ tm->efd.mpls_exp_bitmap,
+ tm->efd.vlan_cos_bitmap);
+}
+
+static clib_error_t *
+show_efd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+
+ if (unformat(input, "config")) {
+ show_efd_config(vm);
+ } else if (unformat(input, "dpdk")) {
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ u32 device_id = ~0;
+
+ unformat(input, "device %d", &device_id);
+ vec_foreach (xd, dm->devices) {
+ if ((xd->device_index == device_id) || (device_id == ~0)) {
+ show_dpdk_device_stats(vm, xd);
+ }
+ }
+ } else if (unformat(input, "worker")) {
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_frame_queue_t *fq;
+ vlib_thread_registration_t * tr;
+ int thread_id;
+ u32 num_workers = 0;
+ u32 first_worker_index = 0;
+ uword * p;
+
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ ASSERT (p);
+ tr = (vlib_thread_registration_t *) p[0];
+ if (tr)
+ {
+ num_workers = tr->count;
+ first_worker_index = tr->first_index;
+ }
+
+ vlib_cli_output(vm,
+ "num_workers %d\n"
+ "first_worker_index %d\n"
+ "vlib_frame_queues[%d]:\n",
+ num_workers,
+ first_worker_index,
+ tm->n_vlib_mains);
+
+ for (thread_id = 0; thread_id < tm->n_vlib_mains; thread_id++) {
+ fq = vlib_frame_queues[thread_id];
+ if (fq) {
+ vlib_cli_output(vm,
+ "%2d: frames_queued %u\n"
+ " frames_queued_hint %u\n"
+ " enqueue_full_events %u\n"
+ " enqueue_efd_discards %u\n",
+ thread_id,
+ (fq->tail - fq->head),
+ (fq->tail - fq->head_hint),
+ fq->enqueue_full_events,
+ fq->enqueue_efd_discards);
+ }
+ }
+ } else if (unformat(input, "help")) {
+ vlib_cli_output(vm, "Usage: show efd config | "
+ "dpdk [device <id>] | worker\n");
+ } else {
+ show_efd_config(vm);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_efd_command, static) = {
+ .path = "show efd",
+ .short_help = "Show efd [device <id>] | [config]",
+ .function = show_efd,
+};
+
+static clib_error_t *
+clear_efd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_frame_queue_t *fq;
+ int thread_id;
+
+ vec_foreach (xd, dm->devices) {
+ xd->efd_agent.last_burst_sz = 0;
+ xd->efd_agent.max_burst_sz = 0;
+ xd->efd_agent.full_frames_cnt = 0;
+ xd->efd_agent.consec_full_frames_cnt = 0;
+ xd->efd_agent.congestion_cnt = 0;
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.max_poll_delay = 0;
+ xd->efd_agent.discard_cnt = 0;
+ xd->efd_agent.total_packet_cnt = 0;
+ }
+
+ for (thread_id = 0; thread_id < tm->n_vlib_mains; thread_id++) {
+ fq = vlib_frame_queues[thread_id];
+ if (fq) {
+ fq->enqueue_full_events = 0;
+ fq->enqueue_efd_discards = 0;
+ }
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (clear_efd_command,static) = {
+ .path = "clear efd",
+ .short_help = "Clear early-fast-discard counters",
+ .function = clear_efd,
+};
+
+static clib_error_t *
+parse_op_and_prec (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd,
+ char *prec_type, u8 *prec_bitmap)
+{
+ clib_error_t * error = NULL;
+ u8 op = 0;
+ u8 prec = 0;
+
+ if (unformat(input, "ge")) {
+ op = EFD_OPERATION_GREATER_OR_EQUAL;
+ } else if (unformat(input, "lt")) {
+ op = EFD_OPERATION_LESS_THAN;
+ } else if (unformat(input, "help")) {
+ vlib_cli_output(vm,
+ "enter operation [ge | lt] and precedence <0-7>)");
+ return (error);
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (unformat (input, "%u", &prec)) {
+ if (prec > 7) {
+ return clib_error_return(0, "precedence %d is out of range <0-7>",
+ prec);
+ }
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ set_efd_bitmap(prec_bitmap, prec, op);
+
+ vlib_cli_output(vm,
+ "EFD will be set for %s precedence %s%u%s.",
+ prec_type,
+ (op == EFD_OPERATION_LESS_THAN) ? "less than " : "",
+ prec,
+ (op == EFD_OPERATION_GREATER_OR_EQUAL) ? " and greater" : "");
+
+ return (error);
+}
+
+
+static clib_error_t *
+set_efd (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ clib_error_t * error = NULL;
+
+ if (unformat(input, "enable")) {
+ if (unformat(input, "dpdk")) {
+ dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "worker")) {
+ tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "monitor")) {
+ dm->efd.enabled |= DPDK_EFD_MONITOR_ENABLED;
+ tm->efd.enabled |= VLIB_EFD_MONITOR_ENABLED;
+ } else if (unformat(input, "drop_all")) {
+ dm->efd.enabled |= DPDK_EFD_DROPALL_ENABLED;
+ } else if (unformat(input, "default")) {
+ dm->efd.enabled = (DPDK_EFD_DISCARD_ENABLED |
+ DPDK_EFD_MONITOR_ENABLED);
+ tm->efd.enabled = (VLIB_EFD_DISCARD_ENABLED |
+ VLIB_EFD_MONITOR_ENABLED);
+ } else {
+ return clib_error_return(0, "Usage: set efd enable [dpdk | "
+ "worker | monitor | drop_all | default]");
+ }
+ } else if (unformat(input, "disable")) {
+ if (unformat(input, "dpdk")) {
+ dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "worker")) {
+ tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED;
+ } else if (unformat(input, "monitor")) {
+ dm->efd.enabled &= ~DPDK_EFD_MONITOR_ENABLED;
+ tm->efd.enabled &= ~VLIB_EFD_MONITOR_ENABLED;
+ } else if (unformat(input, "drop_all")) {
+ dm->efd.enabled &= ~DPDK_EFD_DROPALL_ENABLED;
+ } else if (unformat(input, "all")) {
+ dm->efd.enabled = 0;
+ tm->efd.enabled = 0;
+ } else {
+ return clib_error_return(0, "Usage: set efd disable [dpdk | "
+ "worker | monitor | drop_all | all]");
+ }
+ } else if (unformat(input, "worker_queue_hi_thresh")) {
+ u32 mark;
+ if (unformat (input, "%u", &mark)) {
+ tm->efd.queue_hi_thresh = mark;
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ } else if (unformat(input, "dpdk_device_hi_thresh")) {
+ u32 thresh;
+ if (unformat (input, "%u", &thresh)) {
+ dm->efd.queue_hi_thresh = thresh;
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ } else if (unformat(input, "consec_full_frames_hi_thresh")) {
+ u32 thresh;
+ if (unformat (input, "%u", &thresh)) {
+ dm->efd.consec_full_frames_hi_thresh = thresh;
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ } else if (unformat(input, "ip-prec")) {
+ return (parse_op_and_prec(vm, input, cmd,
+ "ip", &tm->efd.ip_prec_bitmap));
+ } else if (unformat(input, "mpls-exp")) {
+ return (parse_op_and_prec(vm, input, cmd,
+ "mpls", &tm->efd.mpls_exp_bitmap));
+ } else if (unformat(input, "vlan-cos")) {
+ return (parse_op_and_prec(vm, input, cmd,
+ "vlan", &tm->efd.vlan_cos_bitmap));
+ } else if (unformat(input, "help")) {
+ vlib_cli_output(vm,
+ "Usage:\n"
+ " set efd enable <dpdk | worker | monitor | drop_all | default> |\n"
+ " set efd disable <dpdk | worker | monitor | drop_all | all> |\n"
+ " set efd <ip-prec | mpls-exp | vlan-cos> <ge | lt> <0-7>\n"
+ " set efd worker_queue_hi_thresh <0-32> |\n"
+ " set efd dpdk_device_hi_thresh <0-%d> |\n"
+ " set efd consec_full_frames_hi_thresh <count> |\n",
+ DPDK_NB_RX_DESC_10GE);
+ } else {
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (cmd_set_efd,static) = {
+ .path = "set efd",
+ .short_help = "set early-fast-discard commands",
+ .function = set_efd,
+};
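+
+/* Example session (illustrative):
+
+     set efd enable default
+     set efd ip-prec ge 5
+     show efd config
+
+   This enables the discard and monitor modes and marks IP precedence
+   values 5 and above for early discard under congestion. */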
+
+static clib_error_t *
+set_dpdk_if_desc (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_hw_interface_t * hw;
+ dpdk_device_t * xd;
+ u32 hw_if_index = (u32) ~0;
+ u32 nb_rx_desc = (u32) ~0;
+ u32 nb_tx_desc = (u32) ~0;
+ clib_error_t * rv;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "tx %d", &nb_tx_desc))
+ ;
+ else if (unformat (line_input, "rx %d", &nb_rx_desc))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return clib_error_return (0, "number of descriptors can be set only for "
+ "physical devices");
+
+ if ((nb_rx_desc == (u32) ~0 || nb_rx_desc == xd->nb_rx_desc) &&
+ (nb_tx_desc == (u32) ~0 || nb_tx_desc == xd->nb_tx_desc))
+ return clib_error_return (0, "nothing changed");
+
+ if (nb_rx_desc != (u32) ~0)
+ xd->nb_rx_desc = nb_rx_desc;
+
+ if (nb_tx_desc != (u32) ~0)
+    xd->nb_tx_desc = nb_tx_desc;
+
+ rv = dpdk_port_setup(dm, xd);
+
+  return rv;
+}
+
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_desc,static) = {
+ .path = "set dpdk interface descriptors",
+ .short_help = "set dpdk interface descriptors <if-name> [rx <n>] [tx <n>]",
+ .function = set_dpdk_if_desc,
+};
+
+static clib_error_t *
+show_dpdk_if_placement (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_and_queue_t * dq;
+ int cpu;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_cli_output(vm, "All interfaces are handled by main thread");
+
+ for(cpu = 0; cpu < vec_len(dm->devices_by_cpu); cpu++)
+ {
+ if (vec_len(dm->devices_by_cpu[cpu]))
+ vlib_cli_output(vm, "Thread %u (%s at lcore %u):", cpu,
+ vlib_worker_threads[cpu].name,
+ vlib_worker_threads[cpu].dpdk_lcore_id);
+
+ vec_foreach(dq, dm->devices_by_cpu[cpu])
+ {
+ u32 hw_if_index = dm->devices[dq->device].vlib_hw_if_index;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface(dm->vnet_main, hw_if_index);
+ vlib_cli_output(vm, " %v queue %u", hi->name, dq->queue_id);
+ }
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (cmd_show_dpdk_if_placement,static) = {
+ .path = "show dpdk interface placement",
+ .short_help = "show dpdk interface placement",
+ .function = show_dpdk_if_placement,
+};
+
+static int
+dpdk_device_queue_sort(void * a1, void * a2)
+{
+ dpdk_device_and_queue_t * dq1 = a1;
+ dpdk_device_and_queue_t * dq2 = a2;
+
+ if (dq1->device > dq2->device)
+ return 1;
+ else if (dq1->device < dq2->device)
+ return -1;
+ else if (dq1->queue_id > dq2->queue_id)
+ return 1;
+ else if (dq1->queue_id < dq2->queue_id)
+ return -1;
+ else
+ return 0;
+}
+
+static clib_error_t *
+set_dpdk_if_placement (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_and_queue_t * dq;
+ vnet_hw_interface_t * hw;
+ dpdk_device_t * xd;
+ u32 hw_if_index = (u32) ~0;
+ u32 queue = (u32) 0;
+ u32 cpu = (u32) ~0;
+ int i;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "%U", unformat_vnet_hw_interface, dm->vnet_main,
+ &hw_if_index))
+ ;
+ else if (unformat (line_input, "queue %d", &queue))
+ ;
+ else if (unformat (line_input, "thread %d", &cpu))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (hw_if_index == (u32) ~0)
+ return clib_error_return (0, "please specify valid interface name");
+
+ if (cpu < dm->input_cpu_first_index ||
+ cpu >= (dm->input_cpu_first_index + dm->input_cpu_count))
+ return clib_error_return (0, "please specify valid thread id");
+
+ hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index);
+ xd = vec_elt_at_index (dm->devices, hw->dev_instance);
+
+ for(i = 0; i < vec_len(dm->devices_by_cpu); i++)
+ {
+ vec_foreach(dq, dm->devices_by_cpu[i])
+ {
+ if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index &&
+ queue == dq->queue_id)
+ {
+ if (cpu == i) /* nothing to do */
+ return 0;
+
+ vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]);
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->queue_id = queue;
+ dq->device = xd->device_index;
+ xd->cpu_socket_id_by_queue[queue] =
+ rte_lcore_to_socket_id(vlib_worker_threads[cpu].dpdk_lcore_id);
+
+ vec_sort_with_function(dm->devices_by_cpu[i],
+ dpdk_device_queue_sort);
+
+ vec_sort_with_function(dm->devices_by_cpu[cpu],
+ dpdk_device_queue_sort);
+
+ if (vec_len(dm->devices_by_cpu[i]) == 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_DISABLED);
+
+ if (vec_len(dm->devices_by_cpu[cpu]) == 1)
+ vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ return 0;
+ }
+ }
+ }
+
+ return clib_error_return (0, "not found");
+}
+
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
+ .path = "set dpdk interface placement",
+ .short_help = "set dpdk interface placement <if-name> [queue <n>] thread <n>",
+ .function = set_dpdk_if_placement,
+};
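+
+/* Example (illustrative interface name):
+
+     set dpdk interface placement TenGigabitEthernet2/0/0 queue 0 thread 2
+
+   This moves the device's rx queue 0 to worker thread 2, disables polling
+   on the old thread if it has no queues left, and enables it on the new
+   one if this is its first queue. */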
+
+clib_error_t *
+dpdk_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_cli_init);
diff --git a/vnet/vnet/devices/dpdk/device.c b/vnet/vnet/devices/dpdk/device.c
new file mode 100644
index 00000000000..a19c3131ef9
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/device.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/format.h>
+#include <vlib/unix/cj.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include "dpdk_priv.h"
+#include <vppinfra/error.h>
+
+#define foreach_dpdk_tx_func_error \
+ _(BAD_RETVAL, "DPDK tx function returned an error") \
+ _(RING_FULL, "Tx packet drops (ring full)") \
+ _(PKT_DROP, "Tx packet drops (dpdk tx failure)") \
+ _(REPL_FAIL, "Tx packet drops (replication failure)")
+
+typedef enum {
+#define _(f,s) DPDK_TX_FUNC_ERROR_##f,
+ foreach_dpdk_tx_func_error
+#undef _
+ DPDK_TX_FUNC_N_ERROR,
+} dpdk_tx_func_error_t;
+
+static char * dpdk_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_tx_func_error
+#undef _
+};
+
+static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ struct rte_mbuf * first_mb = 0, * new_mb, * pkt_mb, ** prev_mb_next = 0;
+ u8 nb_segs, nb_segs_left;
+ u32 copy_bytes;
+ unsigned socket_id = rte_socket_id();
+
+ ASSERT (bm->pktmbuf_pools[socket_id]);
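+  /* In this buffer scheme the rte_mbuf header sits immediately before the
+     vlib_buffer_t in memory, so stepping back one mbuf recovers it. */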
+ pkt_mb = ((struct rte_mbuf *)b)-1;
+ nb_segs = pkt_mb->nb_segs;
+ for (nb_segs_left = nb_segs; nb_segs_left; nb_segs_left--)
+ {
+ if (PREDICT_FALSE(pkt_mb == 0))
+ {
+ clib_warning ("Missing %d mbuf chain segment(s): "
+ "(nb_segs = %d, nb_segs_left = %d)!",
+ nb_segs - nb_segs_left, nb_segs, nb_segs_left);
+ if (first_mb)
+ rte_pktmbuf_free(first_mb);
+ return NULL;
+ }
+ new_mb = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]);
+ if (PREDICT_FALSE(new_mb == 0))
+ {
+ if (first_mb)
+ rte_pktmbuf_free(first_mb);
+ return NULL;
+ }
+
+ /*
+ * Copy packet info into 1st segment.
+ */
+ if (first_mb == 0)
+ {
+ first_mb = new_mb;
+ rte_pktmbuf_pkt_len (first_mb) = pkt_mb->pkt_len;
+ first_mb->nb_segs = pkt_mb->nb_segs;
+ first_mb->port = pkt_mb->port;
+#ifdef DAW_FIXME // TX Offload support TBD
+ first_mb->vlan_macip = pkt_mb->vlan_macip;
+ first_mb->hash = pkt_mb->hash;
+          first_mb->ol_flags = pkt_mb->ol_flags;
+#endif
+ }
+ else
+ {
+ ASSERT(prev_mb_next != 0);
+ *prev_mb_next = new_mb;
+ }
+
+ /*
+ * Copy packet segment data into new mbuf segment.
+ */
+ rte_pktmbuf_data_len (new_mb) = pkt_mb->data_len;
+ copy_bytes = pkt_mb->data_len + RTE_PKTMBUF_HEADROOM;
+ ASSERT(copy_bytes <= pkt_mb->buf_len);
+ memcpy(new_mb->buf_addr, pkt_mb->buf_addr, copy_bytes);
+
+ prev_mb_next = &new_mb->next;
+ pkt_mb = pkt_mb->next;
+ }
+
+ ASSERT(pkt_mb == 0);
+ __rte_mbuf_sanity_check(first_mb, 1);
+
+ return first_mb;
+}
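+
+/* This deep copy is used on the tx path when a buffer is cloned
+   (b->clone_count != 0): the original data may still be referenced by
+   another user, so a private mbuf chain is built before handing the
+   packet to the NIC. */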
+
+typedef struct {
+ u32 buffer_index;
+ u16 device_index;
+ u8 queue_index;
+ struct rte_mbuf mb;
+ /* Copy of VLIB buffer; packet data stored in pre_data. */
+ vlib_buffer_t buffer;
+} dpdk_tx_dma_trace_t;
+
+static void
+dpdk_tx_trace_buffer (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id,
+ u32 buffer_index,
+ vlib_buffer_t * buffer)
+{
+ vlib_main_t * vm = vlib_get_main();
+ dpdk_tx_dma_trace_t * t0;
+ struct rte_mbuf * mb;
+
+ mb = ((struct rte_mbuf *)buffer)-1;
+
+ t0 = vlib_add_trace (vm, node, buffer, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = buffer_index;
+ memcpy (&t0->mb, mb, sizeof (t0->mb));
+ memcpy (&t0->buffer, buffer, sizeof (buffer[0]) - sizeof (buffer->pre_data));
+ memcpy (t0->buffer.pre_data, buffer->data + buffer->current_data,
+ sizeof (t0->buffer.pre_data));
+}
+
+/*
+ * This function calls the dpdk's tx_burst function to transmit the packets
+ * on the tx_vector. It manages a lock per-device if the device does not
+ * support multiple queues. It returns the number of packets untransmitted
+ * on the tx_vector. If all packets are transmitted (the normal case), the
+ * function returns 0.
+ *
+ * The tx_burst function may not be able to transmit all packets because the
+ * dpdk ring is full. If a flowcontrol callback function has been configured
+ * then the function simply returns. If no callback has been configured, the
+ * function will retry calling tx_burst with the remaining packets. This will
+ * continue until all packets are transmitted or tx_burst indicates no packets
+ * could be transmitted. (The caller can drop the remaining packets.)
+ *
+ * The function assumes there is at least one packet on the tx_vector.
+ */
+static_always_inline
+u32 tx_burst_vector_internal (vlib_main_t * vm,
+ dpdk_device_t * xd,
+ struct rte_mbuf ** tx_vector)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u32 n_packets;
+ u32 tx_head;
+ u32 tx_tail;
+ u32 n_retry;
+ int rv;
+ int queue_id;
+ tx_ring_hdr_t *ring;
+
+ ring = vec_header(tx_vector, sizeof(*ring));
+
+ n_packets = ring->tx_head - ring->tx_tail;
+
+ tx_head = ring->tx_head % DPDK_TX_RING_SIZE;
+
+ /*
+ * Ensure rte_eth_tx_burst is not called with 0 packets, which can lead to
+ * unpredictable results.
+ */
+ ASSERT(n_packets > 0);
+
+ /*
+ * Check for tx_vector overflow. If this fails it is a system configuration
+   * error. The ring should be sized large enough to hold the largest burst
+   * that arrives while a traffic manager has the interface flowed off. A
+   * larger size also helps performance a bit because it decreases the
+   * probability of having to issue two tx_burst calls due to a ring wrap.
+ */
+ ASSERT(n_packets < DPDK_TX_RING_SIZE);
+
+ /*
+ * If there is no flowcontrol callback, there is only temporary buffering
+ * on the tx_vector and so the tail should always be 0.
+ */
+ ASSERT(dm->flowcontrol_callback || ring->tx_tail == 0);
+
+ /*
+ * If there is a flowcontrol callback, don't retry any incomplete tx_bursts.
+ * Apply backpressure instead. If there is no callback, keep retrying until
+ * a tx_burst sends no packets. n_retry of 255 essentially means no retry
+ * limit.
+ */
+ n_retry = dm->flowcontrol_callback ? 0 : 255;
+
+ queue_id = vm->cpu_index;
+
+ do {
+ /* start the burst at the tail */
+ tx_tail = ring->tx_tail % DPDK_TX_RING_SIZE;
+
+ /*
+ * This device only supports one TX queue,
+ * and we're running multi-threaded...
+ */
+ if (PREDICT_FALSE(xd->lockp != 0))
+ {
+ queue_id = 0;
+ while (__sync_lock_test_and_set (xd->lockp, 1))
+ /* zzzz */;
+ }
+
+ if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
+ {
+ if (PREDICT_TRUE(tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_eth_tx_burst(xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head-tx_tail));
+ }
+ else
+ {
+ /*
+ * This can only happen if there is a flowcontrol callback.
+ * We need to split the transmit into two calls: one for
+ * the packets up to the wrap point, and one to continue
+ * at the start of the ring.
+ * Transmit pkts up to the wrap point.
+ */
+ rv = rte_eth_tx_burst(xd->device_index,
+ (uint16_t) queue_id,
+ &tx_vector[tx_tail],
+ (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
+
+ /*
+ * If we transmitted everything we wanted, then allow 1 retry
+ * so we can try to transmit the rest. If we didn't transmit
+ * everything, stop now.
+ */
+ n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
+ }
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ {
+ if (PREDICT_TRUE(tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head-tx_tail));
+ if (PREDICT_TRUE(rv > 0))
+ {
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+ vring->n_since_last_int += rv;
+
+ if (vring->n_since_last_int > dm->vhost_coalesce_frames)
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+ }
+
+ int c = rv;
+ while(c--)
+ rte_pktmbuf_free (tx_vector[tx_tail+c]);
+ }
+ }
+ else
+ {
+              /*
+               * Wrap case: this can only happen if there is a flowcontrol
+               * callback. Split the transmit into two calls and send the
+               * packets up to the wrap point here; the retry, if allowed
+               * below, continues at the start of the ring.
+               */
+ rv = rte_vhost_enqueue_burst(&xd->vu_vhost_dev, VIRTIO_RXQ,
+ &tx_vector[tx_tail],
+ (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
+
+ if (PREDICT_TRUE(rv > 0))
+ {
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+ vring->n_since_last_int += rv;
+
+ if (vring->n_since_last_int > dm->vhost_coalesce_frames)
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+ }
+
+ int c = rv;
+ while(c--)
+ rte_pktmbuf_free (tx_vector[tx_tail+c]);
+ }
+
+ n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
+ }
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_KNI)
+ {
+ if (PREDICT_TRUE(tx_head > tx_tail))
+ {
+ /* no wrap, transmit in one burst */
+ rv = rte_kni_tx_burst(xd->kni,
+ &tx_vector[tx_tail],
+ (uint16_t) (tx_head-tx_tail));
+ }
+ else
+ {
+ /*
+ * This can only happen if there is a flowcontrol callback.
+ * We need to split the transmit into two calls: one for
+ * the packets up to the wrap point, and one to continue
+ * at the start of the ring.
+ * Transmit pkts up to the wrap point.
+ */
+ rv = rte_kni_tx_burst(xd->kni,
+ &tx_vector[tx_tail],
+ (uint16_t) (DPDK_TX_RING_SIZE - tx_tail));
+
+ /*
+ * If we transmitted everything we wanted, then allow 1 retry
+ * so we can try to transmit the rest. If we didn't transmit
+ * everything, stop now.
+ */
+ n_retry = (rv == DPDK_TX_RING_SIZE - tx_tail) ? 1 : 0;
+ }
+ }
+ else
+ {
+ ASSERT(0);
+ rv = 0;
+ }
+
+ if (PREDICT_FALSE(xd->lockp != 0))
+ *xd->lockp = 0;
+
+ if (PREDICT_FALSE(rv < 0))
+ {
+ // emit non-fatal message, bump counter
+ vnet_main_t * vnm = dm->vnet_main;
+ vnet_interface_main_t * im = &vnm->interface_main;
+ u32 node_index;
+
+ node_index = vec_elt_at_index(im->hw_interfaces,
+ xd->vlib_hw_if_index)->tx_node_index;
+
+ vlib_error_count (vm, node_index, DPDK_TX_FUNC_ERROR_BAD_RETVAL, 1);
+ clib_warning ("rte_eth_tx_burst[%d]: error %d", xd->device_index, rv);
+ return n_packets; // untransmitted packets
+ }
+ ring->tx_tail += (u16)rv;
+ n_packets -= (uint16_t) rv;
+ } while (rv && n_packets && (n_retry>0));
+
+ return n_packets;
+}
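+
+/* Note on the ring arithmetic (illustrative): tx_head and tx_tail are
+   free-running unsigned counters. "counter % DPDK_TX_RING_SIZE" selects a
+   slot and "tx_head - tx_tail" is the outstanding packet count, which
+   remains correct across unsigned wrap-around as long as the ring never
+   holds more than DPDK_TX_RING_SIZE entries (asserted above). */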
+
+
+/*
+ * This function transmits any packets on the interface's tx_vector and returns
+ * the number of packets untransmitted on the tx_vector. If the tx_vector is
+ * empty the function simply returns 0.
+ *
+ * It is intended to be called by a traffic manager which has flowed-off an
+ * interface to see if the interface can be flowed-on again.
+ */
+u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ int queue_id;
+ struct rte_mbuf ** tx_vector;
+ tx_ring_hdr_t *ring;
+
+ /* param is dev_instance and not hw_if_index to save another lookup */
+ xd = vec_elt_at_index (dm->devices, dev_instance);
+
+ queue_id = vm->cpu_index;
+ tx_vector = xd->tx_vectors[queue_id];
+
+ /* If no packets on the ring, don't bother calling tx function */
+ ring = vec_header(tx_vector, sizeof(*ring));
+ if (ring->tx_head == ring->tx_tail)
+ {
+ return 0;
+ }
+
+ return tx_burst_vector_internal (vm, xd, tx_vector);
+}
+
+/*
+ * Transmits the packets on the frame to the interface associated with the
+ * node. It first copies packets on the frame to a tx_vector containing the
+ * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal
+ * which calls the dpdk tx_burst function.
+ *
+ * The tx_vector is treated slightly differently depending on whether or
+ * not a flowcontrol callback function has been configured. If there is no
+ * callback, the tx_vector is a temporary array of rte_mbuf packet pointers.
+ * Its entries are written and consumed before the function exits.
+ *
+ * If there is a callback then the transmit is being invoked in the presence
+ * of a traffic manager. Here the tx_vector is treated like a ring of rte_mbuf
+ * pointers. If not all packets can be transmitted, the untransmitted packets
+ * stay on the tx_vector until the next call. The callback allows the traffic
+ * manager to flow-off dequeues to the interface. The companion function
+ * dpdk_interface_tx_vector() allows the traffic manager to detect when
+ * it should flow-on the interface again.
+ */
+static uword
+dpdk_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, rd->dev_instance);
+ u32 n_packets = f->n_vectors;
+ u32 n_left;
+ u32 * from;
+ struct rte_mbuf ** tx_vector;
+ int i;
+ int queue_id;
+ u32 my_cpu;
+ u32 tx_pkts = 0;
+ tx_ring_hdr_t *ring;
+ u32 n_on_ring;
+
+ my_cpu = vm->cpu_index;
+
+ queue_id = my_cpu;
+
+ tx_vector = xd->tx_vectors[queue_id];
+ ring = vec_header(tx_vector, sizeof(*ring));
+
+ n_on_ring = ring->tx_head - ring->tx_tail;
+ from = vlib_frame_vector_args (f);
+
+ ASSERT(n_packets <= VLIB_FRAME_SIZE);
+
+ if (PREDICT_FALSE(n_on_ring + n_packets > DPDK_TX_RING_SIZE))
+ {
+ /*
+ * Overflowing the ring should never happen.
+ * If it does then drop the whole frame.
+ */
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_RING_FULL,
+ n_packets);
+
+ while (n_packets--)
+ {
+ u32 bi0 = from[n_packets];
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
+ struct rte_mbuf *mb0 = ((struct rte_mbuf *)b0) - 1;
+ rte_pktmbuf_free (mb0);
+ }
+ return n_on_ring;
+ }
+
+ if (PREDICT_FALSE(dm->tx_pcap_enable))
+ {
+ n_left = n_packets;
+ while (n_left > 0)
+ {
+ u32 bi0 = from[0];
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, bi0);
+ if (dm->pcap_sw_if_index == 0 ||
+ dm->pcap_sw_if_index == vnet_buffer(b0)->sw_if_index [VLIB_TX])
+ pcap_add_buffer (&dm->pcap_main, vm, bi0, 512);
+ from++;
+ n_left--;
+ }
+ }
+
+ from = vlib_frame_vector_args (f);
+ n_left = n_packets;
+ i = ring->tx_head % DPDK_TX_RING_SIZE;
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ u32 pi0, pi1;
+ struct rte_mbuf * mb0, * mb1;
+ struct rte_mbuf * prefmb0, * prefmb1;
+ vlib_buffer_t * b0, * b1;
+ vlib_buffer_t * pref0, * pref1;
+ i16 delta0, delta1;
+ u16 new_data_len0, new_data_len1;
+ u16 new_pkt_len0, new_pkt_len1;
+ u32 any_clone;
+
+ pi0 = from[2];
+ pi1 = from[3];
+ pref0 = vlib_get_buffer (vm, pi0);
+ pref1 = vlib_get_buffer (vm, pi1);
+
+ prefmb0 = ((struct rte_mbuf *)pref0) - 1;
+ prefmb1 = ((struct rte_mbuf *)pref1) - 1;
+
+ CLIB_PREFETCH(prefmb0, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH(pref0, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH(prefmb1, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH(pref1, CLIB_CACHE_LINE_BYTES, LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ from += 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ mb0 = ((struct rte_mbuf *)b0) - 1;
+ mb1 = ((struct rte_mbuf *)b1) - 1;
+
+ any_clone = b0->clone_count | b1->clone_count;
+ if (PREDICT_FALSE(any_clone != 0))
+ {
+ if (PREDICT_FALSE(b0->clone_count != 0))
+ {
+ struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
+ if (PREDICT_FALSE(mb0_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b0->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ mb0 = mb0_new;
+ vec_add1 (dm->recycle[my_cpu], bi0);
+ }
+ if (PREDICT_FALSE(b1->clone_count != 0))
+ {
+ struct rte_mbuf * mb1_new = dpdk_replicate_packet_mb (b1);
+ if (PREDICT_FALSE(mb1_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b1->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ mb1 = mb1_new;
+ vec_add1 (dm->recycle[my_cpu], bi1);
+ }
+ }
+
+ delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
+ vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
+ delta1 = PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
+ vlib_buffer_length_in_chain (vm, b1) - (i16) mb1->pkt_len;
+
+ new_data_len0 = (u16)((i16) mb0->data_len + delta0);
+ new_data_len1 = (u16)((i16) mb1->data_len + delta1);
+ new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
+ new_pkt_len1 = (u16)((i16) mb1->pkt_len + delta1);
+
+ b0->current_length = new_data_len0;
+ b1->current_length = new_data_len1;
+ mb0->data_len = new_data_len0;
+ mb1->data_len = new_data_len1;
+ mb0->pkt_len = new_pkt_len0;
+ mb1->pkt_len = new_pkt_len1;
+
+ mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
+ mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
+ mb1->data_off = (PREDICT_FALSE(b1->flags & VLIB_BUFFER_REPL_FAIL)) ?
+ mb1->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b1->current_data);
+
+ if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi1, b1);
+ }
+
+ if (PREDICT_TRUE(any_clone == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+ i++;
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
+ i++;
+ }
+ else
+ {
+ /* cloning was done, need to check for failure */
+ if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+ i++;
+ }
+ if (PREDICT_TRUE((b1->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb1;
+ i++;
+ }
+ }
+
+ n_left -= 2;
+ }
+ while (n_left > 0)
+ {
+ u32 bi0;
+ struct rte_mbuf * mb0;
+ vlib_buffer_t * b0;
+ i16 delta0;
+ u16 new_data_len0;
+ u16 new_pkt_len0;
+
+ bi0 = from[0];
+ from++;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ mb0 = ((struct rte_mbuf *)b0) - 1;
+ if (PREDICT_FALSE(b0->clone_count != 0))
+ {
+ struct rte_mbuf * mb0_new = dpdk_replicate_packet_mb (b0);
+ if (PREDICT_FALSE(mb0_new == 0))
+ {
+ vlib_error_count (vm, node->node_index,
+ DPDK_TX_FUNC_ERROR_REPL_FAIL, 1);
+ b0->flags |= VLIB_BUFFER_REPL_FAIL;
+ }
+ else
+ mb0 = mb0_new;
+ vec_add1 (dm->recycle[my_cpu], bi0);
+ }
+
+ delta0 = PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL) ? 0 :
+ vlib_buffer_length_in_chain (vm, b0) - (i16) mb0->pkt_len;
+
+ new_data_len0 = (u16)((i16) mb0->data_len + delta0);
+ new_pkt_len0 = (u16)((i16) mb0->pkt_len + delta0);
+
+ b0->current_length = new_data_len0;
+ mb0->data_len = new_data_len0;
+ mb0->pkt_len = new_pkt_len0;
+ mb0->data_off = (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL)) ?
+ mb0->data_off : (u16)(RTE_PKTMBUF_HEADROOM + b0->current_data);
+
+ if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ dpdk_tx_trace_buffer (dm, node, xd, queue_id, bi0, b0);
+
+ if (PREDICT_TRUE((b0->flags & VLIB_BUFFER_REPL_FAIL) == 0))
+ {
+ tx_vector[i % DPDK_TX_RING_SIZE] = mb0;
+ i++;
+ }
+ n_left--;
+ }
+
+ /* account for additional packets in the ring */
+ ring->tx_head += n_packets;
+ n_on_ring = ring->tx_head - ring->tx_tail;
+
+ /* transmit as many packets as possible; on return, n_packets is the
+ number of packets still untransmitted (left on the ring) */
+ n_packets = tx_burst_vector_internal (vm, xd, tx_vector);
+
+ /*
+ * tx_pkts is the number of packets successfully transmitted: the
+ * number originally on the ring minus the number remaining on it
+ */
+ tx_pkts = n_on_ring - n_packets;
+
+ if (PREDICT_FALSE(dm->flowcontrol_callback != 0))
+ {
+ if (PREDICT_FALSE(n_packets))
+ {
+ /* Callback may want to enable flowcontrol */
+ dm->flowcontrol_callback(vm, xd->vlib_hw_if_index, ring->tx_head - ring->tx_tail);
+ }
+ else
+ {
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+ }
+ else
+ {
+ /* If there is no callback then drop any non-transmitted packets */
+ if (PREDICT_FALSE(n_packets))
+ {
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, n_packets);
+
+ vlib_error_count (vm, node->node_index, DPDK_TX_FUNC_ERROR_PKT_DROP,
+ n_packets);
+
+ while (n_packets--)
+ rte_pktmbuf_free (tx_vector[ring->tx_tail + n_packets]);
+ }
+
+ /* Reset head/tail to avoid unnecessary wrap */
+ ring->tx_head = 0;
+ ring->tx_tail = 0;
+ }
+
+ /* Recycle replicated buffers */
+ if (PREDICT_FALSE(vec_len(dm->recycle[my_cpu])))
+ {
+ vlib_buffer_free (vm, dm->recycle[my_cpu], vec_len(dm->recycle[my_cpu]));
+ _vec_len(dm->recycle[my_cpu]) = 0;
+ }
+
+ ASSERT(ring->tx_head >= ring->tx_tail);
+
+ return tx_pkts;
+}
+
+static int dpdk_device_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ if (!xd || xd->dev_type != VNET_DPDK_DEV_VHOST_USER) {
+ clib_warning("cannot renumber non-vhost-user interface (sw_if_index: %d)",
+ hi->sw_if_index);
+ return 0;
+ }
+
+ xd->vu_if_id = new_dev_instance;
+ return 0;
+}
+
+static u8 * format_dpdk_device_name (u8 * s, va_list * args)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ char *devname_format;
+ char *device_name;
+ u32 i = va_arg (*args, u32);
+ struct rte_eth_dev_info dev_info;
+
+ if (dm->interface_name_format_decimal)
+ devname_format = "%s%d/%d/%d";
+ else
+ devname_format = "%s%x/%x/%x";
+
+ if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
+ return format(s, "kni%d", dm->devices[i].kni_port_id);
+ } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
+ return format(s, "VirtualEthernet0/0/%d", dm->devices[i].vu_if_id);
+ }
+ switch (dm->devices[i].port_type)
+ {
+ case VNET_DPDK_PORT_TYPE_ETH_1G:
+ device_name = "GigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_10G:
+ device_name = "TenGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_40G:
+ device_name = "FortyGigabitEthernet";
+ break;
+
+ case VNET_DPDK_PORT_TYPE_ETH_SWITCH:
+ device_name = "EthernetSwitch";
+ break;
+
+ #ifdef NETMAP
+ case VNET_DPDK_PORT_TYPE_NETMAP:
+ rte_eth_dev_info_get(i, &dev_info);
+ return format(s, "netmap:%s", dev_info.driver_name);
+ #endif
+
+ case VNET_DPDK_PORT_TYPE_AF_PACKET:
+ rte_eth_dev_info_get(i, &dev_info);
+ return format(s, "af_packet%d", dm->devices[i].af_packet_port_id);
+
+ default:
+ case VNET_DPDK_PORT_TYPE_UNKNOWN:
+ device_name = "UnknownEthernet";
+ break;
+ }
+
+ rte_eth_dev_info_get(i, &dev_info);
+ return format (s, devname_format, device_name, dev_info.pci_dev->addr.bus,
+ dev_info.pci_dev->addr.devid,
+ dev_info.pci_dev->addr.function);
+}
+
+static u8 * format_dpdk_device_type (u8 * s, va_list * args)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ char *dev_type;
+ u32 i = va_arg (*args, u32);
+
+ if (dm->devices[i].dev_type == VNET_DPDK_DEV_KNI) {
+ return format(s, "Kernel NIC Interface");
+ } else if (dm->devices[i].dev_type == VNET_DPDK_DEV_VHOST_USER) {
+ return format(s, "vhost-user interface");
+ }
+
+ switch (dm->devices[i].pmd)
+ {
+ case VNET_DPDK_PMD_E1000EM:
+ dev_type = "Intel 82540EM (e1000)";
+ break;
+
+ case VNET_DPDK_PMD_IGB:
+ dev_type = "Intel e1000";
+ break;
+
+ case VNET_DPDK_PMD_I40E:
+ dev_type = "Intel X710/XL710 Family";
+ break;
+
+ case VNET_DPDK_PMD_I40EVF:
+ dev_type = "Intel X710/XL710 Family VF";
+ break;
+
+ case VNET_DPDK_PMD_FM10K:
+ dev_type = "Intel FM10000 Family Ethernet Switch";
+ break;
+
+ case VNET_DPDK_PMD_IGBVF:
+ dev_type = "Intel e1000 VF";
+ break;
+
+ case VNET_DPDK_PMD_VIRTIO:
+ dev_type = "Red Hat Virtio";
+ break;
+
+ case VNET_DPDK_PMD_IXGBEVF:
+ dev_type = "Intel 82599 VF";
+ break;
+
+ case VNET_DPDK_PMD_IXGBE:
+ dev_type = "Intel 82599";
+ break;
+
+ case VNET_DPDK_PMD_VICE:
+ case VNET_DPDK_PMD_ENIC:
+ dev_type = "Cisco VIC";
+ break;
+
+ case VNET_DPDK_PMD_VMXNET3:
+ dev_type = "VMware VMXNET3";
+ break;
+
+#ifdef NETMAP
+ case VNET_DPDK_PMD_NETMAP:
+ dev_type = "Netmap/Vale";
+ break;
+#endif
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ dev_type = "af_packet";
+ break;
+
+ default:
+ case VNET_DPDK_PMD_UNKNOWN:
+ dev_type = "### UNKNOWN ###";
+ break;
+ }
+
+ return format (s, dev_type);
+}
+
+static u8 * format_dpdk_link_status (u8 * s, va_list * args)
+{
+ dpdk_device_t * xd = va_arg (*args, dpdk_device_t *);
+ struct rte_eth_link * l = &xd->link;
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
+
+ s = format (s, "%s ", l->link_status ? "up" : "down");
+ if (l->link_status)
+ {
+ u32 promisc = rte_eth_promiscuous_get (xd->device_index);
+
+ s = format (s, "%s duplex ", (l->link_duplex == ETH_LINK_FULL_DUPLEX) ?
+ "full" : "half");
+ s = format (s, "speed %u mtu %d %s\n", l->link_speed,
+ hi->max_packet_bytes, promisc ? " promisc" : "");
+ }
+ else
+ s = format (s, "\n");
+
+ return s;
+}
+
+#define _line_len 72
+#define _(v, str) \
+if (bitmap & v) { \
+ if (format_get_indent (s) > next_split ) { \
+ next_split += _line_len; \
+ s = format(s,"\n%U", format_white_space, indent); \
+ } \
+ s = format(s, "%s ", str); \
+}
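+
+/*
+ * For illustration: each (flag, name) pair in a foreach_* list expands the
+ * "_" macro above into a guarded append, wrapping to a fresh indented line
+ * once the output passes _line_len columns, roughly:
+ *
+ *   if (bitmap & ETH_RSS_IPV4) { ... s = format (s, "%s ", "ipv4"); }
+ *   if (bitmap & ETH_RSS_FRAG_IPV4) { ... s = format (s, "%s ", "ipv4-frag"); }
+ */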
+
+static u8 * format_dpdk_rss_hf_name(u8 * s, va_list * args)
+{
+ u64 bitmap = va_arg (*args, u64);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format(s, "none");
+
+ foreach_dpdk_rss_hf
+
+ return s;
+}
+
+static u8 * format_dpdk_rx_offload_caps(u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+
+ if (!bitmap)
+ return format(s, "none");
+
+ foreach_dpdk_rx_offload_caps
+
+ return s;
+}
+
+static u8 * format_dpdk_tx_offload_caps(u8 * s, va_list * args)
+{
+ u32 bitmap = va_arg (*args, u32);
+ int next_split = _line_len;
+ int indent = format_get_indent (s);
+ if (!bitmap)
+ return format(s, "none");
+
+ foreach_dpdk_tx_offload_caps
+
+ return s;
+}
+
+#undef _line_len
+#undef _
+
+static u8 * format_dpdk_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ int verbose = va_arg (*args, int);
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, dev_instance);
+ uword indent = format_get_indent (s);
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+
+ s = format (s, "%U\n%Ucarrier %U",
+ format_dpdk_device_type, xd->device_index,
+ format_white_space, indent + 2,
+ format_dpdk_link_status, xd);
+
+ if (verbose > 1 && xd->dev_type == VNET_DPDK_DEV_ETH)
+ {
+ struct rte_eth_dev_info di;
+ struct rte_pci_device * pci;
+ struct rte_eth_rss_conf rss_conf;
+ int vlan_off;
+
+ rss_conf.rss_key = 0;
+ rte_eth_dev_info_get(xd->device_index, &di);
+ rte_eth_dev_rss_hash_conf_get(xd->device_index, &rss_conf);
+ pci = di.pci_dev;
+
+ s = format(s, "%Upci id: device %04x:%04x subsystem %04x:%04x\n"
+ "%Upci address: %04x:%02x:%02x.%02x\n",
+ format_white_space, indent + 2,
+ pci->id.vendor_id, pci->id.device_id,
+ pci->id.subsystem_vendor_id,
+ pci->id.subsystem_device_id,
+ format_white_space, indent + 2,
+ pci->addr.domain, pci->addr.bus,
+ pci->addr.devid, pci->addr.function);
+ s = format(s, "%Umax rx packet len: %d\n",
+ format_white_space, indent + 2, di.max_rx_pktlen);
+ s = format(s, "%Upromiscuous: unicast %s all-multicast %s\n",
+ format_white_space, indent + 2,
+ rte_eth_promiscuous_get(xd->device_index) ? "on" : "off",
+ rte_eth_promiscuous_get(xd->device_index) ? "on" : "off");
+ vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index);
+ s = format(s, "%Uvlan offload: strip %s filter %s qinq %s\n",
+ format_white_space, indent + 2,
+ vlan_off & ETH_VLAN_STRIP_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_FILTER_OFFLOAD ? "on" : "off",
+ vlan_off & ETH_VLAN_EXTEND_OFFLOAD ? "on" : "off");
+ s = format(s, "%Uqueue size (max): rx %d (%d) tx %d (%d)\n",
+ format_white_space, indent + 2,
+ xd->rx_q_used, di.max_rx_queues,
+ xd->tx_q_used, di.max_tx_queues);
+ s = format(s, "%Urx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rx_offload_caps, di.rx_offload_capa);
+ s = format(s, "%Utx offload caps: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_tx_offload_caps, di.tx_offload_capa);
+ s = format(s, "%Urss active: %U\n"
+ "%Urss supported: %U\n",
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, rss_conf.rss_hf,
+ format_white_space, indent + 2,
+ format_dpdk_rss_hf_name, di.flow_type_rss_offloads);
+ }
+
+ if (xd->cpu_socket > -1)
+ s = format (s, "%Ucpu socket %d",
+ format_white_space, indent + 2,
+ xd->cpu_socket);
+
+ /* $$$ MIB counters */
+
+ {
+#define _(N, V) \
+ if (xd->stats.V != 0) \
+ s = format (s, "\n%U%-40U%16Ld", \
+ format_white_space, indent + 2, \
+ format_c_identifier, #N, xd->stats.V);
+
+ foreach_dpdk_counter
+#undef _
+ }
+
+ u8 * xs = 0;
+ struct rte_eth_xstats * xstat;
+
+ vec_foreach(xstat, xd->xstats)
+ {
+ if (xstat->value)
+ {
+ /* format_c_identifier doesn't like C strings inside a vector */
+ u8 * name = format(0,"%s", xstat->name);
+ xs = format(xs, "\n%U%-38U%16Ld",
+ format_white_space, indent + 4,
+ format_c_identifier, name, xstat->value);
+ vec_free(name);
+ }
+ }
+
+ if (xs)
+ {
+ s = format(s, "\n%Uextended stats:%v",
+ format_white_space, indent + 2, xs);
+ vec_free(xs);
+ }
+
+ return s;
+}
+
+static u8 * format_dpdk_tx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
+ dpdk_tx_dma_trace_t * t = va_arg (*va, dpdk_tx_dma_trace_t *);
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index);
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U tx queue %d",
+ format_vnet_sw_interface_name, vnm, sw,
+ t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index,
+ format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_ethernet_header_with_length, t->buffer.pre_data,
+ sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+static void dpdk_clear_hw_interface_counters (u32 instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, instance);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->admin_up != 0xff)
+ {
+ rte_eth_stats_reset (xd->device_index);
+ memset (&xd->last_stats, 0, sizeof (xd->last_stats));
+ dpdk_update_counters (xd, vlib_time_now (dm->vlib_main));
+ }
+ else
+ {
+ rte_eth_stats_reset (xd->device_index);
+ memset(&xd->stats, 0, sizeof(xd->stats));
+ memset (&xd->last_stats, 0, sizeof (xd->last_stats));
+ }
+ rte_eth_xstats_reset(xd->device_index);
+}
+
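+/*
+ * Callbacks handed to rte_kni_alloc() via struct rte_kni_ops; the KNI
+ * library invokes them when the kernel end of the interface is brought
+ * up/down or its MTU is changed.
+ */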
+static int
+kni_config_network_if(u8 port_id, u8 if_up)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword *p;
+
+ p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
+ if (p == 0) {
+ clib_warning("unknown interface");
+ return 0;
+ } else {
+ xd = vec_elt_at_index (dm->devices, p[0]);
+ }
+
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
+ if_up ? VNET_HW_INTERFACE_FLAG_LINK_UP |
+ ETH_LINK_FULL_DUPLEX : 0);
+ return 0;
+}
+
+static int
+kni_change_mtu(u8 port_id, unsigned new_mtu)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword *p;
+ vnet_hw_interface_t * hif;
+
+ p = hash_get (dm->dpdk_device_by_kni_port_id, port_id);
+ if (p == 0) {
+ clib_warning("unknown interface");
+ return 0;
+ } else {
+ xd = vec_elt_at_index (dm->devices, p[0]);
+ }
+ hif = vnet_get_hw_interface (vnm, xd->vlib_hw_if_index);
+
+ hif->max_packet_bytes = new_mtu;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hif->dev_instance);
+ int rv = 0;
+
+ if (xd->dev_type == VNET_DPDK_DEV_KNI)
+ {
+ if (is_up)
+ {
+ struct rte_kni_conf conf;
+ struct rte_kni_ops ops;
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ memset(&conf, 0, sizeof(conf));
+ snprintf(conf.name, RTE_KNI_NAMESIZE, "vpp%u", xd->kni_port_id);
+ conf.mbuf_size = MBUF_SIZE;
+ memset(&ops, 0, sizeof(ops));
+ ops.port_id = xd->kni_port_id;
+ ops.change_mtu = kni_change_mtu;
+ ops.config_network_if = kni_config_network_if;
+
+ xd->kni = rte_kni_alloc(bm->pktmbuf_pools[rte_socket_id()], &conf, &ops);
+ if (!xd->kni)
+ {
+ clib_warning("failed to allocate kni interface");
+ }
+ else
+ {
+ hif->max_packet_bytes = 1500; /* kni interface default value */
+ xd->admin_up = 1;
+ }
+ }
+ else
+ {
+ xd->admin_up = 0;
+ rte_kni_release(xd->kni);
+ }
+ return 0;
+ }
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ {
+ if (is_up)
+ {
+ if (xd->vu_is_running)
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP |
+ ETH_LINK_FULL_DUPLEX );
+ xd->admin_up = 1;
+ }
+ else
+ {
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+ xd->admin_up = 0;
+ }
+
+ return 0;
+ }
+
+
+ if (is_up)
+ {
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->admin_up == 0)
+ rv = rte_eth_dev_start (xd->device_index);
+
+ if (xd->promisc)
+ rte_eth_promiscuous_enable(xd->device_index);
+ else
+ rte_eth_promiscuous_disable(xd->device_index);
+
+ rte_eth_allmulticast_enable (xd->device_index);
+ xd->admin_up = 1;
+ dpdk_update_counters (xd, now);
+ dpdk_update_link_state (xd, now);
+ }
+ else
+ {
+ rte_eth_allmulticast_disable (xd->device_index);
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->pmd != VNET_DPDK_PMD_VMXNET3)
+ {
+ rte_eth_dev_stop (xd->device_index);
+ xd->admin_up = 0;
+ }
+ else
+ xd->admin_up = ~0;
+ }
+
+ if (rv < 0)
+ clib_warning ("rte_eth_dev_%s error: %d", is_up ? "start" : "stop",
+ rv);
+
+ return /* no error */ 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void dpdk_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ dpdk_main_t * xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ xd->per_interface_next_index = node_index;
+ return;
+ }
+
+ xd->per_interface_next_index =
+ vlib_node_add_next (xm->vlib_main, dpdk_input_node.index, node_index);
+}
+
+
+static clib_error_t *
+dpdk_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t * st,
+ int is_add)
+{
+ dpdk_main_t * xm = &dpdk_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t * xd = vec_elt_at_index (xm->devices, hw->dev_instance);
+ vnet_sw_interface_t * t = (vnet_sw_interface_t *) st;
+ int r, vlan_offload;
+
+
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return 0;
+ /* currently we program VLANS only for IXGBE VF */
+ if (xd->pmd != VNET_DPDK_PMD_IXGBEVF)
+ return 0;
+
+ if (t->sub.eth.flags.no_tags == 1)
+ return 0;
+
+ if ((t->sub.eth.flags.one_tag != 1) || (t->sub.eth.flags.exact_match != 1 ))
+ return clib_error_return (0, "unsupported VLAN setup");
+
+
+ vlan_offload = rte_eth_dev_get_vlan_offload(xd->device_index);
+ vlan_offload |= ETH_VLAN_FILTER_OFFLOAD;
+
+ if ((r = rte_eth_dev_set_vlan_offload(xd->device_index, vlan_offload)))
+ return clib_error_return (0, "rte_eth_dev_set_vlan_offload[%d]: err %d",
+ xd->device_index, r);
+
+
+ if ((r = rte_eth_dev_vlan_filter(xd->device_index, t->sub.eth.outer_vlan_id, is_add)))
+ return clib_error_return (0, "rte_eth_dev_vlan_filter[%d]: err %d",
+ xd->device_index, r);
+
+ return 0;
+}
+
+VNET_DEVICE_CLASS (dpdk_device_class) = {
+ .name = "dpdk",
+ .tx_function = dpdk_interface_tx,
+ .tx_function_n_errors = DPDK_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = dpdk_tx_func_error_strings,
+ .format_device_name = format_dpdk_device_name,
+ .format_device = format_dpdk_device,
+ .format_tx_trace = format_dpdk_tx_dma_trace,
+ .clear_counters = dpdk_clear_hw_interface_counters,
+ .admin_up_down_function = dpdk_interface_admin_up_down,
+ .subif_add_del_function = dpdk_subif_add_del_function,
+ .rx_redirect_to_node = dpdk_set_interface_next_node,
+ .no_flatten_output_chains = 1,
+ .name_renumber = dpdk_device_renumber,
+};
+
+void dpdk_set_flowcontrol_callback (vlib_main_t *vm,
+ dpdk_flowcontrol_callback_t callback)
+{
+ dpdk_main.flowcontrol_callback = callback;
+}
+
+#define UP_DOWN_FLAG_EVENT 1
+
+
+u32 dpdk_get_admin_up_down_in_progress (void)
+{
+ return dpdk_main.admin_up_down_in_progress;
+}
+
+static uword
+admin_up_down_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ clib_error_t * error = 0;
+ uword event_type;
+ uword *event_data = 0;
+ u32 index;
+ u32 sw_if_index;
+ u32 flags;
+
+ while (1)
+ {
+ vlib_process_wait_for_event (vm);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ dpdk_main.admin_up_down_in_progress = 1;
+
+ for (index=0; index<vec_len(event_data); index++)
+ {
+ sw_if_index = event_data[index] >> 32;
+ flags = (u32) event_data[index];
+
+ switch (event_type) {
+ case UP_DOWN_FLAG_EVENT:
+ error = vnet_sw_interface_set_flags (vnet_get_main(), sw_if_index, flags);
+ clib_error_report(error);
+ break;
+ }
+ }
+
+ vec_reset_length (event_data);
+
+ dpdk_main.admin_up_down_in_progress = 0;
+
+ }
+ return 0; /* or not */
+}
+
+VLIB_REGISTER_NODE (admin_up_down_process_node,static) = {
+ .function = admin_up_down_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "admin-up-down-process",
+ .process_log2_n_stack_bytes = 17, // 128KB
+};
+
+/*
+ * Asynchronously invoke vnet_sw_interface_set_flags via the admin_up_down
+ * process. Useful for avoiding long blocking delays (>150ms) in the dpdk
+ * drivers.
+ * WARNING: when posting this event, no other interface-related calls should
+ * be made (e.g. vnet_create_sw_interface()) while the event is being
+ * processed (admin_up_down_in_progress). This is required in order to avoid
+ * race conditions in manipulating interface data structures.
+ */
+void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags)
+{
+ vlib_process_signal_event
+ (vm, admin_up_down_process_node.index,
+ UP_DOWN_FLAG_EVENT,
+ (((uword)sw_if_index << 32) | flags));
+}
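+
+/*
+ * Usage sketch: bring a (hypothetical) sw_if_index 5 admin-up without
+ * blocking the caller; the event word packs the interface index into the
+ * upper 32 bits and the flags into the lower 32, matching the unpacking
+ * in admin_up_down_process():
+ *
+ *   post_sw_interface_set_flags (vlib_get_main (), 5,
+ *                                VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ */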
+
+/*
+ * Called by the dpdk driver's rte_delay_us() function.
+ * Return 0 to have the dpdk do a regular delay loop.
+ * Return 1 to skip the delay loop because we are suspending
+ * the calling vlib process instead.
+ */
+int rte_delay_us_override (unsigned us) {
+ vlib_main_t * vm;
+
+ /* Don't bother intercepting for short delays */
+ if (us < 10) return 0;
+
+ /*
+ * Only intercept if we are in a vlib process.
+ * If we are called from a vlib worker thread or the vlib main
+ * thread then do not intercept. (Must not be called from an
+ * independent pthread).
+ */
+ if (os_get_cpu_number() == 0)
+ {
+ /*
+ * We're in the vlib main thread or a vlib process. Make sure
+ * the process is running and we're not still initializing.
+ */
+ vm = vlib_get_main();
+ if (vlib_in_process_context(vm))
+ {
+ /* Only suspend for the admin_up_down_process */
+ vlib_process_t * proc = vlib_get_current_process(vm);
+ if (!(proc->flags & VLIB_PROCESS_IS_RUNNING) ||
+ (proc->node_runtime.function != admin_up_down_process))
+ return 0;
+
+ f64 delay = 1e-6 * us;
+ vlib_process_suspend(vm, delay);
+ return 1;
+ }
+ }
+ return 0; // no override
+}
diff --git a/vnet/vnet/devices/dpdk/dpdk.h b/vnet/vnet/devices/dpdk/dpdk.h
new file mode 100644
index 00000000000..fd984e4d4df
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/dpdk.h
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_h__
+#define __included_dpdk_h__
+
+/* $$$$ We should rename always_inline -> clib_always_inline */
+#undef always_inline
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_dev.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memcpy.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_branch_prediction.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_random.h>
+#include <rte_debug.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_kni.h>
+#include <rte_virtio_net.h>
+#include <rte_pci_dev_ids.h>
+#include <rte_version.h>
+
+#include <vnet/unix/pcap.h>
+#include <vnet/devices/virtio/vhost-user.h>
+
+#if CLIB_DEBUG > 0
+#define always_inline static inline
+#else
+#define always_inline static inline __attribute__ ((__always_inline__))
+#endif
+
+#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
+#define NB_MBUF (32<<10)
+
+vnet_device_class_t dpdk_device_class;
+vlib_node_registration_t dpdk_input_node;
+vlib_node_registration_t dpdk_io_input_node;
+vlib_node_registration_t handoff_dispatch_node;
+
+typedef enum {
+ VNET_DPDK_DEV_ETH = 1, /* Standard DPDK PMD driver */
+ VNET_DPDK_DEV_KNI, /* Kernel NIC Interface */
+ VNET_DPDK_DEV_VHOST_USER,
+ VNET_DPDK_DEV_UNKNOWN, /* must be last */
+} dpdk_device_type_t;
+
+#define foreach_dpdk_pmd \
+ _ ("rte_em_pmd", E1000EM) \
+ _ ("rte_igb_pmd", IGB) \
+ _ ("rte_igbvf_pmd", IGBVF) \
+ _ ("rte_ixgbe_pmd", IXGBE) \
+ _ ("rte_ixgbevf_pmd", IXGBEVF) \
+ _ ("rte_i40e_pmd", I40E) \
+ _ ("rte_i40evf_pmd", I40EVF) \
+ _ ("rte_virtio_pmd", VIRTIO) \
+ _ ("rte_vice_pmd", VICE) \
+ _ ("rte_enic_pmd", ENIC) \
+ _ ("rte_vmxnet3_pmd", VMXNET3) \
+ _ ("AF_PACKET PMD", AF_PACKET) \
+ _ ("rte_pmd_fm10k", FM10K)
+
+typedef enum {
+ VNET_DPDK_PMD_NONE,
+#define _(s,f) VNET_DPDK_PMD_##f,
+ foreach_dpdk_pmd
+#undef _
+#ifdef NETMAP
+ VNET_DPDK_PMD_NETMAP,
+#endif
+ VNET_DPDK_PMD_UNKNOWN, /* must be last */
+} dpdk_pmd_t;
+
+typedef enum {
+ VNET_DPDK_PORT_TYPE_ETH_1G,
+ VNET_DPDK_PORT_TYPE_ETH_10G,
+ VNET_DPDK_PORT_TYPE_ETH_40G,
+ VNET_DPDK_PORT_TYPE_ETH_SWITCH,
+#ifdef NETMAP
+ VNET_DPDK_PORT_TYPE_NETMAP,
+#endif
+ VNET_DPDK_PORT_TYPE_AF_PACKET,
+ VNET_DPDK_PORT_TYPE_UNKNOWN,
+} dpdk_port_type_t;
+
+typedef struct {
+ f64 deadline;
+ vlib_frame_t * frame;
+} dpdk_frame_t;
+
+#define DPDK_EFD_MAX_DISCARD_RATE 10
+
+typedef struct {
+ u16 last_burst_sz;
+ u16 max_burst_sz;
+ u32 full_frames_cnt;
+ u32 consec_full_frames_cnt;
+ u32 congestion_cnt;
+ u64 last_poll_time;
+ u64 max_poll_delay;
+ u32 discard_cnt;
+ u32 total_packet_cnt;
+} dpdk_efd_agent_t;
+
+typedef struct {
+ int callfd;
+ int kickfd;
+ int errfd;
+ u32 callfd_idx;
+ u32 n_since_last_int;
+ f64 int_deadline;
+} dpdk_vu_vring;
+
+typedef struct {
+ u32 is_up;
+ u32 unix_fd;
+ u32 unix_file_index;
+ u32 client_fd;
+ char sock_filename[256];
+ int sock_errno;
+ u8 sock_is_server;
+ u8 active;
+
+ u64 feature_mask;
+ u32 num_vrings;
+ dpdk_vu_vring vrings[2];
+ u64 region_addr[VHOST_MEMORY_MAX_NREGIONS];
+ u32 region_fd[VHOST_MEMORY_MAX_NREGIONS];
+} dpdk_vu_intf_t;
+
+typedef void (*dpdk_flowcontrol_callback_t) (vlib_main_t *vm,
+ u32 hw_if_index,
+ u32 n_packets);
+
+/*
+ * The header for the tx_vector in dpdk_device_t.
+ * Head and tail are free-running indexes into the tx_vector; they are
+ * u64 so that in practice they never wrap.
+ */
+typedef struct {
+ u64 tx_head;
+ u64 tx_tail;
+} tx_ring_hdr_t;
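+
+/*
+ * For illustration (names hypothetical): head and tail run freely and ring
+ * slots are always reached modulo DPDK_TX_RING_SIZE, so the counters are
+ * never masked and their difference stays valid across wrap:
+ *
+ *   tx_ring_hdr_t *ring = vec_header (tx_vector, sizeof (*ring));
+ *   struct rte_mbuf *next = tx_vector[ring->tx_tail % DPDK_TX_RING_SIZE];
+ *   u64 n_on_ring = ring->tx_head - ring->tx_tail;
+ */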
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ volatile u32 *lockp;
+
+ /* Instance ID */
+ u32 device_index;
+
+ u32 vlib_hw_if_index;
+ u32 vlib_sw_if_index;
+
+ /* next node index if we decide to steal the rx graph arc */
+ u32 per_interface_next_index;
+
+ /* dpdk rte_mbuf rx and tx vectors, VLIB_FRAME_SIZE */
+ struct rte_mbuf *** tx_vectors; /* one per worker thread */
+ struct rte_mbuf *** rx_vectors;
+
+ /* vector of traced contexts, per device */
+ u32 * d_trace_buffers;
+
+ /* per-worker destination frame queue */
+ dpdk_frame_t * frames;
+
+ dpdk_device_type_t dev_type:8;
+ dpdk_pmd_t pmd:8;
+ i8 cpu_socket;
+
+ u8 admin_up;
+ u8 promisc;
+
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline1);
+
+ /* PMD related */
+ u16 tx_q_used;
+ u16 rx_q_used;
+ u16 nb_rx_desc;
+ u16 nb_tx_desc;
+ u16 * cpu_socket_id_by_queue;
+ struct rte_eth_conf port_conf;
+ struct rte_eth_txconf tx_conf;
+
+ /* KNI related */
+ struct rte_kni *kni;
+ u8 kni_port_id;
+
+ /* vhost-user related */
+ u32 vu_if_id;
+ struct virtio_net vu_vhost_dev;
+ u32 vu_is_running;
+ dpdk_vu_intf_t *vu_intf;
+
+ /* af_packet */
+ u8 af_packet_port_id;
+
+ struct rte_eth_link link;
+ f64 time_last_link_update;
+
+ struct rte_eth_stats stats;
+ struct rte_eth_stats last_stats;
+ struct rte_eth_xstats * xstats;
+ f64 time_last_stats_update;
+ dpdk_port_type_t port_type;
+
+ dpdk_efd_agent_t efd_agent;
+} dpdk_device_t;
+
+#define MAX_NELTS 32
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ u64 head;
+ u64 head_hint;
+ u64 tail;
+ u32 n_in_use;
+ u32 nelts;
+ u32 written;
+ u32 threshold;
+ i32 n_vectors[MAX_NELTS];
+} frame_queue_trace_t;
+
+#define DPDK_TX_RING_SIZE (4 * 1024)
+
+#define DPDK_STATS_POLL_INTERVAL 10.0
+#define DPDK_LINK_POLL_INTERVAL 3.0
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+
+ /* total input packet counter */
+ u64 aggregate_rx_packets;
+} dpdk_worker_t;
+
+typedef struct {
+ u32 device;
+ u16 queue_id;
+} dpdk_device_and_queue_t;
+
+/* Early-Fast-Discard (EFD) */
+#define DPDK_EFD_DISABLED 0
+#define DPDK_EFD_DISCARD_ENABLED (1 << 0)
+#define DPDK_EFD_MONITOR_ENABLED (1 << 1)
+#define DPDK_EFD_DROPALL_ENABLED (1 << 2)
+
+#define DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT 90
+#define DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH 6
+
+typedef struct dpdk_efd_t {
+ u16 enabled;
+ u16 queue_hi_thresh;
+ u16 consec_full_frames_hi_thresh;
+ u16 pad;
+} dpdk_efd_t;
+
+typedef struct {
+
+ /* Devices */
+ dpdk_device_t * devices;
+ dpdk_device_and_queue_t ** devices_by_cpu;
+
+ /* per-thread recycle lists */
+ u32 ** recycle;
+
+ /* flow control callback. If 0 then flow control is disabled */
+ dpdk_flowcontrol_callback_t flowcontrol_callback;
+
+ /* vlib buffer free list, must be same size as an rte_mbuf */
+ u32 vlib_buffer_free_list_index;
+
+ /*
+ * format interface names a la xxxEthernet%d/%d/%d instead of
+ * xxxEthernet%x/%x/%x. For VIRL.
+ */
+ u8 interface_name_format_decimal;
+
+
+ /* dpdk worker "threads" */
+ dpdk_worker_t * workers;
+
+ /* Config stuff */
+ u8 ** eal_init_args;
+ u8 * eth_if_blacklist;
+ u8 * eth_if_whitelist;
+ u8 * uio_driver_name;
+ u8 no_multi_seg;
+
+ /* Required config parameters */
+ u8 coremask_set_manually;
+ u8 nchannels_set_manually;
+ u32 coremask;
+ u32 nchannels;
+ u32 num_mbufs;
+ u32 use_rss;
+ u8 num_kni; /* while kni_init allows u32, port_id in callback fn is only u8 */
+
+ /* Ethernet input node index */
+ u32 ethernet_input_node_index;
+
+ /* dpdk i/o thread initialization barrier */
+ volatile u32 io_thread_release;
+
+ /* pcap tracing [only works if (CLIB_DEBUG > 0)] */
+ int tx_pcap_enable;
+ pcap_main_t pcap_main;
+ u8 * pcap_filename;
+ u32 pcap_sw_if_index;
+ u32 pcap_pkts_to_capture;
+
+ /* virtio vhost-user switch */
+ u8 use_virtio_vhost;
+
+ /* vhost-user interrupt coalescing config */
+ u32 vhost_coalesce_frames;
+ f64 vhost_coalesce_time;
+
+ /* hashes */
+ uword * dpdk_device_by_kni_port_id;
+ uword * vu_sw_if_index_by_listener_fd;
+ uword * vu_sw_if_index_by_sock_fd;
+ u32 * vu_inactive_interfaces_device_index;
+
+ u32 next_vu_if_id;
+
+ /* efd (early-fast-discard) settings */
+ dpdk_efd_t efd;
+
+ /*
+ * flag indicating that a posted admin up/down
+ * (via post_sw_interface_set_flags) is in progress
+ */
+ u8 admin_up_down_in_progress;
+
+ u8 have_io_threads;
+
+ /* which cpus are running dpdk-input */
+ int input_cpu_first_index;
+ int input_cpu_count;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} dpdk_main_t;
+
+dpdk_main_t dpdk_main;
+
+typedef enum {
+ DPDK_RX_NEXT_IP4_INPUT,
+ DPDK_RX_NEXT_IP6_INPUT,
+ DPDK_RX_NEXT_MPLS_INPUT,
+ DPDK_RX_NEXT_ETHERNET_INPUT,
+ DPDK_RX_NEXT_DROP,
+ DPDK_RX_N_NEXT,
+} dpdk_rx_next_t;
+
+void vnet_buffer_needs_dpdk_mb (vlib_buffer_t * b);
+
+void dpdk_set_next_node (dpdk_rx_next_t, char *);
+
+typedef void (*dpdk_io_thread_callback_t) (vlib_main_t *vm);
+
+void dpdk_io_thread (vlib_worker_thread_t * w,
+ u32 instances,
+ u32 instance_id,
+ char *worker_name,
+ dpdk_io_thread_callback_t callback);
+void dpdk_thread_input (dpdk_main_t * dm, dpdk_device_t * xd);
+
+clib_error_t * dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd);
+
+void dpdk_set_flowcontrol_callback (vlib_main_t *vm,
+ dpdk_flowcontrol_callback_t callback);
+
+u32 dpdk_interface_tx_vector (vlib_main_t * vm, u32 dev_instance);
+
+vlib_frame_queue_elt_t * vlib_get_handoff_queue_elt (u32 vlib_worker_index);
+
+u32 dpdk_get_handoff_node_index (void);
+
+void set_efd_bitmap (u8 *bitmap, u32 value, u32 op);
+
+#define foreach_dpdk_error \
+ _(NONE, "no error") \
+ _(RX_PACKET_ERROR, "Rx packet errors") \
+ _(RX_BAD_FCS, "Rx bad fcs") \
+ _(L4_CHECKSUM_ERROR, "Rx L4 checksum errors") \
+ _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \
+ _(RX_ALLOC_FAIL, "rx buf alloc from free list failed") \
+ _(RX_ALLOC_NO_PHYSMEM, "rx buf alloc failed no physmem") \
+ _(RX_ALLOC_DROP_PKTS, "rx packets dropped due to alloc error") \
+ _(IPV4_EFD_DROP_PKTS, "IPV4 Early Fast Discard rx drops") \
+ _(IPV6_EFD_DROP_PKTS, "IPV6 Early Fast Discard rx drops") \
+ _(MPLS_EFD_DROP_PKTS, "MPLS Early Fast Discard rx drops") \
+ _(VLAN_EFD_DROP_PKTS, "VLAN Early Fast Discard rx drops")
+
+typedef enum {
+#define _(f,s) DPDK_ERROR_##f,
+ foreach_dpdk_error
+#undef _
+ DPDK_N_ERROR,
+} dpdk_error_t;
+
+/*
+ * Increment EFD drop counter
+ */
+static_always_inline
+void increment_efd_drop_counter (vlib_main_t * vm, u32 counter_index, u32 count)
+{
+ vlib_node_t *my_n;
+
+ my_n = vlib_get_node (vm, dpdk_input_node.index);
+ vm->error_main.counters[my_n->error_heap_index+counter_index] += count;
+}
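+
+/*
+ * Usage sketch: charge two early-fast-discarded packets to the IPv4 EFD
+ * drop counter declared via foreach_dpdk_error above:
+ *
+ *   increment_efd_drop_counter (vm, DPDK_ERROR_IPV4_EFD_DROP_PKTS, 2);
+ */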
+
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now);
+void dpdk_efd_update_counters(dpdk_device_t *xd, u32 n_buffers, u16 enabled);
+u32 is_efd_discardable(vlib_thread_main_t *tm,
+ vlib_buffer_t * b0,
+ struct rte_mbuf *mb);
+
+/* dpdk vhost-user interrupt management */
+u8 dpdk_vhost_user_want_interrupt (dpdk_device_t *xd, int idx);
+void dpdk_vhost_user_send_interrupt (vlib_main_t * vm, dpdk_device_t * xd,
+ int idx);
+
+
+static inline u64 vnet_get_aggregate_rx_packets (void)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u64 sum = 0;
+ dpdk_worker_t * dw;
+
+ vec_foreach(dw, dm->workers)
+ sum += dw->aggregate_rx_packets;
+
+ return sum;
+}
+
+void dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id,
+ u32 * buffers,
+ uword n_buffers);
+
+#define EFD_OPERATION_LESS_THAN 0
+#define EFD_OPERATION_GREATER_OR_EQUAL 1
+
+void efd_config(u32 enabled,
+ u32 ip_prec, u32 ip_op,
+ u32 mpls_exp, u32 mpls_op,
+ u32 vlan_cos, u32 vlan_op);
+
+void post_sw_interface_set_flags (vlib_main_t *vm, u32 sw_if_index, u32 flags);
+
+typedef struct vhost_user_memory vhost_user_memory_t;
+
+void dpdk_vhost_user_process_init (void **ctx);
+void dpdk_vhost_user_process_cleanup (void *ctx);
+uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx);
+
+// vhost-user calls
+int dpdk_vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int dpdk_vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int dpdk_vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index);
+int dpdk_vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_details_t **out_vuids);
+
+u32 dpdk_get_admin_up_down_in_progress (void);
+
+uword
+dpdk_input_rss (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f);
+
+#endif /* __included_dpdk_h__ */
diff --git a/vnet/vnet/devices/dpdk/dpdk_priv.h b/vnet/vnet/devices/dpdk/dpdk_priv.h
new file mode 100644
index 00000000000..e452e02d90d
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/dpdk_priv.h
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define DPDK_NB_RX_DESC_DEFAULT 512
+#define DPDK_NB_TX_DESC_DEFAULT 512
+#define DPDK_NB_RX_DESC_VIRTIO 256
+#define DPDK_NB_TX_DESC_VIRTIO 256
+#define DPDK_NB_RX_DESC_10GE 2048
+#define DPDK_NB_TX_DESC_10GE 2048
+#define DPDK_NB_RX_DESC_40GE (4096-128)
+#define DPDK_NB_TX_DESC_40GE 2048
+
+#define foreach_dpdk_counter \
+ _ (tx_frames_ok, opackets) \
+ _ (tx_bytes_ok, obytes) \
+ _ (tx_errors, oerrors) \
+ _ (tx_loopback_frames_ok, olbpackets) \
+ _ (tx_loopback_bytes_ok, olbbytes) \
+ _ (rx_frames_ok, ipackets) \
+ _ (rx_bytes_ok, ibytes) \
+ _ (rx_errors, ierrors) \
+ _ (rx_missed, imissed) \
+ _ (rx_bad_crc, ibadcrc) \
+ _ (rx_bad_length, ibadlen) \
+ _ (rx_multicast_frames_ok, imcasts) \
+ _ (rx_no_bufs, rx_nombuf) \
+ _ (rx_filter_match, fdirmatch) \
+ _ (rx_filter_miss, fdirmiss) \
+ _ (tx_pause_xon, tx_pause_xon) \
+ _ (rx_pause_xon, rx_pause_xon) \
+ _ (tx_pause_xoff, tx_pause_xoff) \
+ _ (rx_pause_xoff, rx_pause_xoff) \
+ _ (rx_loopback_frames_ok, ilbpackets) \
+ _ (rx_loopback_bytes_ok, ilbbytes)
+
+#define foreach_dpdk_q_counter \
+ _ (rx_frames_ok, q_ipackets) \
+ _ (tx_frames_ok, q_opackets) \
+ _ (rx_bytes_ok, q_ibytes) \
+ _ (tx_bytes_ok, q_obytes) \
+ _ (rx_errors, q_errors)
+
+#define foreach_dpdk_rss_hf \
+ _(ETH_RSS_IPV4, "ipv4") \
+ _(ETH_RSS_FRAG_IPV4, "ipv4-frag") \
+ _(ETH_RSS_NONFRAG_IPV4_TCP, "ipv4-tcp") \
+ _(ETH_RSS_NONFRAG_IPV4_UDP, "ipv4-udp") \
+ _(ETH_RSS_NONFRAG_IPV4_SCTP, "ipv4-sctp") \
+ _(ETH_RSS_NONFRAG_IPV4_OTHER, "ipv4-other") \
+ _(ETH_RSS_IPV6, "ipv6") \
+ _(ETH_RSS_FRAG_IPV6, "ipv6-frag") \
+ _(ETH_RSS_NONFRAG_IPV6_TCP, "ipv6-tcp") \
+ _(ETH_RSS_NONFRAG_IPV6_UDP, "ipv6-udp") \
+ _(ETH_RSS_NONFRAG_IPV6_SCTP, "ipv6-sctp") \
+ _(ETH_RSS_NONFRAG_IPV6_OTHER, "ipv6-other") \
+ _(ETH_RSS_L2_PAYLOAD, "l2-payload") \
+ _(ETH_RSS_IPV6_EX, "ipv6-ex") \
+ _(ETH_RSS_IPV6_TCP_EX, "ipv6-tcp-ex") \
+ _(ETH_RSS_IPV6_UDP_EX, "ipv6-udp-ex")
+
+#define foreach_dpdk_rx_offload_caps \
+ _(DEV_RX_OFFLOAD_VLAN_STRIP, "vlan-strip") \
+ _(DEV_RX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_RX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_RX_OFFLOAD_TCP_LRO , "tcp-lro") \
+ _(DEV_RX_OFFLOAD_QINQ_STRIP, "qinq-strip")
+
+#define foreach_dpdk_tx_offload_caps \
+ _(DEV_TX_OFFLOAD_VLAN_INSERT, "vlan-insert") \
+ _(DEV_TX_OFFLOAD_IPV4_CKSUM, "ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_UDP_CKSUM , "udp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_CKSUM , "tcp-cksum") \
+ _(DEV_TX_OFFLOAD_SCTP_CKSUM , "sctp-cksum") \
+ _(DEV_TX_OFFLOAD_TCP_TSO , "tcp-tso") \
+ _(DEV_TX_OFFLOAD_UDP_TSO , "udp-tso") \
+ _(DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM, "outer-ipv4-cksum") \
+ _(DEV_TX_OFFLOAD_QINQ_INSERT, "qinq-insert")
+
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
+
+#define foreach_dpdk_pkt_rx_offload_flag \
+ _ (PKT_RX_VLAN_PKT, "RX packet is an 802.1Q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet")
+
+#define foreach_dpdk_pkt_type \
+ _ (RTE_PTYPE_L3_IPV4, "Packet with IPv4 header") \
+ _ (RTE_PTYPE_L3_IPV4_EXT, "Packet with extended IPv4 header") \
+ _ (RTE_PTYPE_L3_IPV6, "Packet with IPv6 header") \
+ _ (RTE_PTYPE_L3_IPV6_EXT, "Packet with extended IPv6 header")
+#else
+#define foreach_dpdk_pkt_rx_offload_flag \
+ _ (PKT_RX_VLAN_PKT, "RX packet is an 802.1Q VLAN packet") \
+ _ (PKT_RX_RSS_HASH, "RX packet with RSS hash result") \
+ _ (PKT_RX_FDIR, "RX packet with FDIR infos") \
+ _ (PKT_RX_L4_CKSUM_BAD, "L4 cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IP_CKSUM_BAD, "IP cksum of RX pkt. is not OK") \
+ _ (PKT_RX_IPV4_HDR, "RX packet with IPv4 header") \
+ _ (PKT_RX_IPV4_HDR_EXT, "RX packet with extended IPv4 header") \
+ _ (PKT_RX_IPV6_HDR, "RX packet with IPv6 header") \
+ _ (PKT_RX_IPV6_HDR_EXT, "RX packet with extended IPv6 header") \
+ _ (PKT_RX_IEEE1588_PTP, "RX IEEE1588 L2 Ethernet PT Packet") \
+ _ (PKT_RX_IEEE1588_TMST, "RX IEEE1588 L2/L4 timestamped packet")
+
+#define foreach_dpdk_pkt_type /* Dummy */
+#endif /* RTE_VERSION */
+
+#define foreach_dpdk_pkt_tx_offload_flag \
+ _ (PKT_TX_VLAN_PKT, "TX packet is an 802.1Q VLAN packet") \
+ _ (PKT_TX_IP_CKSUM, "IP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_TCP_CKSUM, "TCP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_SCTP_CKSUM, "SCTP cksum of TX pkt. computed by NIC") \
+ _ (PKT_TX_IEEE1588_TMST, "TX IEEE1588 packet to timestamp")
+
+#define foreach_dpdk_pkt_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_tx_offload_flag
+
+static inline u8 * format_dpdk_pkt_types (u8 * s, va_list * va)
+{
+ u32 *pkt_types = va_arg (*va, u32 *);
+ uword indent __attribute__((unused)) = format_get_indent (s) + 2;
+
+ if (!*pkt_types)
+ return s;
+
+ s = format (s, "Packet Types");
+
+#define _(F, S) \
+ if (*pkt_types & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_type
+
+#undef _
+
+ return s;
+}
+
+static inline u8 * format_dpdk_pkt_offload_flags (u8 * s, va_list * va)
+{
+ u16 *ol_flags = va_arg (*va, u16 *);
+ uword indent = format_get_indent (s) + 2;
+
+ if (!*ol_flags)
+ return s;
+
+ s = format (s, "Packet Offload Flags");
+
+#define _(F, S) \
+ if (*ol_flags & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_offload_flag
+
+#undef _
+
+ return s;
+}
+
+static inline u8 * format_dpdk_rte_mbuf (u8 * s, va_list * va)
+{
+ struct rte_mbuf * mb = va_arg (*va, struct rte_mbuf *);
+ uword indent = format_get_indent (s) + 2;
+
+ s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+ "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x,"
+ "\n%Upacket_type 0x%x",
+ mb->port, mb->nb_segs, mb->pkt_len,
+ format_white_space, indent,
+ mb->buf_len, mb->data_len, mb->ol_flags,
+ format_white_space, indent,
+ mb->packet_type);
+
+ if (mb->ol_flags)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_offload_flags, &mb->ol_flags);
+
+ if (mb->packet_type)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_types, &mb->packet_type);
+ return s;
+}
+
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+#define foreach_dpdk_pkt_ext_rx_offload_flag \
+ _ (PKT_EXT_RX_PKT_ERROR, "RX Packet Error") \
+ _ (PKT_EXT_RX_BAD_FCS, "RX Bad FCS checksum") \
+ _ (PKT_EXT_RX_UDP, "RX packet with UDP L4 header") \
+ _ (PKT_EXT_RX_TCP, "RX packet with TCP L4 header") \
+ _ (PKT_EXT_RX_IPV4_FRAGMENT, "RX packet IPv4 Fragment")
+
+#define foreach_dpdk_pkt_ext_offload_flag \
+ foreach_dpdk_pkt_rx_offload_flag \
+ foreach_dpdk_pkt_ext_rx_offload_flag
+
+static inline u8 * format_dpdk_pkt_rx_offload_flags (u8 * s, va_list * va)
+{
+ u16 *ol_flags = va_arg (*va, u16 *);
+ uword indent = format_get_indent (s) + 2;
+
+ if (!*ol_flags)
+ return s;
+
+ s = format (s, "Packet RX Offload Flags");
+
+#define _(F, S) \
+ if (*ol_flags & F) \
+ { \
+ s = format (s, "\n%U%s (0x%04x) %s", \
+ format_white_space, indent, #F, F, S); \
+ }
+
+ foreach_dpdk_pkt_ext_offload_flag
+
+#undef _
+
+ return s;
+}
+
+static inline u8 * format_dpdk_rx_rte_mbuf (u8 * s, va_list * va)
+{
+ struct rte_mbuf * mb = va_arg (*va, struct rte_mbuf *);
+ uword indent = format_get_indent (s) + 2;
+
+ /*
+ * Note: Assumes mb is head of pkt chain -- port, nb_segs, & pkt_len
+ * are only valid for the 1st mbuf segment.
+ */
+ s = format (s, "PKT MBUF: port %d, nb_segs %d, pkt_len %d"
+ "\n%Ubuf_len %d, data_len %d, ol_flags 0x%x"
+ "\n%Upacket_type 0x%x",
+ mb->port, mb->nb_segs, mb->pkt_len,
+ format_white_space, indent,
+ mb->buf_len, mb->data_len, mb->ol_flags,
+ format_white_space, indent,
+ mb->packet_type);
+
+ if (mb->ol_flags)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_rx_offload_flags, &mb->ol_flags);
+
+ if (mb->packet_type)
+ s = format (s, "\n%U%U", format_white_space, indent,
+ format_dpdk_pkt_types, &mb->packet_type);
+ return s;
+}
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+/* These args appear by themselves */
+#define foreach_eal_double_hyphen_predicate_arg \
+_(no-shconf) \
+_(no-hpet) \
+_(no-pci) \
+_(no-huge) \
+_(vmware-tsc-map) \
+_(virtio-vhost)
+
+#define foreach_eal_single_hyphen_mandatory_arg \
+_(coremask, c) \
+_(nchannels, n) \
+
+#define foreach_eal_single_hyphen_arg \
+_(blacklist, b) \
+_(mem-alloc-request, m) \
+_(force-ranks, r)
+
+/* These args are preceded by "--" and followed by a single string */
+#define foreach_eal_double_hyphen_arg \
+_(huge-dir) \
+_(proc-type) \
+_(file-prefix) \
+_(socket-mem) \
+_(vdev)
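+
+/*
+ * For illustration: these lists drive construction of the EAL init
+ * arguments, so a config line such as "socket-mem 1024" becomes the argv
+ * pair "--socket-mem" "1024", while predicate args appear alone,
+ * e.g. "--no-hpet".
+ */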
+
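+/*
+ * Fill xd->rx_vectors[queue_id] with up to VLIB_FRAME_SIZE mbufs from the
+ * device, dispatching on device type (PMD, vhost-user, or KNI); returns
+ * the number of packets received.
+ */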
+static inline u32
+dpdk_rx_burst ( dpdk_main_t * dm, dpdk_device_t * xd, u16 queue_id)
+{
+ u32 n_buffers;
+ u32 n_left;
+ u32 n_this_chunk;
+
+ n_left = VLIB_FRAME_SIZE;
+ n_buffers = 0;
+
+ if (PREDICT_TRUE(xd->dev_type == VNET_DPDK_DEV_ETH))
+ {
+ while (n_left)
+ {
+ n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
+ xd->rx_vectors[queue_id] + n_buffers, n_left);
+ n_buffers += n_this_chunk;
+ n_left -= n_this_chunk;
+
+ /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
+ if (n_this_chunk < 32)
+ break;
+ }
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ {
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ unsigned socket_id = rte_socket_id();
+
+ if (PREDICT_FALSE(!xd->vu_is_running))
+ return 0;
+
+ n_buffers = rte_vhost_dequeue_burst(&xd->vu_vhost_dev, VIRTIO_TXQ,
+ bm->pktmbuf_pools[socket_id],
+ xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);
+
+ f64 now = vlib_time_now (dm->vlib_main);
+
+ /* send pending interrupts if needed */
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_TXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_TXQ]);
+ vring->n_since_last_int += n_buffers;
+
+ if ((vring->n_since_last_int && (vring->int_deadline < now))
+ || (vring->n_since_last_int > dm->vhost_coalesce_frames))
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_TXQ);
+ }
+
+ if (dpdk_vhost_user_want_interrupt(xd, VIRTIO_RXQ)) {
+ dpdk_vu_vring *vring = &(xd->vu_intf->vrings[VIRTIO_RXQ]);
+ if (vring->n_since_last_int && (vring->int_deadline < now))
+ dpdk_vhost_user_send_interrupt(dm->vlib_main, xd, VIRTIO_RXQ);
+ }
+
+ }
+ else if (xd->dev_type == VNET_DPDK_DEV_KNI)
+ {
+ n_buffers = rte_kni_rx_burst(xd->kni, xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);
+ rte_kni_handle_request(xd->kni);
+ }
+ else
+ {
+ ASSERT(0);
+ }
+
+ return n_buffers;
+}
+
+
+static inline void
+dpdk_update_counters (dpdk_device_t * xd, f64 now)
+{
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+ u32 my_cpu = os_get_cpu_number();
+ u64 rxerrors, last_rxerrors;
+ int len;
+
+ /* only update counters for PMD interfaces */
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return;
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (xd->admin_up != 0xff)
+ {
+ xd->time_last_stats_update = now ? now : xd->time_last_stats_update;
+ memcpy (&xd->last_stats, &xd->stats, sizeof (xd->last_stats));
+ rte_eth_stats_get (xd->device_index, &xd->stats);
+
+ /* maybe bump interface rx no buffer counter */
+ if (PREDICT_FALSE (xd->stats.rx_nombuf != xd->last_stats.rx_nombuf))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_NO_BUF);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ xd->stats.rx_nombuf -
+ xd->last_stats.rx_nombuf);
+ }
+
+ /* missed pkt counter */
+ if (PREDICT_FALSE (xd->stats.imissed != xd->last_stats.imissed))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_MISS);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ xd->stats.imissed -
+ xd->last_stats.imissed);
+ }
+ rxerrors = xd->stats.ibadcrc
+ + xd->stats.ibadlen + xd->stats.ierrors;
+ last_rxerrors = xd->last_stats.ibadcrc
+ + xd->last_stats.ibadlen + xd->last_stats.ierrors;
+
+ if (PREDICT_FALSE (rxerrors != last_rxerrors))
+ {
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_RX_ERROR);
+
+ vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+ rxerrors - last_rxerrors);
+ }
+ }
+
+ if ((len = rte_eth_xstats_get(xd->device_index, NULL, 0)) > 0)
+ {
+ vec_validate(xd->xstats, len - 1);
+ len = rte_eth_xstats_get(xd->device_index, xd->xstats, vec_len(xd->xstats));
+ ASSERT(vec_len(xd->xstats) == len);
+ _vec_len(xd->xstats) = len;
+ }
+}
diff --git a/vnet/vnet/devices/dpdk/init.c b/vnet/vnet/devices/dpdk/init.c
new file mode 100644
index 00000000000..a4b0f01475f
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/init.c
@@ -0,0 +1,1728 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vlib/unix/physmem.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "dpdk_priv.h"
+
+dpdk_main_t dpdk_main;
+
+/* force linker to link functions used by vlib and declared weak */
+void *vlib_weakly_linked_functions[] = {
+ &rte_pktmbuf_init,
+ &rte_pktmbuf_pool_init,
+};
+
+#define LINK_STATE_ELOGS 0
+
+#define DEFAULT_HUGE_DIR "/run/vpp/hugepages"
+#define VPP_RUN_DIR "/run/vpp"
+
+/* Port configuration, mildly modified Intel app values */
+
+static struct rte_eth_conf port_conf_template = {
+ .rxmode = {
+ .split_hdr_size = 0,
+ .header_split = 0, /**< Header Split disabled */
+ .hw_ip_checksum = 0, /**< IP checksum offload disabled */
+ .hw_vlan_filter = 0, /**< VLAN filtering disabled */
+ .hw_strip_crc = 1, /**< CRC stripped by hardware */
+ },
+ .txmode = {
+ .mq_mode = ETH_MQ_TX_NONE,
+ },
+};
+
+clib_error_t *
+dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ int rv;
+ int j;
+
+ ASSERT(os_get_cpu_number() == 0);
+
+ if (xd->admin_up) {
+ vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index, 0);
+ rte_eth_dev_stop (xd->device_index);
+ }
+
+ rv = rte_eth_dev_configure (xd->device_index, xd->rx_q_used,
+ xd->tx_q_used, &xd->port_conf);
+
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ /* Set up one TX-queue per worker thread */
+ for (j = 0; j < xd->tx_q_used; j++)
+ {
+ rv = rte_eth_tx_queue_setup(xd->device_index, j, xd->nb_tx_desc,
+ xd->cpu_socket, &xd->tx_conf);
+ if (rv < 0)
+ break;
+ }
+
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_tx_queue_setup[%d]: err %d",
+ xd->device_index, rv);
+
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+
+ rv = rte_eth_rx_queue_setup(xd->device_index, j, xd->nb_rx_desc,
+ xd->cpu_socket, 0,
+ bm->pktmbuf_pools[xd->cpu_socket_id_by_queue[j]]);
+ if (rv < 0)
+ return clib_error_return (0, "rte_eth_rx_queue_setup[%d]: err %d",
+ xd->device_index, rv);
+ }
+
+ if (xd->admin_up) {
+ rte_eth_dev_start (xd->device_index);
+ }
+ return 0;
+}
+
+static u32 dpdk_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 flags)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+ u32 old = 0;
+
+ if (ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC(flags))
+ {
+ old = xd->promisc;
+ xd->promisc = flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL;
+
+ if (xd->admin_up)
+ {
+ if (xd->promisc)
+ rte_eth_promiscuous_enable(xd->device_index);
+ else
+ rte_eth_promiscuous_disable(xd->device_index);
+ }
+ }
+ else if (ETHERNET_INTERFACE_FLAG_CONFIG_MTU(flags))
+ {
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_VICE ||
+ xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ struct rte_eth_dev_info dev_info;
+
+ /*
+ * Restore mtu to what has been set by CIMC in the firmware cfg.
+ */
+ rte_eth_dev_info_get(xd->device_index, &dev_info);
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+
+ vlib_cli_output (vlib_get_main(),
+ "Cisco VIC mtu can only be changed "
+ "using CIMC then rebooting the server!");
+ }
+ else
+ {
+ int rv;
+
+ /*
+ * DAW-FIXME: The DPDK VMXNET3 driver does not currently support
+ * multi-buffer packets. Max out at 1518 bytes for now.
+ *
+ * If/when the driver gets fixed, then this should be
+ * removed.
+ */
+ if ((xd->pmd == VNET_DPDK_PMD_VMXNET3) &&
+ (hi->max_packet_bytes > 1518))
+ {
+ hi->max_packet_bytes = 1518;
+
+ vlib_cli_output (vlib_get_main(),
+ "VMXNET3 driver does not support jumbo frames "
+ "yet -- setting mtu to 1518!");
+ }
+
+ xd->port_conf.rxmode.max_rx_pkt_len = hi->max_packet_bytes;
+
+ if (xd->admin_up)
+ rte_eth_dev_stop (xd->device_index);
+
+ rv = rte_eth_dev_configure
+ (xd->device_index,
+ xd->rx_q_used,
+ xd->tx_q_used,
+ &xd->port_conf);
+
+ if (rv < 0)
+ vlib_cli_output (vlib_get_main(),
+ "rte_eth_dev_configure[%d]: err %d",
+ xd->device_index, rv);
+
+ rte_eth_dev_set_mtu(xd->device_index, hi->max_packet_bytes);
+
+ if (xd->admin_up)
+ rte_eth_dev_start (xd->device_index);
+ }
+ }
+ return old;
+}
+
+#ifdef NETMAP
+extern int rte_netmap_probe(void);
+#endif
+
+static clib_error_t *
+dpdk_lib_init (dpdk_main_t * dm)
+{
+ u32 nports;
+ u32 nb_desc = 0;
+ int i;
+ clib_error_t * error;
+ vlib_main_t * vm = vlib_get_main();
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vnet_sw_interface_t * sw;
+ vnet_hw_interface_t * hi;
+ dpdk_device_t * xd;
+ vlib_thread_registration_t * tr;
+ uword * p;
+
+ u32 next_cpu = 0;
+ u8 af_packet_port_id = 0;
+
+ dm->input_cpu_first_index = 0;
+ dm->input_cpu_count = 1;
+
+ /* find out which cpus will be used for input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "io");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+
+ if (!tr || tr->count == 0)
+ {
+ /* no io threads, workers doing input */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+ }
+ else
+ {
+ dm->have_io_threads = 1;
+ }
+
+ if (tr && tr->count > 0)
+ {
+ dm->input_cpu_first_index = tr->first_index;
+ dm->input_cpu_count = tr->count;
+ }
+
+ vec_validate_aligned (dm->devices_by_cpu, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (dm->workers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+#ifdef NETMAP
+ if(rte_netmap_probe() < 0)
+ return clib_error_return (0, "rte netmap probe failed");
+#endif
+
+ nports = rte_eth_dev_count();
+ if (nports < 1)
+ {
+ clib_warning ("DPDK drivers found no ports...");
+ }
+
+ if (CLIB_DEBUG > 0)
+ clib_warning ("DPDK drivers found %d ports...", nports);
+
+ /*
+ * All buffers are allocated from the same rte_mempool, so they all
+ * have the same number of data bytes.
+ */
+ dm->vlib_buffer_free_list_index =
+ vlib_buffer_get_or_create_free_list (
+ vm, VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, "dpdk rx");
+
+ for (i = 0; i < nports; i++)
+ {
+ u8 addr[6];
+ int j;
+ struct rte_eth_dev_info dev_info;
+ clib_error_t * rv;
+ struct rte_eth_link l;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_DEFAULT;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_DEFAULT;
+ xd->cpu_socket = (i8) rte_eth_dev_socket_id(i);
+ rte_eth_dev_info_get(i, &dev_info);
+
+ memcpy(&xd->tx_conf, &dev_info.default_txconf,
+ sizeof(struct rte_eth_txconf));
+ if (dm->no_multi_seg)
+ {
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 0;
+ }
+ else
+ {
+ xd->tx_conf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+ port_conf_template.rxmode.jumbo_frame = 1;
+ }
+
+ memcpy(&xd->port_conf, &port_conf_template, sizeof(struct rte_eth_conf));
+
+ xd->tx_q_used = dev_info.max_tx_queues < tm->n_vlib_mains ?
+ 1 : tm->n_vlib_mains;
+
+ if (dm->use_rss > 1 && dev_info.max_rx_queues >= dm->use_rss)
+ {
+ xd->rx_q_used = dm->use_rss;
+ xd->port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+ xd->port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP;
+ }
+ else
+ xd->rx_q_used = 1;
+
+ xd->dev_type = VNET_DPDK_DEV_ETH;
+ if (!xd->pmd) {
+
+
+#define _(s,f) else if (!strcmp(dev_info.driver_name, s)) \
+ xd->pmd = VNET_DPDK_PMD_##f;
+ if (0)
+ ;
+ foreach_dpdk_pmd
+#undef _
+ else
+ xd->pmd = VNET_DPDK_PMD_UNKNOWN;
+
+ switch (xd->pmd) {
+ /* 1G adapters */
+ case VNET_DPDK_PMD_E1000EM:
+ case VNET_DPDK_PMD_IGB:
+ case VNET_DPDK_PMD_IGBVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ break;
+
+ /* 10G adapters */
+ case VNET_DPDK_PMD_IXGBE:
+ case VNET_DPDK_PMD_IXGBEVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_10GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_10GE;
+ break;
+
+ /* Cisco VIC */
+ case VNET_DPDK_PMD_VICE:
+ case VNET_DPDK_PMD_ENIC:
+ rte_eth_link_get_nowait(xd->device_index, &l);
+ if (l.link_speed == 40000)
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE;
+ }
+ else
+ {
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_10GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_10GE;
+ }
+ break;
+
+ /* Intel Fortville */
+ case VNET_DPDK_PMD_I40E:
+ case VNET_DPDK_PMD_I40EVF:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE;
+
+ switch (dev_info.pci_dev->id.device_id) {
+ case I40E_DEV_ID_10G_BASE_T:
+ case I40E_DEV_ID_SFP_XL710:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_10G;
+ break;
+ case I40E_DEV_ID_QSFP_A:
+ case I40E_DEV_ID_QSFP_B:
+ case I40E_DEV_ID_QSFP_C:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ case I40E_DEV_ID_VF:
+ rte_eth_link_get_nowait(xd->device_index, &l);
+ xd->port_type = l.link_speed == 10000 ?
+ VNET_DPDK_PORT_TYPE_ETH_10G : VNET_DPDK_PORT_TYPE_ETH_40G;
+ break;
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+ break;
+
+ /* Intel Red Rock Canyon */
+ case VNET_DPDK_PMD_FM10K:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_SWITCH;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_40GE;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_40GE;
+ break;
+
+ /* virtio */
+ case VNET_DPDK_PMD_VIRTIO:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->nb_rx_desc = DPDK_NB_RX_DESC_VIRTIO;
+ xd->nb_tx_desc = DPDK_NB_TX_DESC_VIRTIO;
+ break;
+
+ /* vmxnet3 */
+ case VNET_DPDK_PMD_VMXNET3:
+ xd->port_type = VNET_DPDK_PORT_TYPE_ETH_1G;
+ xd->tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+ break;
+
+ case VNET_DPDK_PMD_AF_PACKET:
+ xd->port_type = VNET_DPDK_PORT_TYPE_AF_PACKET;
+ xd->af_packet_port_id = af_packet_port_id++;
+ break;
+
+ default:
+ xd->port_type = VNET_DPDK_PORT_TYPE_UNKNOWN;
+ }
+
+ #ifdef NETMAP
+ if(strncmp(dev_info.driver_name, "vale", 4) == 0
+ || strncmp(dev_info.driver_name, "netmap", 6) == 0)
+ {
+ xd->pmd = VNET_DPDK_PMD_NETMAP;
+ xd->port_type = VNET_DPDK_PORT_TYPE_NETMAP;
+ }
+ #endif
+
+ }
+
+ /*
+ * Ensure default mtu is not > the mtu read from the hardware.
+ * Otherwise rte_eth_dev_configure() will fail and the port will
+ * not be available.
+ */
+ xd->port_conf.rxmode.max_rx_pkt_len =
+ (ETHERNET_MAX_PACKET_BYTES > dev_info.max_rx_pktlen) ?
+ dev_info.max_rx_pktlen : ETHERNET_MAX_PACKET_BYTES;
+
+ /*
+ * DAW-FIXME: VMXNET3 driver doesn't support jumbo / multi-buffer pkts
+ */
+ if (xd->pmd == VNET_DPDK_PMD_VMXNET3)
+ {
+ xd->port_conf.rxmode.max_rx_pkt_len = 1518;
+ xd->port_conf.rxmode.jumbo_frame = 0;
+ }
+
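+      /*
+       * AF_PACKET ports have no hardware MAC to read; synthesize a
+       * locally administered unicast address (02:fe:xx:xx:xx:xx) from
+       * the current time.
+       */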
+ if (xd->pmd == VNET_DPDK_PMD_AF_PACKET)
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+ memcpy (addr+2, &rnd, sizeof(rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+ else
+ rte_eth_macaddr_get(i,(struct ether_addr *)addr);
+
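+      /*
+       * Fewer tx queues than vlib threads means threads must share
+       * queues; allocate a cache-line sized and aligned word that the
+       * tx path can use as a per-device lock.
+       */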
+ if (xd->tx_q_used < tm->n_vlib_mains)
+ {
+ xd->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES);
+ }
+
+ xd->device_index = xd - dm->devices;
+ ASSERT(i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t * dq;
+ int q;
+
+ for (q = 0; q < xd->rx_q_used; q++)
+ {
+ int cpu = dm->input_cpu_first_index + next_cpu;
+ unsigned lcore = vlib_worker_threads[cpu].dpdk_lcore_id;
+
+ /*
+ * numa node for worker thread handling this queue
+ * needed for taking buffers from the right mempool
+ */
+ vec_validate(xd->cpu_socket_id_by_queue, q);
+ xd->cpu_socket_id_by_queue[q] = rte_lcore_to_socket_id(lcore);
+
+ /*
+ * construct vector of (device,queue) pairs for each worker thread
+ */
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = q;
+
+ next_cpu++;
+ if (next_cpu == dm->input_cpu_count)
+ next_cpu = 0;
+ }
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j< xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->frames, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ rv = dpdk_port_setup(dm, xd);
+
+      if (rv)
+        return rv;
+
+ /* count the number of descriptors used for this device */
+ nb_desc += xd->nb_rx_desc + xd->nb_tx_desc * xd->tx_q_used;
+
+ error = ethernet_register_interface
+ (dm->vnet_main,
+ dpdk_device_class.index,
+ xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index,
+ dpdk_flag_change);
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+
+ /*
+ * DAW-FIXME: The Cisco VIC firmware does not provide an api for a
+ * driver to dynamically change the mtu. If/when the
+ * VIC firmware gets fixed, then this should be removed.
+ */
+ if (xd->pmd == VNET_DPDK_PMD_VICE ||
+ xd->pmd == VNET_DPDK_PMD_ENIC)
+ {
+ /*
+ * Initialize mtu to what has been set by CIMC in the firmware cfg.
+ */
+ hi->max_packet_bytes = dev_info.max_rx_pktlen;
+ /*
+ * remove vlan tag from VIC port to fix VLAN0 issue.
+ * TODO Handle VLAN tagged traffic
+ */
+ int vlan_off;
+ vlan_off = rte_eth_dev_get_vlan_offload(xd->device_index);
+ vlan_off |= ETH_VLAN_STRIP_OFFLOAD;
+ rte_eth_dev_set_vlan_offload(xd->device_index, vlan_off);
+ }
+ /*
+ * DAW-FIXME: VMXNET3 driver doesn't support jumbo / multi-buffer pkts
+ */
+ else if (xd->pmd == VNET_DPDK_PMD_VMXNET3)
+ hi->max_packet_bytes = 1518;
+
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] =
+ xd->port_conf.rxmode.max_rx_pkt_len - sizeof(ethernet_header_t);
+
+ rte_eth_dev_set_mtu(xd->device_index, hi->max_packet_bytes);
+ }
+
+ if (dm->num_kni) {
+ clib_warning("Initializing KNI interfaces...");
+ rte_kni_init(dm->num_kni);
+ for (i = 0; i < dm->num_kni; i++)
+ {
+ u8 addr[6];
+ int j;
+
+ /* Create vnet interface */
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->dev_type = VNET_DPDK_DEV_KNI;
+
+ xd->device_index = xd - dm->devices;
+ ASSERT(nports + i == xd->device_index);
+ xd->per_interface_next_index = ~0;
+ xd->kni_port_id = i;
+ xd->cpu_socket = -1;
+ hash_set (dm->dpdk_device_by_kni_port_id, i, xd - dm->devices);
+ xd->rx_q_used = 1;
+
+ /* assign interface to input thread */
+ dpdk_device_and_queue_t * dq;
+ vec_add2(dm->devices_by_cpu[dm->input_cpu_first_index], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used,
+ CLIB_CACHE_LINE_BYTES);
+ for (j = 0; j< xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->frames, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ /* FIXME Set up one TX-queue per worker thread */
+
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (addr+2, &rnd, sizeof(rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (dm->vnet_main,
+ dpdk_device_class.index,
+ xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index,
+ dpdk_flag_change);
+
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+ hi = vnet_get_hw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ }
+ }
+
+ if (nb_desc > dm->num_mbufs)
+ clib_warning ("%d mbufs allocated but total rx/tx ring size is %d\n",
+ dm->num_mbufs, nb_desc);
+
+ /* init next vhost-user if index */
+ dm->next_vu_if_id = 0;
+
+ return 0;
+}
+
+/*
+ * Tell the vlib physical memory allocator that we've handled
+ * the initialization. We don't actually do so until
+ * vlib_main(...) calls the dpdk config function.
+ */
+int vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm,
+ int physmem_required)
+{
+ return 1;
+}
+
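+/*
+ * Write a formatted string to a sysfs control file. Illustrative use
+ * (hypothetical device address):
+ *
+ *   write_sys_fs ("/sys/bus/pci/devices/0000:00:04.0/driver/unbind",
+ *                 "%s", "0000:00:04.0");
+ */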
+static clib_error_t *
+write_sys_fs (char * file_name, char * fmt, ...)
+{
+ u8 * s;
+ int fd;
+
+ fd = open (file_name, O_WRONLY);
+ if (fd < 0)
+ return clib_error_return_unix (0, "open `%s'", file_name);
+
+ va_list va;
+ va_start (va, fmt);
+ s = va_format (0, fmt, &va);
+ va_end (va);
+ vec_add1 (s, 0); // terminate c string
+
+  if (write (fd, s, vec_len (s)) < 0)
+    {
+      clib_error_t * error = clib_error_return_unix (0, "write '%s' to '%s'",
+                                                     s, file_name);
+      vec_free (s);
+      close (fd);
+      return error;
+    }
+
+ vec_free (s);
+ close (fd);
+ return 0;
+}
+
+#define VIRTIO_PCI_NAME "virtio-pci"
+
+static clib_error_t * dpdk_bind_eth_kernel_drivers (vlib_main_t * vm,
+ char * pci_dev_id,
+ char * kernel_driver)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ unformat_input_t _in;
+ unformat_input_t * in = &_in;
+ clib_error_t * error = 0;
+ u8 * line = 0, * modcmd = 0, * path = 0;
+ u8 * pci_vid = 0, *pci_did = 0, * devname = 0;
+ char *driver_name = kernel_driver;
+ FILE * fp;
+
+ /*
+ * Bail out now if we're not running as root.
+ * This allows non-privileged use of the packet generator, etc.
+ */
+ if (geteuid() != 0)
+ return 0;
+
+ /*
+ * Get all ethernet pci device numbers for the device type specified.
+ */
+ modcmd = format (0, "lspci -nDd %s | grep 0200 | "
+ "awk '{ print $1, $3 }'%c", pci_dev_id, 0);
+ if ((fp = popen ((const char *)modcmd, "r")) == NULL)
+ {
+ error = clib_error_return_unix (0,
+ "Unable to get %s ethernet pci devices.",
+ pci_dev_id);
+ goto done;
+ }
+
+ vec_validate (line, BUFSIZ);
+ vec_validate (path, BUFSIZ);
+ while (fgets ((char *)line, BUFSIZ, fp) != NULL)
+ {
+ struct stat st;
+ u8 bind_uio = 1;
+ line[strlen ((char *)line) - 1] = 0; // chomp trailing newline.
+
+ unformat_init_string (in, (char *)line, strlen((char *)line) + 1);
+ unformat(in, "%s %s:%s", &devname, &pci_vid, &pci_did);
+ unformat_free (in);
+
+      /*
+       * Skip ethernet interfaces which have been blacklisted, i.e.
+       * those found in the linux IP routing tables (route --inet --inet6).
+       */
+ if (strstr ((char *)dm->eth_if_blacklist, (char *)devname))
+ continue;
+
+ /*
+ * If there are any devices whitelisted, then blacklist all devices
+ * which are not explicitly whitelisted.
+ */
+ if (dm->eth_if_whitelist &&
+ !strstr ((char *)dm->eth_if_whitelist, (char *)devname))
+ continue;
+
+#ifdef NETMAP
+ /*
+ * Optimistically open the device as a netmap device.
+ */
+ if (eth_nm_open((char *)devname))
+ continue;
+#endif
+
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/devices/%s/driver/unbind%c",
+ devname, 0);
+
+ /*
+ * If the device is bound to a driver...
+ */
+ if (stat ((const char *)path, &st) == 0)
+ {
+ u8 * device_path;
+
+	  /*
+	   * If the target kernel driver is not virtio-pci...
+	   */
+ if (!driver_name || strcmp(driver_name, VIRTIO_PCI_NAME))
+ {
+ /*
+ * If it is already bound to driver, don't unbind/bind it.
+ */
+ device_path = format (0, "/sys/bus/pci/drivers/%s/%s/device%c",
+ driver_name, devname, 0);
+ if (stat ((const char *)device_path, &st) == 0)
+ bind_uio = 0;
+
+ vec_free (device_path);
+ }
+
+ /*
+ * unbind it from the current driver
+ */
+ if (bind_uio)
+ {
+ _vec_len (path) -= 1;
+ path = format (path, "%c", 0);
+ error = write_sys_fs ((char *)path, "%s", devname);
+ if (error)
+ goto done;
+ }
+ }
+
+ /*
+ * DAW-FIXME: The following bind/unbind dance is necessary for the dpdk
+ * virtio poll-mode driver to work.
+ */
+
+ if (driver_name && !strcmp(driver_name, VIRTIO_PCI_NAME))
+ {
+ /*
+ * bind interface to the native kernel module
+ */
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/drivers/%s/bind%c",
+ driver_name, 0);
+ error = write_sys_fs ((char *)path, "%s", devname);
+ if (error)
+ goto done;
+
+ /*
+ * unbind interface from the native kernel module
+ */
+ _vec_len (path) -= 5;
+ path = format (path, "unbind%c", 0);
+ error = write_sys_fs ((char *)path, "%s", devname);
+ if (error)
+ goto done;
+ }
+
+ /*
+ * bind the interface to igb_uio
+ */
+ if (bind_uio)
+ {
+ int pci_vendor_id = strtol((char *) pci_vid, NULL, 16);
+ int pci_device_id = strtol((char *) pci_did, NULL, 16);
+
+	  /*
+	   * Write the PCI vendor/device ID to ".../drivers/<driver>/new_id"
+	   * for Intel Fortville adapters.
+	   */
+ if (pci_vendor_id == 0x8086 &&
+ (pci_device_id == I40E_DEV_ID_10G_BASE_T ||
+ pci_device_id == I40E_DEV_ID_SFP_XL710 ||
+ pci_device_id == I40E_DEV_ID_QSFP_A ||
+ pci_device_id == I40E_DEV_ID_QSFP_B ||
+ pci_device_id == I40E_DEV_ID_QSFP_C))
+ {
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/drivers/%s/new_id%c", driver_name, 0);
+ error = write_sys_fs ((char *) path, "%s %s", pci_vid, pci_did);
+ if (error)
+ continue;
+ }
+
+ _vec_len (path) = 0;
+ path = format (path, "/sys/bus/pci/drivers/%s/bind%c", driver_name, 0);
+ error = write_sys_fs ((char *) path, "%s", devname);
+ if (error)
+ {
+ error = 0;
+ continue;
+ }
+ }
+ }
+
+ done:
+ vec_free (line);
+ vec_free (path);
+ vec_free (devname);
+ vec_free (pci_vid);
+ vec_free (pci_did);
+ vec_free (modcmd);
+  if (fp)
+    pclose (fp);
+ return error;
+}
+
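+/*
+ * Parse a DPDK-style "socket-mem" list into a hash of per-socket
+ * megabyte counts. Illustratively, "1024,512" yields {0: 1024, 1: 512},
+ * and an empty field (e.g. the leading comma in ",512") defaults that
+ * socket to 1024.
+ */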
+static uword
+unformat_socket_mem (unformat_input_t * input, va_list * va)
+{
+ uword ** r = va_arg (* va, uword **);
+ int i = 0;
+ u32 mem;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, ","))
+ hash_set (*r, i, 1024);
+ else if (unformat (input, "%u,", &mem))
+ hash_set (*r, i, mem);
+ else if (unformat (input, "%u", &mem))
+ hash_set (*r, i, mem);
+ else
+ {
+ unformat_put_input (input);
+ goto done;
+ }
+ i++;
+ }
+
+done:
+ return 1;
+}
+
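+/*
+ * Read the number of free hugepages of the given size (in kB) on a
+ * NUMA node; e.g. for node 0 and 1G pages this reads
+ * /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/free_hugepages
+ */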
+static u32
+get_node_free_hugepages_num (u32 node, u32 page_size)
+{
+ FILE * fp;
+ u8 * tmp;
+
+ tmp = format (0, "/sys/devices/system/node/node%u/hugepages/hugepages-%ukB/"
+ "free_hugepages%c", node, page_size, 0);
+ fp = fopen ((char *) tmp, "r");
+ vec_free(tmp);
+
+ if (fp != NULL)
+ {
+ u8 * buffer = 0;
+ u32 pages_avail = 0;
+
+ vec_validate (buffer, 256-1);
+ if (fgets ((char *)buffer, 256, fp))
+ {
+ unformat_input_t in;
+ unformat_init_string (&in, (char *) buffer, strlen ((char *) buffer));
+ unformat(&in, "%u", &pages_avail);
+ unformat_free (&in);
+ }
+ vec_free(buffer);
+ fclose(fp);
+ return pages_avail;
+ }
+
+ return 0;
+}
+
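+/*
+ * Parse the "dpdk" config section. An illustrative stanza using only
+ * parameters handled below (values are examples, not recommendations):
+ *
+ *   dpdk {
+ *     dev 0000:02:00.0
+ *     num-mbufs 32768
+ *     rss 2
+ *     no-multi-seg
+ *     uio-driver igb_uio
+ *   }
+ */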
+static clib_error_t *
+dpdk_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ clib_error_t * error = 0;
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ u8 * s, * tmp = 0;
+ u8 * pci_dev_id = 0;
+ u8 * rte_cmd = 0, * ethname = 0;
+ FILE * rte_fp;
+ u32 log_level;
+ int ret, i;
+ char * fmt;
+#ifdef NETMAP
+ int rxrings, txrings, rxslots, txslots, txburst;
+  char * nmname;
+#endif
+ unformat_input_t _in;
+ unformat_input_t * in = &_in;
+ u8 no_pci = 0;
+ u8 no_huge = 0;
+ u8 huge_dir = 0;
+ u8 file_prefix = 0;
+ u8 * socket_mem = 0;
+
+ // MATT-FIXME: inverted virtio-vhost logic to use virtio by default
+ dm->use_virtio_vhost = 1;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* Prime the pump */
+ if (unformat (input, "no-hugetlb"))
+ {
+ vec_add1 (dm->eal_init_args, (u8 *) "no-huge");
+ no_huge = 1;
+ }
+
+ else if (unformat (input, "decimal-interface-names"))
+ dm->interface_name_format_decimal = 1;
+
+ else if (unformat (input, "no-multi-seg"))
+ dm->no_multi_seg = 1;
+
+ else if (unformat (input, "dev %s", &pci_dev_id))
+ {
+ if (dm->eth_if_whitelist)
+ {
+ /*
+ * Don't add duplicate device id's.
+ */
+ if (strstr ((char *)dm->eth_if_whitelist, (char *)pci_dev_id))
+ continue;
+
+ _vec_len (dm->eth_if_whitelist) -= 1; // chomp trailing NULL.
+ dm->eth_if_whitelist = format (dm->eth_if_whitelist, " %s%c",
+ pci_dev_id, 0);
+ }
+ else
+ dm->eth_if_whitelist = format (0, "%s%c", pci_dev_id, 0);
+ }
+
+#ifdef NETMAP
+ else if (unformat(input, "netmap %s/%d:%d/%d:%d/%d",
+ &nmname, &rxrings, &rxslots, &txrings, &txslots, &txburst)) {
+ char * rv;
+ rv = (char *)
+ eth_nm_args(nmname, rxrings, rxslots, txrings, txslots, txburst);
+ if (rv) {
+ error = clib_error_return (0, "%s", rv);
+ goto done;
+ }
+      } else if (unformat(input, "netmap %s", &nmname)) {
+ char * rv;
+ rv = (char *)
+ eth_nm_args(nmname, 0, 0, 0, 0, 0);
+ if (rv) {
+ error = clib_error_return (0, "%s", rv);
+ goto done;
+ }
+ }
+#endif
+
+ else if (unformat (input, "num-mbufs %d", &dm->num_mbufs))
+ ;
+ else if (unformat (input, "kni %d", &dm->num_kni))
+ ;
+ else if (unformat (input, "uio-driver %s", &dm->uio_driver_name))
+ ;
+ else if (unformat (input, "vhost-user-coalesce-frames %d", &dm->vhost_coalesce_frames))
+ ;
+ else if (unformat (input, "vhost-user-coalesce-time %f", &dm->vhost_coalesce_time))
+ ;
+ else if (unformat (input, "enable-vhost-user"))
+ dm->use_virtio_vhost = 0;
+ else if (unformat (input, "rss %d", &dm->use_rss))
+ ;
+
+#define _(a) \
+ else if (unformat(input, #a)) \
+ { \
+ if (!strncmp(#a, "no-pci", 6)) \
+ no_pci = 1; \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ }
+ foreach_eal_double_hyphen_predicate_arg
+#undef _
+
+#define _(a) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ if (!strncmp(#a, "huge-dir", 8)) \
+ huge_dir = 1; \
+ else if (!strncmp(#a, "file-prefix", 11)) \
+ file_prefix = 1; \
+ else if (!strncmp(#a, "socket-mem", 10)) \
+ socket_mem = vec_dup (s); \
+ tmp = format (0, "--%s%c", #a, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (dm->eal_init_args, s); \
+ }
+ foreach_eal_double_hyphen_arg
+#undef _
+
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (dm->eal_init_args, s); \
+ }
+ foreach_eal_single_hyphen_arg
+#undef _
+
+#define _(a,b) \
+ else if (unformat(input, #a " %s", &s)) \
+ { \
+ tmp = format (0, "-%s%c", #b, 0); \
+ vec_add1 (dm->eal_init_args, tmp); \
+ vec_add1 (s, 0); \
+ vec_add1 (dm->eal_init_args, s); \
+ dm->a##_set_manually = 1; \
+ }
+ foreach_eal_single_hyphen_mandatory_arg
+#undef _
+
+ else if (unformat(input, "default"))
+ ;
+
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ if (!dm->uio_driver_name)
+ dm->uio_driver_name = format (0, "igb_uio");
+
+ /*
+ * Use 1G huge pages if available.
+ */
+ if (!no_huge && !huge_dir)
+ {
+ uword * mem_by_socket = hash_create (0, sizeof (uword));
+ uword c;
+ u8 use_1g = 1;
+ u8 use_2m = 1;
+ int rv;
+
+ umount(DEFAULT_HUGE_DIR);
+
+ /* Process "socket-mem" parameter value */
+ if (vec_len (socket_mem))
+ {
+ unformat_input_t in;
+ unformat_init_vector(&in, socket_mem);
+ unformat(&in, "%U", unformat_socket_mem, &mem_by_socket);
+ unformat_free(&in);
+ }
+ else
+ use_1g = 0;
+
+      /* check whether enough free 1GB and 2MB pages are available on each socket */
+ clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ({
+ uword * p = hash_get (mem_by_socket, c);
+ if (p)
+ {
+ u32 mem = p[0];
+ if (mem)
+ {
+ u32 pages_num_1g = mem / 1024;
+ u32 pages_num_2m = mem / 2;
+ u32 pages_avail;
+
+ pages_avail = get_node_free_hugepages_num(c, 1048576);
+ if (!(pages_avail >= pages_num_1g))
+ use_1g = 0;
+
+ pages_avail = get_node_free_hugepages_num(c, 2048);
+ if (!(pages_avail >= pages_num_2m))
+ use_2m = 0;
+ }
+ }
+ }));
+
+ hash_free (mem_by_socket);
+
+ rv = mkdir(VPP_RUN_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ VPP_RUN_DIR, errno);
+ goto done;
+ }
+
+ rv = mkdir(DEFAULT_HUGE_DIR, 0755);
+ if (rv && errno != EEXIST)
+ {
+ error = clib_error_return (0, "mkdir '%s' failed errno %d",
+ DEFAULT_HUGE_DIR, errno);
+ goto done;
+ }
+
+ if (use_1g)
+ {
+ rv = mount("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G");
+ }
+ else if (use_2m)
+ {
+ rv = mount("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL);
+ }
+ else
+ {
+ return clib_error_return (0, "not enough free huge pages");
+ }
+
+ if (rv)
+ {
+ error = clib_error_return (0, "mount failed %d", errno);
+ goto done;
+ }
+
+ tmp = format (0, "--huge-dir%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ if (!file_prefix)
+ {
+ tmp = format (0, "--file-prefix%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ tmp = format (0, "vpp%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ }
+ }
+
+ /*
+ * Blacklist all ethernet interfaces in the linux IP routing tables.
+ */
+ dm->eth_if_blacklist = format (0, "%c", 0);
+ rte_cmd = format (0, "route --inet --inet6 -n|awk '{print $7}'|sort -u|"
+ "egrep $(echo $(ls -1d /sys/class/net/*/device|"
+ "cut -d/ -f5)|sed -s 's/ /|/g')%c", 0);
+ if ((rte_fp = popen ((const char *)rte_cmd, "r")) == NULL)
+ {
+ error = clib_error_return_unix (0, "Unable to find blacklist ethernet"
+ " interface(s) in linux routing tables.");
+ goto rte_cmd_err;
+
+ }
+
+ vec_validate (ethname, BUFSIZ);
+ while (fgets ((char *)ethname, BUFSIZ, rte_fp) != NULL)
+ {
+ FILE *rlnk_fp;
+ u8 * rlnk_cmd = 0, * devname = 0;
+
+ ethname[strlen ((char *)ethname) - 1] = 0; // chomp trailing newline.
+
+ rlnk_cmd = format (0, "readlink /sys/class/net/%s%c",
+ ethname, 0);
+
+ if ((rlnk_fp = popen ((const char *)rlnk_cmd, "r")) == NULL)
+ {
+ error = clib_error_return_unix (0, "Unable to read %s link.",
+ ethname);
+ goto rlnk_cmd_err;
+ }
+
+ vec_validate (devname, BUFSIZ);
+ while (fgets ((char *)devname, BUFSIZ, rlnk_fp) != NULL)
+ {
+ char * pci_id = 0;
+
+ /*
+ * Extract the device PCI ID name from the link. It is the first
+ * PCI ID searching backwards from the end of the link pathname.
+ * For example:
+ * readlink /sys/class/net/eth0
+ * ../../devices/pci0000:00/0000:00:0a.0/virtio4/net/eth0
+ */
+ for (pci_id = (char *)((devname + strlen((char *)devname)));
+ ((u8 *)pci_id > devname) && *pci_id != '.'; pci_id--)
+ ;
+
+ /*
+ * Verify that the field found is a valid PCI ID.
+ */
+ if ((*(pci_id - 1) == '.') || ((u8 *)(pci_id - 11) < devname) ||
+ (*(pci_id - 11) != '/') || (*(pci_id - 3) != ':') ||
+ (*(pci_id - 6) != ':'))
+ {
+ devname[strlen ((char *)devname) - 1] = 0; // chomp trailing newline.
+ clib_warning ("Unable to extract %s PCI ID (0x%llx \"%s\") "
+ "from 0x%llx \"%s\"", ethname, pci_id, pci_id,
+ devname, devname);
+ continue;
+ }
+
+ pci_id[2] = 0;
+ pci_id -= 10;
+
+ /* Don't blacklist any interfaces which have been whitelisted.
+ */
+ if (dm->eth_if_whitelist &&
+ strstr ((char *)dm->eth_if_whitelist, (char *)pci_id))
+ continue;
+
+ _vec_len (dm->eth_if_blacklist) -= 1; // chomp trailing NULL.
+ dm->eth_if_blacklist = format (dm->eth_if_blacklist, " %s%c",
+ pci_id, 0);
+ }
+
+    rlnk_cmd_err:
+      if (rlnk_fp)
+        pclose (rlnk_fp);
+ vec_free (rlnk_cmd);
+ vec_free (devname);
+ }
+
+  rte_cmd_err:
+  if (rte_fp)
+    pclose (rte_fp);
+ vec_free (rte_cmd);
+ vec_free (ethname);
+
+ if (error)
+ return error;
+
+ /* I'll bet that -c and -n must be the first and second args... */
+ if (!dm->coremask_set_manually)
+ {
+ vlib_thread_registration_t * tr;
+ uword coremask;
+ int i;
+
+ /* main thread core */
+      coremask = (uword) 1 << tm->main_lcore;
+
+ for (i = 0; i < vec_len (tm->registrations); i++)
+ {
+ tr = tm->registrations[i];
+ if (clib_bitmap_is_zero(tr->coremask))
+ continue;
+ coremask |= tr->coremask[0];
+ }
+
+ vec_insert (dm->eal_init_args, 2, 1);
+ dm->eal_init_args[1] = (u8 *) "-c";
+ tmp = format (0, "%x%c", coremask, 0);
+ dm->eal_init_args[2] = tmp;
+ }
+
+ if (!dm->nchannels_set_manually)
+ {
+ vec_insert (dm->eal_init_args, 2, 3);
+ dm->eal_init_args[3] = (u8 *) "-n";
+      tmp = format (0, "%d%c", dm->nchannels, 0);
+ dm->eal_init_args[4] = tmp;
+ }
+
+ /*
+ * If there are whitelisted devices,
+ * add the whitelist option & device list to the dpdk arg list...
+ */
+ if (dm->eth_if_whitelist)
+ {
+ unformat_init_string (in, (char *)dm->eth_if_whitelist,
+ vec_len(dm->eth_if_whitelist) - 1);
+ fmt = "-w%c";
+ }
+
+ /*
+ * Otherwise add the blacklisted devices to the dpdk arg list.
+ */
+ else
+ {
+ unformat_init_string (in, (char *)dm->eth_if_blacklist,
+ vec_len(dm->eth_if_blacklist) - 1);
+ fmt = "-b%c";
+ }
+
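+  /*
+   * Each device id in the list below becomes its own "-w <pci-id>"
+   * (whitelist) or "-b <pci-id>" (blacklist) EAL argument pair.
+   */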
+ while (unformat_check_input (in) != UNFORMAT_END_OF_INPUT)
+ {
+ tmp = format (0, fmt, 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ unformat (in, "%s", &pci_dev_id);
+ vec_add1 (dm->eal_init_args, pci_dev_id);
+ }
+
+ if (no_pci == 0)
+ {
+ /*
+ * Bind Virtio pci devices to the igb_uio kernel driver.
+ */
+ error = dpdk_bind_eth_kernel_drivers (vm, "1af4:1000", VIRTIO_PCI_NAME);
+ if (error)
+ return error;
+
+ /*
+ * Bind vmxnet3 pci devices to the igb_uio kernel driver.
+ */
+ error = dpdk_bind_eth_kernel_drivers (vm, "15ad:07b0",
+ (char *) dm->uio_driver_name);
+ if (error)
+ return error;
+
+ /*
+ * Bind Intel ethernet pci devices to igb_uio kernel driver.
+ */
+      error = dpdk_bind_eth_kernel_drivers (vm, "8086:",
+                                            (char *) dm->uio_driver_name);
+      if (error)
+        return error;
+
+      /*
+       * Bind Cisco VIC ethernet pci devices to igb_uio kernel driver.
+       */
+      error = dpdk_bind_eth_kernel_drivers (vm, "1137:0043",
+                                            (char *) dm->uio_driver_name);
+      if (error)
+        return error;
+ }
+
+ /* set master-lcore */
+ tmp = format (0, "--master-lcore%c", 0);
+ vec_add1 (dm->eal_init_args, tmp);
+ tmp = format (0, "%u%c", tm->main_lcore, 0);
+ vec_add1 (dm->eal_init_args, tmp);
+
+ /* NULL terminate the "argv" vector, in case of stupidity */
+ vec_add1 (dm->eal_init_args, 0);
+ _vec_len(dm->eal_init_args) -= 1;
+
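+  /*
+   * The EAL argv is now fully assembled. Illustratively (values are
+   * hypothetical) it might read:
+   *   vnet -c 1 -n 4 --huge-dir <dir> --file-prefix vpp \
+   *        -b 0000:00:03.0 --master-lcore 0
+   */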
+ /* Set up DPDK eal and packet mbuf pool early. */
+
+ log_level = (CLIB_DEBUG > 0) ? RTE_LOG_DEBUG : RTE_LOG_NOTICE;
+
+ rte_set_log_level (log_level);
+
+ vm = dm->vlib_main;
+
+ ret = rte_eal_init(vec_len(dm->eal_init_args), (char **) dm->eal_init_args);
+
+ /* lazy umount hugepages */
+ umount2(DEFAULT_HUGE_DIR, MNT_DETACH);
+
+ if (ret < 0)
+ return clib_error_return (0, "rte_eal_init returned %d", ret);
+
+ /* main thread 1st */
+ error = vlib_buffer_pool_create(vm, dm->num_mbufs, MBUF_SIZE, rte_socket_id());
+ if (error)
+ return error;
+
+ for (i = 0; i < RTE_MAX_LCORE; i++)
+ {
+ error = vlib_buffer_pool_create(vm, dm->num_mbufs, MBUF_SIZE,
+ rte_lcore_to_socket_id(i));
+ if (error)
+ return error;
+ }
+
+ if (dm->use_rss)
+ {
+ vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, dpdk_input_node.index);
+ rt->function = dpdk_input_rss;
+ }
+ done:
+ return error;
+}
+
+VLIB_CONFIG_FUNCTION (dpdk_config, "dpdk");
+
+void dpdk_update_link_state (dpdk_device_t * xd, f64 now)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ struct rte_eth_link prev_link = xd->link;
+ u32 hw_flags = 0;
+ u8 hw_flags_chg = 0;
+
+ /* only update link state for PMD interfaces */
+ if (xd->dev_type != VNET_DPDK_DEV_ETH)
+ return;
+
+ xd->time_last_link_update = now ? now : xd->time_last_link_update;
+ memset(&xd->link, 0, sizeof(xd->link));
+ rte_eth_link_get_nowait (xd->device_index, &xd->link);
+
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t * vm = vlib_get_main();
+ ELOG_TYPE_DECLARE(e) = {
+ .format =
+ "update-link-state: sw_if_index %d, admin_up %d,"
+ "old link_state %d new link_state %d",
+ .format_args = "i4i1i1i1",
+ };
+
+ struct { u32 sw_if_index; u8 admin_up;
+ u8 old_link_state; u8 new_link_state;} *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->admin_up = xd->admin_up;
+ ed->old_link_state = (u8)
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index);
+ ed->new_link_state = (u8) xd->link.link_status;
+ }
+
+ if ((xd->admin_up == 1) &&
+ ((xd->link.link_status != 0) ^
+ vnet_hw_interface_is_link_up (vnm, xd->vlib_hw_if_index)))
+ {
+ hw_flags_chg = 1;
+ hw_flags |= (xd->link.link_status ?
+ VNET_HW_INTERFACE_FLAG_LINK_UP: 0);
+ }
+
+ if (hw_flags_chg || (xd->link.link_duplex != prev_link.link_duplex))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_duplex)
+ {
+ case ETH_LINK_HALF_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_HALF_DUPLEX;
+ break;
+ case ETH_LINK_FULL_DUPLEX:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_FULL_DUPLEX;
+ break;
+ default:
+ break;
+ }
+ }
+ if (hw_flags_chg || (xd->link.link_speed != prev_link.link_speed))
+ {
+ hw_flags_chg = 1;
+ switch (xd->link.link_speed)
+ {
+ case ETH_LINK_SPEED_10:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10M;
+ break;
+ case ETH_LINK_SPEED_100:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_100M;
+ break;
+ case ETH_LINK_SPEED_1000:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_1G;
+ break;
+ case ETH_LINK_SPEED_10000:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_10G;
+ break;
+ case ETH_LINK_SPEED_40G:
+ hw_flags |= VNET_HW_INTERFACE_FLAG_SPEED_40G;
+ break;
+ case 0:
+ break;
+ default:
+ clib_warning("unknown link speed %d", xd->link.link_speed);
+ break;
+ }
+ }
+ if (hw_flags_chg)
+ {
+ if (LINK_STATE_ELOGS)
+ {
+ vlib_main_t * vm = vlib_get_main();
+
+ ELOG_TYPE_DECLARE(e) = {
+ .format = "update-link-state: sw_if_index %d, new flags %d",
+ .format_args = "i4i4",
+ };
+
+ struct { u32 sw_if_index; u32 flags; } *ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->sw_if_index = xd->vlib_sw_if_index;
+ ed->flags = hw_flags;
+ }
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, hw_flags);
+ }
+}
+
+static uword
+dpdk_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ clib_error_t * error;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ void *vu_state;
+ int i;
+
+ error = dpdk_lib_init (dm);
+
+  /*
+   * Turn on the input node(s) if we found some devices to drive:
+   * on the main thread when it is the only thread, on the io input
+   * node when the main thread is an io node, or on each worker
+   * that has devices assigned when there are no io threads.
+   */
+
+ if (error == 0 && vec_len(dm->devices) > 0)
+ {
+ if (tm->n_vlib_mains == 1)
+ vlib_node_set_state (vm, dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else if (tm->main_thread_is_io_node)
+ vlib_node_set_state (vm, dpdk_io_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else if (!dm->have_io_threads)
+ for (i=0; i < tm->n_vlib_mains; i++)
+ if (vec_len(dm->devices_by_cpu[i]) > 0)
+ vlib_node_set_state (vlib_mains[i], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ }
+
+ if (error)
+ clib_error_report (error);
+
+ dpdk_vhost_user_process_init(&vu_state);
+
+ dm->io_thread_release = 1;
+
+ f64 now = vlib_time_now (vm);
+ vec_foreach (xd, dm->devices)
+ {
+ dpdk_update_link_state (xd, now);
+ }
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, 5.0);
+
+ if (dpdk_get_admin_up_down_in_progress())
+ /* skip the poll if an admin up down is in progress (on any interface) */
+ continue;
+
+ vec_foreach (xd, dm->devices)
+ {
+ f64 now = vlib_time_now (vm);
+ if ((now - xd->time_last_stats_update) >= DPDK_STATS_POLL_INTERVAL)
+ dpdk_update_counters (xd, now);
+ if ((now - xd->time_last_link_update) >= DPDK_LINK_POLL_INTERVAL)
+ dpdk_update_link_state (xd, now);
+
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
+ if (dpdk_vhost_user_process_if(vm, xd, vu_state) != 0)
+ continue;
+ }
+ }
+
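+  /* NOTREACHED - the poll loop above never exits */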
+ dpdk_vhost_user_process_cleanup(vu_state);
+
+ return 0;
+}
+
+VLIB_REGISTER_NODE (dpdk_process_node,static) = {
+ .function = dpdk_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "dpdk-process",
+ .process_log2_n_stack_bytes = 17,
+};
+
+clib_error_t *
+dpdk_init (vlib_main_t * vm)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_node_t * ei;
+ clib_error_t * error = 0;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ /* verify that structs are cacheline aligned */
+ ASSERT(offsetof(dpdk_device_t, cacheline0) == 0);
+ ASSERT(offsetof(dpdk_device_t, cacheline1) == CLIB_CACHE_LINE_BYTES);
+ ASSERT(offsetof(dpdk_worker_t, cacheline0) == 0);
+ ASSERT(offsetof(frame_queue_trace_t, cacheline0) == 0);
+
+ /* Add references to DPDK Driver Constructor functions to get the dynamic
+ * loader to pull in the driver library & run the constructors.
+ */
+#define _(d) \
+do { \
+ void devinitfn_ ##d(void); \
+ __attribute__((unused)) void (* volatile pf)(void); \
+ pf = devinitfn_ ##d; \
+} while(0);
+
+#ifdef RTE_LIBRTE_EM_PMD
+ _(em_pmd_drv)
+#endif
+
+#ifdef RTE_LIBRTE_IGB_PMD
+ _(pmd_igb_drv)
+#endif
+
+#ifdef RTE_LIBRTE_IXGBE_PMD
+ _(rte_ixgbe_driver)
+#endif
+
+#ifdef RTE_LIBRTE_I40E_PMD
+ _(rte_i40e_driver)
+ _(rte_i40evf_driver)
+#endif
+
+#ifdef RTE_LIBRTE_FM10K_PMD
+ _(rte_fm10k_driver)
+#endif
+
+#ifdef RTE_LIBRTE_VIRTIO_PMD
+ _(rte_virtio_driver)
+#endif
+
+#ifdef RTE_LIBRTE_VMXNET3_PMD
+ _(rte_vmxnet3_driver)
+#endif
+
+#ifdef RTE_LIBRTE_VICE_PMD
+ _(rte_vice_driver)
+#endif
+
+#ifdef RTE_LIBRTE_ENIC_PMD
+ _(rte_enic_driver)
+#endif
+
+#ifdef RTE_LIBRTE_PMD_AF_PACKET
+ _(pmd_af_packet_drv)
+#endif
+
+#undef _
+
+ dm->vlib_main = vm;
+ dm->vnet_main = vnet_get_main();
+
+ ei = vlib_get_node_by_name (vm, (u8 *) "ethernet-input");
+ if (ei == 0)
+ return clib_error_return (0, "ethernet-input node AWOL");
+
+ dm->ethernet_input_node_index = ei->index;
+
+ dm->nchannels = 4;
+ dm->num_mbufs = dm->num_mbufs ? dm->num_mbufs : NB_MBUF;
+ vec_add1 (dm->eal_init_args, (u8 *) "vnet");
+
+ dm->dpdk_device_by_kni_port_id = hash_create (0, sizeof (uword));
+ dm->vu_sw_if_index_by_listener_fd = hash_create (0, sizeof (uword));
+ dm->vu_sw_if_index_by_sock_fd = hash_create (0, sizeof (uword));
+
+ /* $$$ use n_thread_stacks since it's known-good at this point */
+ vec_validate (dm->recycle, tm->n_thread_stacks - 1);
+
+ /* initialize EFD (early fast discard) default settings */
+ dm->efd.enabled = DPDK_EFD_DISABLED;
+ dm->efd.queue_hi_thresh = ((DPDK_EFD_DEFAULT_DEVICE_QUEUE_HI_THRESH_PCT *
+ DPDK_NB_RX_DESC_10GE)/100);
+ dm->efd.consec_full_frames_hi_thresh =
+ DPDK_EFD_DEFAULT_CONSEC_FULL_FRAMES_HI_THRESH;
+
+  /* vhost-user coalesce frames/time defaults */
+ dm->vhost_coalesce_frames = 32;
+ dm->vhost_coalesce_time = 1e-3;
+
+ /* init CLI */
+ if ((error = vlib_call_init_function (vm, dpdk_cli_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (dpdk_init);
+
diff --git a/vnet/vnet/devices/dpdk/node.c b/vnet/vnet/devices/dpdk/node.c
new file mode 100644
index 00000000000..fde0eb23e14
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/node.c
@@ -0,0 +1,2010 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/xxhash.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/mpls-gre/packet.h>
+
+#include "dpdk_priv.h"
+
+#ifndef MAX
+#define MAX(a,b) ((a) < (b) ? (b) : (a))
+#endif
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+/*
+ * At least in certain versions of ESXi, vmware e1000's don't honor the
+ * "strip rx CRC" bit. Set this flag to work around that bug FOR UNIT TEST ONLY.
+ *
+ * If wireshark complains like so:
+ *
+ * "Frame check sequence: 0x00000000 [incorrect, should be <hex-num>]"
+ * and you're using ESXi emulated e1000's, set this flag FOR UNIT TEST ONLY.
+ *
+ * Note: do NOT check in this file with this workaround enabled! You'll lose
+ * actual data from e.g. 10xGE interfaces. The extra 4 bytes annoy
+ * wireshark, but they're harmless...
+ */
+#define VMWARE_LENGTH_BUG_WORKAROUND 0
+
+typedef struct {
+ u32 cached_next_index;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} handoff_dispatch_main_t;
+
+typedef struct {
+ u32 buffer_index;
+ u32 next_index;
+ u32 sw_if_index;
+} handoff_dispatch_trace_t;
+
+/* packet trace format function */
+static u8 * format_handoff_dispatch_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ handoff_dispatch_trace_t * t = va_arg (*args, handoff_dispatch_trace_t *);
+
+ s = format (s, "HANDOFF_DISPATCH: sw_if_index %d next_index %d buffer 0x%x",
+ t->sw_if_index,
+ t->next_index,
+ t->buffer_index);
+ return s;
+}
+
+handoff_dispatch_main_t handoff_dispatch_main;
+
+vlib_node_registration_t handoff_dispatch_node;
+
+#define foreach_handoff_dispatch_error \
+_(EXAMPLE, "example packets")
+
+typedef enum {
+#define _(sym,str) HANDOFF_DISPATCH_ERROR_##sym,
+ foreach_handoff_dispatch_error
+#undef _
+ HANDOFF_DISPATCH_N_ERROR,
+} handoff_dispatch_error_t;
+
+static char * handoff_dispatch_error_strings[] = {
+#define _(sym,string) string,
+ foreach_handoff_dispatch_error
+#undef _
+};
+
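+/*
+ * Publish a filled handoff element to its consumer: the memory
+ * barrier makes the frame contents globally visible before the valid
+ * flag is set, since the consuming thread spins on that flag.
+ */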
+static inline
+void vlib_put_handoff_queue_elt (vlib_frame_queue_elt_t * hf)
+{
+ CLIB_MEMORY_BARRIER();
+ hf->valid = 1;
+}
+
+static uword
+handoff_dispatch_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ dpdk_rx_next_t next_index;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ next0 = vnet_buffer(b0)->io_handoff.next_index;
+ next1 = vnet_buffer(b1)->io_handoff.next_index;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ handoff_dispatch_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->buffer_index = bi0;
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_buffer (vm, node, next1, b1, /* follow_chain */ 0);
+ handoff_dispatch_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ t->buffer_index = bi1;
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ next0 = vnet_buffer(b0)->io_handoff.next_index;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ handoff_dispatch_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->buffer_index = bi0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (handoff_dispatch_node) = {
+ .function = handoff_dispatch_node_fn,
+ .name = "handoff-dispatch",
+ .vector_size = sizeof (u32),
+ .format_trace = format_handoff_dispatch_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .flags = VLIB_NODE_FLAG_IS_HANDOFF,
+
+ .n_errors = ARRAY_LEN(handoff_dispatch_error_strings),
+ .error_strings = handoff_dispatch_error_strings,
+
+ .n_next_nodes = DPDK_RX_N_NEXT,
+
+ .next_nodes = {
+ [DPDK_RX_NEXT_DROP] = "error-drop",
+ [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input",
+ [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
+clib_error_t *handoff_dispatch_init (vlib_main_t *vm)
+{
+ handoff_dispatch_main_t * mp = &handoff_dispatch_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = &vnet_main;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (handoff_dispatch_init);
+
+u32 dpdk_get_handoff_node_index (void)
+{
+ return handoff_dispatch_node.index;
+}
+
+static char * dpdk_error_strings[] = {
+#define _(n,s) s,
+ foreach_dpdk_error
+#undef _
+};
+
+typedef struct {
+ u32 buffer_index;
+ u16 device_index;
+ u16 queue_index;
+ struct rte_mbuf mb;
+ vlib_buffer_t buffer; /* Copy of VLIB buffer; pkt data stored in pre_data. */
+} dpdk_rx_dma_trace_t;
+
+static u8 * format_dpdk_rx_dma_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
+ dpdk_rx_dma_trace_t * t = va_arg (*va, dpdk_rx_dma_trace_t *);
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, t->device_index);
+ format_function_t * f;
+ uword indent = format_get_indent (s);
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, xd->vlib_sw_if_index);
+
+ s = format (s, "%U rx queue %d",
+ format_vnet_sw_interface_name, vnm, sw,
+ t->queue_index);
+
+ s = format (s, "\n%Ubuffer 0x%x: %U",
+ format_white_space, indent,
+ t->buffer_index,
+ format_vlib_buffer, &t->buffer);
+
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_dpdk_rx_rte_mbuf, &t->mb);
+#else
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_dpdk_rte_mbuf, &t->mb);
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+ f = node->format_buffer;
+ if (!f)
+ f = format_hex_bytes;
+ s = format (s, "\n%U%U", format_white_space, indent,
+ f, t->buffer.pre_data, sizeof (t->buffer.pre_data));
+
+ return s;
+}
+
+always_inline void
+dpdk_rx_next_and_error_from_mb_flags_x1 (dpdk_device_t *xd, struct rte_mbuf *mb,
+ vlib_buffer_t *b0,
+ u8 * next0, u8 * error0)
+{
+ u8 is0_ip4, is0_ip6, is0_mpls, n0;
+ uint16_t mb_flags = mb->ol_flags;
+
+ if (PREDICT_FALSE(mb_flags & (
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ PKT_EXT_RX_PKT_ERROR | PKT_EXT_RX_BAD_FCS |
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+ PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
+ )))
+ {
+ /* some error was flagged. determine the drop reason */
+ n0 = DPDK_RX_NEXT_DROP;
+ *error0 =
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ (mb_flags & PKT_EXT_RX_PKT_ERROR) ? DPDK_ERROR_RX_PACKET_ERROR :
+ (mb_flags & PKT_EXT_RX_BAD_FCS) ? DPDK_ERROR_RX_BAD_FCS :
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+ (mb_flags & PKT_RX_IP_CKSUM_BAD) ? DPDK_ERROR_IP_CHECKSUM_ERROR :
+ (mb_flags & PKT_RX_L4_CKSUM_BAD) ? DPDK_ERROR_L4_CHECKSUM_ERROR :
+ DPDK_ERROR_NONE;
+ }
+ else
+ {
+ *error0 = DPDK_ERROR_NONE;
+ if (xd->per_interface_next_index != ~0)
+ n0 = xd->per_interface_next_index;
+ else if (mb_flags & PKT_RX_VLAN_PKT)
+ n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
+ else
+ {
+ n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
+ is0_ip4 = (mb->packet_type & (RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L3_IPV4_EXT)) != 0;
+#else
+ is0_ip4 = (mb_flags & (PKT_RX_IPV4_HDR | PKT_RX_IPV4_HDR_EXT)) != 0;
+#endif
+
+ if (PREDICT_TRUE(is0_ip4))
+ n0 = DPDK_RX_NEXT_IP4_INPUT;
+ else
+ {
+#if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
+ is0_ip6 =
+ (mb->packet_type & (RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L3_IPV6_EXT)) != 0;
+#else
+ is0_ip6 =
+ (mb_flags & (PKT_RX_IPV6_HDR | PKT_RX_IPV6_HDR_EXT)) != 0;
+#endif
+ if (PREDICT_TRUE(is0_ip6))
+ n0 = DPDK_RX_NEXT_IP6_INPUT;
+ else
+ {
+ ethernet_header_t *h0 = (ethernet_header_t *) b0->data;
+ is0_mpls = (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST));
+ n0 = is0_mpls ? DPDK_RX_NEXT_MPLS_INPUT : n0;
+ }
+ }
+ }
+ }
+ *next0 = n0;
+}
+
+void dpdk_rx_trace (dpdk_main_t * dm,
+ vlib_node_runtime_t * node,
+ dpdk_device_t * xd,
+ u16 queue_id,
+ u32 * buffers,
+ uword n_buffers)
+{
+ vlib_main_t * vm = vlib_get_main();
+ u32 * b, n_left;
+ u8 next0;
+
+ n_left = n_buffers;
+ b = buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ dpdk_rx_dma_trace_t * t0;
+ struct rte_mbuf *mb;
+ u8 error0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ mb = ((struct rte_mbuf *)b0) - 1;
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+ vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->queue_index = queue_id;
+ t0->device_index = xd->device_index;
+ t0->buffer_index = bi0;
+
+ memcpy (&t0->mb, mb, sizeof (t0->mb));
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));
+
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ b += 1;
+ }
+}
+
+/*
+ * dpdk_efd_update_counters()
+ * Update EFD (early-fast-discard) counters
+ */
+void dpdk_efd_update_counters (dpdk_device_t *xd,
+ u32 n_buffers,
+ u16 enabled)
+{
+ if (enabled & DPDK_EFD_MONITOR_ENABLED)
+ {
+ u64 now = clib_cpu_time_now();
+ if (xd->efd_agent.last_poll_time > 0)
+ {
+ u64 elapsed_time = (now - xd->efd_agent.last_poll_time);
+ if (elapsed_time > xd->efd_agent.max_poll_delay)
+ xd->efd_agent.max_poll_delay = elapsed_time;
+ }
+ xd->efd_agent.last_poll_time = now;
+ }
+
+ xd->efd_agent.total_packet_cnt += n_buffers;
+ xd->efd_agent.last_burst_sz = n_buffers;
+
+ if (n_buffers > xd->efd_agent.max_burst_sz)
+ xd->efd_agent.max_burst_sz = n_buffers;
+
+ if (PREDICT_FALSE(n_buffers == VLIB_FRAME_SIZE))
+ {
+ xd->efd_agent.full_frames_cnt++;
+ xd->efd_agent.consec_full_frames_cnt++;
+ }
+ else
+ {
+ xd->efd_agent.consec_full_frames_cnt = 0;
+ }
+}
+
+/* is_efd_discardable()
+ * Returns a non-zero DPDK error code if the packet meets the
+ * early-fast-discard criteria, zero otherwise.
+ */
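+/*
+ * Illustratively, an IPv4 packet with TOS 0xb8 (DSCP EF) has IP
+ * precedence (0xb8 >> 5) == 5, so it is dropped iff bit 5 is set in
+ * tm->efd.ip_prec_bitmap.
+ */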
+u32 is_efd_discardable (vlib_thread_main_t *tm,
+ vlib_buffer_t * b0,
+ struct rte_mbuf *mb)
+{
+ ethernet_header_t *eh = (ethernet_header_t *) b0->data;
+
+ if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))
+ {
+ ip4_header_t *ipv4 =
+ (ip4_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+ u8 pkt_prec = (ipv4->tos >> 5);
+
+ return (tm->efd.ip_prec_bitmap & (1 << pkt_prec) ?
+ DPDK_ERROR_IPV4_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+ else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_IP6))
+ {
+ ip6_header_t *ipv6 =
+ (ip6_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+      u8 pkt_tclass =
+          ((clib_net_to_host_u32(ipv6->ip_version_traffic_class_and_flow_label) >> 20) & 0xff);
+
+ return (tm->efd.ip_prec_bitmap & (1 << pkt_tclass) ?
+ DPDK_ERROR_IPV6_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+ else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_MPLS_UNICAST))
+ {
+ mpls_unicast_header_t *mpls =
+ (mpls_unicast_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+      u8 pkt_exp = ((clib_net_to_host_u32(mpls->label_exp_s_ttl) >> 9) & 0x07);
+
+ return (tm->efd.mpls_exp_bitmap & (1 << pkt_exp) ?
+ DPDK_ERROR_MPLS_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+ else if ((eh->type == clib_net_to_host_u16(ETHERNET_TYPE_VLAN)) ||
+ (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_DOT1AD)))
+ {
+ ethernet_vlan_header_t *vlan =
+ (ethernet_vlan_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
+      u8 pkt_cos = ((clib_net_to_host_u16(vlan->priority_cfi_and_id) >> 13) & 0x07);
+
+ return (tm->efd.vlan_cos_bitmap & (1 << pkt_cos) ?
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS : DPDK_ERROR_NONE);
+ }
+
+ return DPDK_ERROR_NONE;
+}
+
+/*
+ * This function is used when there are no worker threads.
+ * The main thread performs IO and forwards the packets.
+ */
+static inline u32 dpdk_device_input ( dpdk_main_t * dm,
+ dpdk_device_t * xd,
+ vlib_node_runtime_t * node,
+ u32 cpu_index,
+ u16 queue_id)
+{
+ u32 n_buffers;
+ u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
+ u32 n_left_to_next, * to_next;
+ u32 mb_index;
+ vlib_main_t * vm = vlib_get_main();
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__((unused));
+ vlib_buffer_free_list_t * fl;
+ u8 efd_discard_burst = 0;
+
+ if (xd->admin_up == 0)
+ return 0;
+
+ n_buffers = dpdk_rx_burst(dm, xd, queue_id);
+
+ if (n_buffers == 0)
+ {
+ /* check if EFD (dpdk) is enabled */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* reset a few stats */
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.last_burst_sz = 0;
+ }
+ return 0;
+ }
+
+ vec_reset_length (xd->d_trace_buffers);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node);
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (PREDICT_FALSE(xd->admin_up != 1))
+ {
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
+
+ return 0;
+ }
+
+ /* Check for congestion if EFD (Early-Fast-Discard) is enabled
+ * in any mode (e.g. dpdk, monitor, or drop_all)
+ */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* update EFD counters */
+ dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
+
+ if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
+ {
+ /* discard all received packets */
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
+
+ xd->efd_agent.discard_cnt += n_buffers;
+ increment_efd_drop_counter(vm,
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS,
+ n_buffers);
+
+ return 0;
+ }
+
+ if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
+ dm->efd.consec_full_frames_hi_thresh))
+ {
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ queue_id);
+ if (device_queue_sz >= dm->efd.queue_hi_thresh)
+ {
+ /* dpdk device queue has reached the critical threshold */
+ xd->efd_agent.congestion_cnt++;
+
+ /* apply EFD to packets from the burst */
+ efd_discard_burst = 1;
+ }
+ }
+ }
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ u32 bi0;
+ u8 next0, error0;
+ u32 l3_offset0;
+ vlib_buffer_t * b0, * b_seg, * b_chain = 0;
+ u32 cntr_type;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb_seg = mb->next;
+
+ if (PREDICT_TRUE(n_buffers > 2))
+ {
+ struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ ASSERT(mb);
+
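+	  /* the vlib_buffer_t is laid out immediately after the rte_mbuf
+	   * within the same mempool element, hence (mb+1) */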
+ b0 = (vlib_buffer_t *)(mb+1);
+
+ /* check whether EFD is looking for packets to discard */
+ if (PREDICT_FALSE(efd_discard_burst))
+ {
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
+ {
+ rte_pktmbuf_free(mb);
+ xd->efd_agent.discard_cnt++;
+ increment_efd_drop_counter(vm,
+ cntr_type,
+ 1);
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+
+ /* Prefetch one next segment if it exists. */
+ if (PREDICT_FALSE(mb->nb_segs > 1))
+ {
+ struct rte_mbuf *pfmb = mb->next;
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ b_chain = b0;
+ }
+
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+
+ to_next[0] = bi0;
+ to_next++;
+ n_left_to_next--;
+
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+
+ if (PREDICT_TRUE(trace_cnt == 0))
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+ else
+ trace_cnt--;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ b0->error = node->errors[error0];
+
+ l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
+ next0 == DPDK_RX_NEXT_IP6_INPUT ||
+ next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ b0->current_data = l3_offset0;
+ b0->current_length = mb->data_len - l3_offset0;
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (VMWARE_LENGTH_BUG_WORKAROUND)
+ b0->current_length -= 4;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ n_rx_bytes += mb->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT(mb_seg != 0);
+
+ b_seg = (vlib_buffer_t *)(mb_seg+1);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+ b_seg->clone_count = 0;
+
+	      ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b0->total_length_not_including_first_buffer +=
+ mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ if (PREDICT_FALSE (n_trace > mb_index))
+ vec_add1 (xd->d_trace_buffers, bi0);
+ n_buffers--;
+ mb_index++;
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
+ {
+ dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers,
+ vec_len (xd->d_trace_buffers));
+ vlib_set_trace_count (vm, node, n_trace - vec_len (xd->d_trace_buffers));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ xd->vlib_sw_if_index,
+ mb_index, n_rx_bytes);
+
+ dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+
+ return mb_index;
+}
+
+#if VIRL > 0
+#define VIRL_SPEED_LIMIT() \
+ /* Limit the input rate to 1000 vectors / sec */ \
+ { \
+ struct timespec ts, tsrem; \
+ \
+ ts.tv_sec = 0; \
+ ts.tv_nsec = 1000*1000; /* 1ms */ \
+ \
+ while (nanosleep(&ts, &tsrem) < 0) \
+ { \
+ ts = tsrem; \
+ } \
+ }
+#else
+#define VIRL_SPEED_LIMIT()
+#endif
+
+
+static uword
+dpdk_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword n_rx_packets = 0;
+ dpdk_device_and_queue_t * dq;
+ u32 cpu_index = os_get_cpu_number();
+
+ /*
+ * Poll all devices on this cpu for input/interrupts.
+ */
+ vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+ {
+ xd = vec_elt_at_index(dm->devices, dq->device);
+ ASSERT(dq->queue_id == 0);
+ n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, 0);
+ }
+
+ VIRL_SPEED_LIMIT()
+
+ return n_rx_packets;
+}
+
+uword
+dpdk_input_rss (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ uword n_rx_packets = 0;
+ dpdk_device_and_queue_t * dq;
+ u32 cpu_index = os_get_cpu_number();
+
+ /*
+ * Poll all devices on this cpu for input/interrupts.
+ */
+ vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+ {
+ xd = vec_elt_at_index(dm->devices, dq->device);
+ n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id);
+ }
+
+ VIRL_SPEED_LIMIT()
+
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (dpdk_input_node) = {
+ .function = dpdk_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+
+ .n_next_nodes = DPDK_RX_N_NEXT,
+ .next_nodes = {
+ [DPDK_RX_NEXT_DROP] = "error-drop",
+ [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+ [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
+/*
+ * Override the next nodes for the dpdk input nodes.
+ * Must be invoked prior to VLIB_INIT_FUNCTION calls.
+ */
+void dpdk_set_next_node (dpdk_rx_next_t next, char *name)
+{
+ vlib_node_registration_t *r = &dpdk_input_node;
+ vlib_node_registration_t *r_io = &dpdk_io_input_node;
+ vlib_node_registration_t *r_handoff = &handoff_dispatch_node;
+
+ switch (next)
+ {
+ case DPDK_RX_NEXT_IP4_INPUT:
+ case DPDK_RX_NEXT_IP6_INPUT:
+ case DPDK_RX_NEXT_MPLS_INPUT:
+ case DPDK_RX_NEXT_ETHERNET_INPUT:
+ r->next_nodes[next] = name;
+ r_io->next_nodes[next] = name;
+ r_handoff->next_nodes[next] = name;
+ break;
+
+ default:
+ clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
+ break;
+ }
+}
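+
+/*
+ * Usage sketch (hypothetical): a plugin init function could redirect
+ * IPv4 packets from the dpdk input nodes into its own graph node,
+ * assuming a node named "my-ip4-feature" has been registered.
+ */
+#if 0 /* illustration only */
+static void
+example_override_ip4_next (void)
+{
+  dpdk_set_next_node (DPDK_RX_NEXT_IP4_INPUT, "my-ip4-feature");
+}
+#endif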
+
+inline vlib_frame_queue_elt_t *
+vlib_get_handoff_queue_elt (u32 vlib_worker_index)
+{
+ vlib_frame_queue_t *fq;
+ vlib_frame_queue_elt_t *elt;
+ u64 new_tail;
+
+ fq = vlib_frame_queues[vlib_worker_index];
+ ASSERT (fq);
+
+ new_tail = __sync_add_and_fetch (&fq->tail, 1);
+
+ /* Wait until a ring slot is available */
+ while (new_tail >= fq->head_hint + fq->nelts)
+ vlib_worker_thread_barrier_check ();
+
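+  /* fq->nelts is a power of two, so masking the monotonically
+   * increasing tail with (nelts - 1) yields the ring slot index */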
+ elt = fq->elts + (new_tail & (fq->nelts-1));
+
+  /* this would be very bad... spin until the slot's previous
+   * occupant has been consumed */
+ while (elt->valid)
+ ;
+
+ elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME;
+ elt->last_n_vectors = elt->n_vectors = 0;
+
+ return elt;
+}
+
+inline vlib_frame_queue_elt_t *
+dpdk_get_handoff_queue_elt (
+ u32 vlib_worker_index,
+ vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index)
+{
+ vlib_frame_queue_elt_t *elt;
+
+ if (handoff_queue_elt_by_worker_index [vlib_worker_index])
+ return handoff_queue_elt_by_worker_index [vlib_worker_index];
+
+ elt = vlib_get_handoff_queue_elt (vlib_worker_index);
+
+ handoff_queue_elt_by_worker_index [vlib_worker_index] = elt;
+
+ return elt;
+}
+
+static inline vlib_frame_queue_t *
+is_vlib_handoff_queue_congested (
+ u32 vlib_worker_index,
+ u32 queue_hi_thresh,
+ vlib_frame_queue_t ** handoff_queue_by_worker_index)
+{
+ vlib_frame_queue_t *fq;
+
+ fq = handoff_queue_by_worker_index [vlib_worker_index];
+ if (fq != (vlib_frame_queue_t *)(~0))
+ return fq;
+
+ fq = vlib_frame_queues[vlib_worker_index];
+ ASSERT (fq);
+
+ if (PREDICT_FALSE(fq->tail >= (fq->head_hint + queue_hi_thresh))) {
+ /* a valid entry in the array will indicate the queue has reached
+ * the specified threshold and is congested
+ */
+ handoff_queue_by_worker_index [vlib_worker_index] = fq;
+ fq->enqueue_full_events++;
+ return fq;
+ }
+
+ return NULL;
+}
+
+static inline u64 ipv4_get_key (ip4_header_t *ip)
+{
+ u64 hash_key;
+
+ hash_key = *((u64*)(&ip->address_pair)) ^ ip->protocol;
+
+ return hash_key;
+}
+
+static inline u64 ipv6_get_key (ip6_header_t *ip)
+{
+ u64 hash_key;
+
+ hash_key = ip->src_address.as_u64[0] ^
+ ip->src_address.as_u64[1] ^
+ ip->dst_address.as_u64[0] ^
+ ip->dst_address.as_u64[1] ^
+ ip->protocol;
+
+ return hash_key;
+}
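+
+/*
+ * These key functions derive a compact 64-bit flow key which callers
+ * fold through clib_xxhash() and reduce modulo the worker count;
+ * key collisions only affect load distribution, never correctness.
+ */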
+
+
+#define MPLS_BOTTOM_OF_STACK_BIT_MASK 0x00000100U
+#define MPLS_LABEL_MASK 0xFFFFF000U
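+
+/*
+ * MPLS label stack entry layout (32 bits):
+ *   label (20 bits) | EXP (3 bits) | S, bottom-of-stack (1 bit) | TTL (8 bits)
+ * The masks above are defined on the host byte order view of the word;
+ * the field in the packet is in network byte order, hence the
+ * clib_net_to_host_u32() conversions below.
+ */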
+
+static inline u64 mpls_get_key (mpls_unicast_header_t *m)
+{
+ u64 hash_key;
+ u8 ip_ver;
+
+
+ /* find the bottom of the MPLS label stack. */
+ if (PREDICT_TRUE(m->label_exp_s_ttl &
+ clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (PREDICT_TRUE(m->label_exp_s_ttl &
+ clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
+ goto bottom_lbl_found;
+ }
+ m++;
+
+ if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
+ goto bottom_lbl_found;
+ }
+
+ /* the bottom label was not found - use the last label */
+ hash_key = m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
+
+ return hash_key;
+
+
+bottom_lbl_found:
+ m++;
+ ip_ver = (*((u8 *)m) >> 4);
+
+  /* find out whether it is an IPv4 or IPv6 header */
+ if (PREDICT_TRUE(ip_ver == 4)) {
+ hash_key = ipv4_get_key((ip4_header_t *)m);
+ } else if (PREDICT_TRUE(ip_ver == 6)) {
+ hash_key = ipv6_get_key((ip6_header_t *)m);
+ } else {
+ /* use the bottom label */
+ hash_key = (m-1)->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
+ }
+
+ return hash_key;
+
+}
+
+static inline u64 eth_get_key (ethernet_header_t *h0)
+{
+ u64 hash_key;
+
+
+  if (PREDICT_TRUE(h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))) {
+ hash_key = ipv4_get_key((ip4_header_t *)(h0+1));
+ } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP6)) {
+ hash_key = ipv6_get_key((ip6_header_t *)(h0+1));
+ } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
+ hash_key = mpls_get_key((mpls_unicast_header_t *)(h0+1));
+ } else if ((h0->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ||
+ (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_DOT1AD))) {
+ ethernet_vlan_header_t * outer = (ethernet_vlan_header_t *)(h0 + 1);
+
+ outer = (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ?
+ outer+1 : outer;
+    if (PREDICT_TRUE(outer->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))) {
+ hash_key = ipv4_get_key((ip4_header_t *)(outer+1));
+ } else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)) {
+ hash_key = ipv6_get_key((ip6_header_t *)(outer+1));
+ } else if (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
+ hash_key = mpls_get_key((mpls_unicast_header_t *)(outer+1));
+ } else {
+ hash_key = outer->type;
+ }
+ } else {
+ hash_key = 0;
+ }
+
+ return hash_key;
+}
+
+/*
+ * This function is used when dedicated IO threads feed the worker threads.
+ *
+ * Devices are allocated to this thread based on instances and instance_id.
+ * If instances == 0, the function automatically determines the number
+ * of instances of this thread and allocates devices among them.
+ * If instances != 0, then instance_id must be in the range 0..instances-1.
+ * The function allocates devices among the specified number of instances,
+ * with this thread having the given instance id. This option is used for
+ * splitting devices among differently named "io"-type threads.
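+ *
+ * Example (hypothetical): with 4 devices and instances == 2, the modulo
+ * rule used in the polling loop below assigns instance 0 ports {0, 2}
+ * and instance 1 ports {1, 3}.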
+ */
+void dpdk_io_thread (vlib_worker_thread_t * w,
+ u32 instances,
+ u32 instance_id,
+ char *worker_name,
+ dpdk_io_thread_callback_t callback)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_thread_registration_t * tr;
+ dpdk_main_t * dm = &dpdk_main;
+ char *io_name = w->registration->name;
+ dpdk_device_t * xd;
+ dpdk_device_t ** my_devices = 0;
+ vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index = 0;
+ vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
+ vlib_frame_queue_elt_t * hf = 0;
+ int i;
+ u32 n_left_to_next_worker = 0, * to_next_worker = 0;
+ u32 next_worker_index = 0;
+ u32 current_worker_index = ~0;
+ u32 cpu_index = os_get_cpu_number();
+ u32 num_workers = 0;
+ u32 num_devices = 0;
+ uword * p;
+ u16 queue_id = 0;
+ vlib_node_runtime_t * node_trace;
+ u32 first_worker_index = 0;
+
+ /* Wait until the dpdk init sequence is complete */
+ while (dm->io_thread_release == 0)
+ vlib_worker_thread_barrier_check();
+
+ clib_time_init (&vm->clib_time);
+
+ p = hash_get_mem (tm->thread_registrations_by_name, worker_name);
+ ASSERT (p);
+ tr = (vlib_thread_registration_t *) p[0];
+ if (tr)
+ {
+ num_workers = tr->count;
+ first_worker_index = tr->first_index;
+ }
+
+ /* Allocate devices to this thread */
+ if (instances == 0)
+ {
+ /* auto-assign */
+ instance_id = w->instance_id;
+
+ p = hash_get_mem (tm->thread_registrations_by_name, io_name);
+ tr = (vlib_thread_registration_t *) p[0];
+      /* Otherwise, how did we get here? */
+ ASSERT (tr && tr->count);
+ instances = tr->count;
+ }
+ else
+ {
+ /* manually assign */
+ ASSERT (instance_id < instances);
+ }
+
+ vec_validate (handoff_queue_elt_by_worker_index,
+ first_worker_index + num_workers - 1);
+
+ vec_validate_init_empty (congested_handoff_queue_by_worker_index,
+ first_worker_index + num_workers - 1,
+ (vlib_frame_queue_t *)(~0));
+
+ /* packet tracing is triggered on the dpdk-input node for ease-of-use */
+ node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
+
+ /* And handle them... */
+ while (1)
+ {
+ u32 n_buffers;
+ u32 mb_index;
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__((unused));
+ vlib_buffer_free_list_t * fl;
+ u32 hash;
+ u64 hash_key;
+ u8 efd_discard_burst;
+
+ vlib_worker_thread_barrier_check ();
+
+ /* Invoke callback if supplied */
+ if (PREDICT_FALSE(callback != NULL))
+ callback(vm);
+
+ if (PREDICT_FALSE(vec_len(dm->devices) != num_devices))
+ {
+ vec_reset_length(my_devices);
+ vec_foreach (xd, dm->devices)
+ {
+ if (((xd - dm->devices) % tr->count) == instance_id)
+ {
+ fprintf(stderr, "i/o thread %d (cpu %d) takes port %d\n",
+ instance_id, (int) os_get_cpu_number(), (int) (xd - dm->devices));
+ vec_add1 (my_devices, xd);
+ }
+ }
+ num_devices = vec_len(dm->devices);
+ }
+
+ for (i = 0; i < vec_len (my_devices); i++)
+ {
+ xd = my_devices[i];
+
+ if (!xd->admin_up)
+ continue;
+
+ n_buffers = dpdk_rx_burst(dm, xd, 0 /* queue_id */);
+
+ if (n_buffers == 0)
+ {
+ /* check if EFD (dpdk) is enabled */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* reset a few stats */
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.last_burst_sz = 0;
+ }
+ continue;
+ }
+
+ vec_reset_length (xd->d_trace_buffers);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (PREDICT_FALSE(xd->admin_up != 1))
+ {
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
+ continue;
+ }
+
+ /* reset EFD action for the burst */
+ efd_discard_burst = 0;
+
+ /* Check for congestion if EFD (Early-Fast-Discard) is enabled
+ * in any mode (e.g. dpdk, monitor, or drop_all)
+ */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* update EFD counters */
+ dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
+
+ if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
+ {
+ /* drop all received packets */
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
+
+ xd->efd_agent.discard_cnt += n_buffers;
+ increment_efd_drop_counter(vm,
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS,
+ n_buffers);
+
+ continue;
+ }
+
+ if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
+ dm->efd.consec_full_frames_hi_thresh))
+ {
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ queue_id);
+ if (device_queue_sz >= dm->efd.queue_hi_thresh)
+ {
+ /* dpdk device queue has reached the critical threshold */
+ xd->efd_agent.congestion_cnt++;
+
+ /* apply EFD to packets from the burst */
+ efd_discard_burst = 1;
+ }
+ }
+ }
+
+ fl = vlib_buffer_get_free_list
+ (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ u32 bi0;
+ u8 next0, error0;
+ u32 l3_offset0;
+ vlib_buffer_t * b0, * b_seg, * b_chain = 0;
+ ethernet_header_t * h0;
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb_seg = mb->next;
+
+ if (PREDICT_TRUE(n_buffers > 1))
+ {
+ struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ b0 = (vlib_buffer_t *)(mb+1);
+
+ /* check whether EFD is looking for packets to discard */
+ if (PREDICT_FALSE(efd_discard_burst))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
+ {
+ rte_pktmbuf_free(mb);
+ xd->efd_agent.discard_cnt++;
+ increment_efd_drop_counter(vm,
+ cntr_type,
+ 1);
+
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+
+ /* Prefetch one next segment if it exists */
+ if (PREDICT_FALSE(mb->nb_segs > 1))
+ {
+ struct rte_mbuf *pfmb = mb->next;
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ b_chain = b0;
+ }
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+ if (PREDICT_TRUE(trace_cnt == 0))
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+ else
+ trace_cnt--;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ if (error0)
+ clib_warning ("bi %d error %d", bi0, error0);
+
+ b0->error = 0;
+
+ l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
+ next0 == DPDK_RX_NEXT_IP6_INPUT ||
+ next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ b0->current_data = l3_offset0;
+ b0->current_length = mb->data_len - l3_offset0;
+
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (VMWARE_LENGTH_BUG_WORKAROUND)
+ b0->current_length -= 4;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ vnet_buffer(b0)->io_handoff.next_index = next0;
+ n_rx_bytes += mb->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT(mb_seg != 0);
+
+ b_seg = (vlib_buffer_t *)(mb_seg+1);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+ b_seg->clone_count = 0;
+
+ ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT(b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b0->total_length_not_including_first_buffer +=
+ mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ if (PREDICT_FALSE (n_trace > mb_index))
+ vec_add1 (xd->d_trace_buffers, bi0);
+
+ next_worker_index = first_worker_index;
+
+ /*
+ * Force unknown traffic onto worker 0,
+ * and into ethernet-input. $$$$ add more hashes.
+ */
+ h0 = (ethernet_header_t *) b0->data;
+
+ /* Compute ingress LB hash */
+ hash_key = eth_get_key(h0);
+ hash = (u32)clib_xxhash(hash_key);
+
+ if (PREDICT_TRUE (is_pow2(num_workers)))
+ next_worker_index += hash & (num_workers - 1);
+ else
+ next_worker_index += hash % num_workers;
+
+ /* if EFD is enabled and not already discarding from dpdk,
+ * check the worker ring/queue for congestion
+ */
+ if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
+ {
+ vlib_frame_queue_t *fq;
+
+ /* fq will be valid if the ring is congested */
+ fq = is_vlib_handoff_queue_congested(
+ next_worker_index, tm->efd.queue_hi_thresh,
+ congested_handoff_queue_by_worker_index);
+
+ if (PREDICT_FALSE(fq != NULL))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type =
+ is_efd_discardable(tm, b0, mb)))
+ {
+ /* discard the packet */
+ fq->enqueue_efd_discards++;
+ increment_efd_drop_counter(vm, cntr_type, 1);
+ rte_pktmbuf_free(mb);
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+ }
+
+ if (next_worker_index != current_worker_index)
+ {
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ hf = dpdk_get_handoff_queue_elt(
+ next_worker_index,
+ handoff_queue_elt_by_worker_index);
+
+ n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
+ to_next_worker = &hf->buffer_index[hf->n_vectors];
+ current_worker_index = next_worker_index;
+ }
+
+ /* enqueue to correct worker thread */
+ to_next_worker[0] = bi0;
+ to_next_worker++;
+ n_left_to_next_worker--;
+
+ if (n_left_to_next_worker == 0)
+ {
+ hf->n_vectors = VLIB_FRAME_SIZE;
+ vlib_put_handoff_queue_elt(hf);
+ current_worker_index = ~0;
+ handoff_queue_elt_by_worker_index[next_worker_index] = 0;
+ hf = 0;
+ }
+
+ n_buffers--;
+ mb_index++;
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
+ {
+ /* credit the trace to the trace node */
+ dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
+ vec_len (xd->d_trace_buffers));
+ vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ xd->vlib_sw_if_index,
+ mb_index, n_rx_bytes);
+
+ dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+ }
+
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ /* Ship frames to the worker nodes */
+ for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
+ {
+ if (handoff_queue_elt_by_worker_index[i])
+ {
+ hf = handoff_queue_elt_by_worker_index[i];
+ /*
+ * It works better to let the handoff node
+ * rate-adapt, always ship the handoff queue element.
+ */
+ if (1 || hf->n_vectors == hf->last_n_vectors)
+ {
+ vlib_put_handoff_queue_elt(hf);
+ handoff_queue_elt_by_worker_index[i] = 0;
+ }
+ else
+ hf->last_n_vectors = hf->n_vectors;
+ }
+ congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
+ }
+ hf = 0;
+ current_worker_index = ~0;
+
+ vlib_increment_main_loop_counter (vm);
+ }
+}
+
+/*
+ * This function is used when the main thread performs IO and feeds the
+ * worker threads.
+ */
+static uword
+dpdk_io_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ uword n_rx_packets = 0;
+ static vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index;
+ static vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
+ vlib_frame_queue_elt_t * hf = 0;
+ int i;
+ u32 n_left_to_next_worker = 0, * to_next_worker = 0;
+ u32 next_worker_index = 0;
+ u32 current_worker_index = ~0;
+ u32 cpu_index = os_get_cpu_number();
+ static int num_workers_set;
+ static u32 num_workers;
+ u16 queue_id = 0;
+ vlib_node_runtime_t * node_trace;
+ static u32 first_worker_index;
+
+ if (PREDICT_FALSE(num_workers_set == 0))
+ {
+ uword * p;
+ vlib_thread_registration_t * tr;
+ /* Only the standard vnet worker threads are supported */
+ p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+ tr = (vlib_thread_registration_t *) p[0];
+ if (tr)
+ {
+ num_workers = tr->count;
+ first_worker_index = tr->first_index;
+ }
+ num_workers_set = 1;
+ }
+
+ if (PREDICT_FALSE(handoff_queue_elt_by_worker_index == 0))
+ {
+ vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);
+
+ vec_validate_init_empty (congested_handoff_queue_by_worker_index,
+ first_worker_index + num_workers - 1,
+ (vlib_frame_queue_t *)(~0));
+ }
+
+ /* packet tracing is triggered on the dpdk-input node for ease-of-use */
+ node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
+
+ vec_foreach (xd, dm->devices)
+ {
+ u32 n_buffers;
+ u32 mb_index;
+ uword n_rx_bytes = 0;
+ u32 n_trace, trace_cnt __attribute__((unused));
+ vlib_buffer_free_list_t * fl;
+ u32 hash;
+ u64 hash_key;
+ u8 efd_discard_burst = 0;
+
+ if (!xd->admin_up)
+ continue;
+
+ n_buffers = dpdk_rx_burst(dm, xd, queue_id );
+
+ if (n_buffers == 0)
+ {
+ /* check if EFD (dpdk) is enabled */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* reset a few stats */
+ xd->efd_agent.last_poll_time = 0;
+ xd->efd_agent.last_burst_sz = 0;
+ }
+ continue;
+ }
+
+ vec_reset_length (xd->d_trace_buffers);
+ trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
+
+ /*
+ * DAW-FIXME: VMXNET3 device stop/start doesn't work,
+ * therefore fake the stop in the dpdk driver by
+ * silently dropping all of the incoming pkts instead of
+ * stopping the driver / hardware.
+ */
+ if (PREDICT_FALSE(xd->admin_up != 1))
+ {
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
+ continue;
+ }
+
+ /* Check for congestion if EFD (Early-Fast-Discard) is enabled
+ * in any mode (e.g. dpdk, monitor, or drop_all)
+ */
+ if (PREDICT_FALSE(dm->efd.enabled))
+ {
+ /* update EFD counters */
+ dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
+
+ if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DROPALL_ENABLED))
+ {
+ /* discard all received packets */
+ for (mb_index = 0; mb_index < n_buffers; mb_index++)
+ rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
+
+ xd->efd_agent.discard_cnt += n_buffers;
+ increment_efd_drop_counter(vm,
+ DPDK_ERROR_VLAN_EFD_DROP_PKTS,
+ n_buffers);
+
+ continue;
+ }
+
+ if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
+ dm->efd.consec_full_frames_hi_thresh))
+ {
+ u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
+ queue_id);
+ if (device_queue_sz >= dm->efd.queue_hi_thresh)
+ {
+ /* dpdk device queue has reached the critical threshold */
+ xd->efd_agent.congestion_cnt++;
+
+ /* apply EFD to packets from the burst */
+ efd_discard_burst = 1;
+ }
+ }
+ }
+
+ fl = vlib_buffer_get_free_list
+ (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ mb_index = 0;
+
+ while (n_buffers > 0)
+ {
+ u32 bi0;
+ u8 next0, error0;
+ u32 l3_offset0;
+ vlib_buffer_t * b0, * b_seg, * b_chain = 0;
+ ethernet_header_t * h0;
+ u8 nb_seg = 1;
+ struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
+ struct rte_mbuf *mb_seg = mb->next;
+
+ if (PREDICT_TRUE(n_buffers > 1))
+ {
+ struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ b0 = (vlib_buffer_t *)(mb+1);
+
+ /* check whether EFD is looking for packets to discard */
+ if (PREDICT_FALSE(efd_discard_burst))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
+ {
+ rte_pktmbuf_free(mb);
+ xd->efd_agent.discard_cnt++;
+ increment_efd_drop_counter(vm,
+ cntr_type,
+ 1);
+
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+
+ /* Prefetch one next segment if it exists */
+ if (PREDICT_FALSE(mb->nb_segs > 1))
+ {
+ struct rte_mbuf *pfmb = mb->next;
+ vlib_buffer_t *bp = (vlib_buffer_t *)(pfmb+1);
+ CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
+ b_chain = b0;
+ }
+
+ bi0 = vlib_get_buffer_index (vm, b0);
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
+ &next0, &error0);
+#ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
+ /*
+ * Clear overloaded TX offload flags when a DPDK driver
+ * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
+ */
+ if (PREDICT_TRUE(trace_cnt == 0))
+ mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
+ else
+ trace_cnt--;
+#endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
+
+ if (error0)
+ clib_warning ("bi %d error %d", bi0, error0);
+
+ b0->error = 0;
+
+ l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
+ next0 == DPDK_RX_NEXT_IP6_INPUT ||
+ next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ b0->current_data = l3_offset0;
+ b0->current_length = mb->data_len - l3_offset0;
+
+ b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ if (VMWARE_LENGTH_BUG_WORKAROUND)
+ b0->current_length -= 4;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ vnet_buffer(b0)->io_handoff.next_index = next0;
+ n_rx_bytes += mb->pkt_len;
+
+ /* Process subsequent segments of multi-segment packets */
+ while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
+ {
+ ASSERT(mb_seg != 0);
+
+ b_seg = (vlib_buffer_t *)(mb_seg+1);
+ vlib_buffer_init_for_free_list (b_seg, fl);
+ b_seg->clone_count = 0;
+
+ ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
+ ASSERT(b_seg->current_data == 0);
+
+ /*
+ * The driver (e.g. virtio) may not put the packet data at the start
+ * of the segment, so don't assume b_seg->current_data == 0 is correct.
+ */
+ b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
+
+ b_seg->current_length = mb_seg->data_len;
+ b0->total_length_not_including_first_buffer +=
+ mb_seg->data_len;
+
+ b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
+
+ b_chain = b_seg;
+ mb_seg = mb_seg->next;
+ nb_seg++;
+ }
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ if (PREDICT_FALSE (n_trace > mb_index))
+ vec_add1 (xd->d_trace_buffers, bi0);
+
+ next_worker_index = first_worker_index;
+
+ /*
+ * Force unknown traffic onto worker 0,
+ * and into ethernet-input. $$$$ add more hashes.
+ */
+ h0 = (ethernet_header_t *) b0->data;
+
+ /* Compute ingress LB hash */
+ hash_key = eth_get_key(h0);
+ hash = (u32)clib_xxhash(hash_key);
+
+ if (PREDICT_TRUE (is_pow2(num_workers)))
+ next_worker_index += hash & (num_workers - 1);
+ else
+ next_worker_index += hash % num_workers;
+
+ /* if EFD is enabled and not already discarding from dpdk,
+ * check the worker ring/queue for congestion
+ */
+ if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
+ {
+ vlib_frame_queue_t *fq;
+
+ /* fq will be valid if the ring is congested */
+ fq = is_vlib_handoff_queue_congested(
+ next_worker_index, tm->efd.queue_hi_thresh,
+ congested_handoff_queue_by_worker_index);
+
+ if (PREDICT_FALSE(fq != NULL))
+ {
+ u32 cntr_type;
+ if (PREDICT_TRUE(cntr_type =
+ is_efd_discardable(tm, b0, mb)))
+ {
+ /* discard the packet */
+ fq->enqueue_efd_discards++;
+ increment_efd_drop_counter(vm, cntr_type, 1);
+ rte_pktmbuf_free(mb);
+ n_buffers--;
+ mb_index++;
+ continue;
+ }
+ }
+ }
+
+ if (next_worker_index != current_worker_index)
+ {
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ hf = dpdk_get_handoff_queue_elt(
+ next_worker_index,
+ handoff_queue_elt_by_worker_index);
+
+ n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
+ to_next_worker = &hf->buffer_index[hf->n_vectors];
+ current_worker_index = next_worker_index;
+ }
+
+ /* enqueue to correct worker thread */
+ to_next_worker[0] = bi0;
+ to_next_worker++;
+ n_left_to_next_worker--;
+
+ if (n_left_to_next_worker == 0)
+ {
+ hf->n_vectors = VLIB_FRAME_SIZE;
+ vlib_put_handoff_queue_elt(hf);
+ current_worker_index = ~0;
+ handoff_queue_elt_by_worker_index[next_worker_index] = 0;
+ hf = 0;
+ }
+
+ n_buffers--;
+ mb_index++;
+ }
+
+ if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
+ {
+ /* credit the trace to the trace node */
+ dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
+ vec_len (xd->d_trace_buffers));
+ vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
+ }
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ xd->vlib_sw_if_index,
+ mb_index, n_rx_bytes);
+
+ dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
+ dw->aggregate_rx_packets += mb_index;
+ n_rx_packets += mb_index;
+ }
+
+ if (hf)
+ hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
+
+ /* Ship frames to the worker nodes */
+ for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
+ {
+ if (handoff_queue_elt_by_worker_index[i])
+ {
+ hf = handoff_queue_elt_by_worker_index[i];
+ /*
+ * It works better to let the handoff node
+ * rate-adapt, always ship the handoff queue element.
+ */
+ if (1 || hf->n_vectors == hf->last_n_vectors)
+ {
+ vlib_put_handoff_queue_elt(hf);
+ handoff_queue_elt_by_worker_index[i] = 0;
+ }
+ else
+ hf->last_n_vectors = hf->n_vectors;
+ }
+ congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
+ }
+ hf = 0;
+ current_worker_index = ~0;
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (dpdk_io_input_node) = {
+ .function = dpdk_io_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "dpdk-io-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_dpdk_rx_dma_trace,
+
+ .n_errors = DPDK_N_ERROR,
+ .error_strings = dpdk_error_strings,
+
+ .n_next_nodes = DPDK_RX_N_NEXT,
+ .next_nodes = {
+ [DPDK_RX_NEXT_DROP] = "error-drop",
+ [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+ [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
+/*
+ * set_efd_bitmap()
+ * Based on the operation type, set lower/upper bits for the given index value
+ */
+void
+set_efd_bitmap (u8 *bitmap, u32 value, u32 op)
+{
+ int ix;
+
+ *bitmap = 0;
+ for (ix = 0; ix < 8; ix++) {
+ if (((op == EFD_OPERATION_LESS_THAN) && (ix < value)) ||
+ ((op == EFD_OPERATION_GREATER_OR_EQUAL) && (ix >= value))){
+ (*bitmap) |= (1 << ix);
+ }
+ }
+}
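+
+/*
+ * Example: set_efd_bitmap (&bm, 5, EFD_OPERATION_GREATER_OR_EQUAL)
+ * sets bits 5..7 (bm == 0xe0), so precedence/EXP/CoS values of 5, 6
+ * and 7 are marked for discard.
+ */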
+
+void
+efd_config (u32 enabled,
+ u32 ip_prec, u32 ip_op,
+ u32 mpls_exp, u32 mpls_op,
+ u32 vlan_cos, u32 vlan_op)
+{
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ dpdk_main_t * dm = &dpdk_main;
+
+ if (enabled) {
+ tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED;
+ dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED;
+ } else {
+ tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED;
+ dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED;
+ }
+
+ set_efd_bitmap(&tm->efd.ip_prec_bitmap, ip_prec, ip_op);
+ set_efd_bitmap(&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op);
+ set_efd_bitmap(&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op);
+
+}
diff --git a/vnet/vnet/devices/dpdk/threads.c b/vnet/vnet/devices/dpdk/threads.c
new file mode 100644
index 00000000000..aa32f1007c3
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/threads.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <signal.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vlibmemory/api.h>
+#include <vlibmemory/vl_memory_msg_enum.h> /* enumerate all vlib messages */
+
+#define vl_typedefs /* define message structures */
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_typedefs
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
+#define vl_printfun
+#include <vlibmemory/vl_memory_api_h.h>
+#undef vl_printfun
+
+vlib_thread_main_t vlib_thread_main;
+
+frame_queue_trace_t *frame_queue_traces;
+
+/*
+ * Check the frame queue to see if any frames are available.
+ * If so, pull the packets off the frames and pass them to
+ * the handoff node.
+ */
+static inline int vlib_frame_queue_dequeue_internal (vlib_main_t *vm)
+{
+ u32 thread_id = vm->cpu_index;
+ vlib_frame_queue_t *fq = vlib_frame_queues[thread_id];
+ vlib_frame_queue_elt_t *elt;
+ u32 * from, * to;
+ vlib_frame_t * f;
+ int msg_type;
+ int processed = 0;
+ u32 n_left_to_node;
+ u32 vectors = 0;
+
+ ASSERT (fq);
+ ASSERT(vm == vlib_mains[thread_id]);
+
+ /*
+ * Gather trace data for frame queues
+ */
+ if (PREDICT_FALSE(fq->trace))
+ {
+ frame_queue_trace_t *fqt;
+ u32 elix;
+
+ fqt = &frame_queue_traces[thread_id];
+ fqt->nelts = fq->nelts;
+ fqt->head = fq->head;
+ fqt->head_hint = fq->head_hint;
+ fqt->tail = fq->tail;
+ fqt->threshold = fq->vector_threshold;
+ fqt->n_in_use = fqt->tail - fqt->head;
+ if (fqt->n_in_use > fqt->nelts){
+ fqt->n_in_use = 0;
+ }
+
+ for (elix=0; elix<fqt->nelts; elix++) {
+ elt = fq->elts + ((fq->head+1 + elix) & (fq->nelts-1));
+ if (1 || elt->valid)
+ {
+ fqt->n_vectors[elix] = elt->n_vectors;
+ }
+ }
+ fqt->written = 1;
+ }
+
+ while (1)
+ {
+ if (fq->head == fq->tail)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+
+ elt = fq->elts + ((fq->head+1) & (fq->nelts-1));
+
+ if (!elt->valid)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+
+ from = elt->buffer_index;
+ msg_type = elt->msg_type;
+
+ ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME);
+ ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE);
+
+ f = vlib_get_frame_to_node
+ (vm, 1 ? handoff_dispatch_node.index : ethernet_input_node.index);
+
+ to = vlib_frame_vector_args (f);
+
+ n_left_to_node = elt->n_vectors;
+
+ while (n_left_to_node >= 4)
+ {
+ to[0] = from[0];
+ to[1] = from[1];
+ to[2] = from[2];
+ to[3] = from[3];
+ to += 4;
+ from += 4;
+ n_left_to_node -= 4;
+ }
+
+ while (n_left_to_node > 0)
+ {
+ to[0] = from[0];
+ to++;
+ from++;
+ n_left_to_node--;
+ }
+
+ vectors += elt->n_vectors;
+ f->n_vectors = elt->n_vectors;
+ vlib_put_frame_to_node
+ (vm, 1 ? handoff_dispatch_node.index : ethernet_input_node.index, f);
+
+ elt->valid = 0;
+ elt->n_vectors = 0;
+ elt->msg_type = 0xfefefefe;
+ CLIB_MEMORY_BARRIER();
+ fq->head++;
+ processed++;
+
+ /*
+ * Limit the number of packets pushed into the graph
+ */
+ if (vectors >= fq->vector_threshold)
+ {
+ fq->head_hint = fq->head;
+ return processed;
+ }
+ }
+ ASSERT(0);
+ return processed;
+}
+
+int dpdk_frame_queue_dequeue (vlib_main_t *vm)
+{
+ return vlib_frame_queue_dequeue_internal (vm);
+}
+
+/*
+ * dpdk_worker_thread - Contains the main loop of a worker thread.
+ *
+ * w
+ * Information for the current thread
+ * io_name
+ * The name of the thread performing dpdk device IO (if any). If there are no
+ * instances of that thread, then the current thread will do dpdk device
+ * polling. Ports will be divided among instances of the current thread.
+ * callback
+ * If not null, this function will be called once during each main loop.
+ */
+static_always_inline void
+dpdk_worker_thread_internal (vlib_main_t *vm,
+ dpdk_worker_thread_callback_t callback,
+ int have_io_threads)
+{
+ vlib_node_main_t * nm = &vm->node_main;
+ u64 cpu_time_now = clib_cpu_time_now ();
+
+ while (1)
+ {
+ vlib_worker_thread_barrier_check ();
+
+ vlib_frame_queue_dequeue_internal (vm);
+
+ /* Invoke callback if supplied */
+ if (PREDICT_FALSE(callback != NULL))
+ callback(vm);
+
+ if (!have_io_threads)
+ {
+ vlib_node_runtime_t * n;
+ vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+ {
+ cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
+ VLIB_NODE_STATE_POLLING, /* frame */ 0,
+ cpu_time_now);
+ }
+
+ }
+
+ if (_vec_len (nm->pending_frames))
+ {
+ int i;
+ cpu_time_now = clib_cpu_time_now ();
+ for (i = 0; i < _vec_len (nm->pending_frames); i++) {
+ vlib_pending_frame_t *p;
+
+ p = nm->pending_frames + i;
+
+ cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now);
+ }
+ _vec_len (nm->pending_frames) = 0;
+ }
+ vlib_increment_main_loop_counter (vm);
+
+ /* Record time stamp in case there are no enabled nodes and above
+ calls do not update time stamp. */
+ cpu_time_now = clib_cpu_time_now ();
+ }
+}
+
+void dpdk_worker_thread (vlib_worker_thread_t * w,
+ char *io_name,
+ dpdk_worker_thread_callback_t callback)
+{
+ vlib_main_t *vm;
+ uword * p;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vlib_thread_registration_t * tr;
+ dpdk_main_t * dm = &dpdk_main;
+
+ vm = vlib_get_main();
+
+ ASSERT(vm->cpu_index == os_get_cpu_number());
+
+ clib_time_init (&vm->clib_time);
+ clib_mem_set_heap (w->thread_mheap);
+
+ /* Wait until the dpdk init sequence is complete */
+ while (dm->io_thread_release == 0)
+ vlib_worker_thread_barrier_check ();
+
+ /* any I/O threads? */
+ p = hash_get_mem (tm->thread_registrations_by_name, io_name);
+ tr = (vlib_thread_registration_t *)p[0];
+
+ if (tr && tr->count > 0)
+ dpdk_worker_thread_internal(vm, callback, /* have_io_threads */ 1);
+ else
+ dpdk_worker_thread_internal(vm, callback, /* have_io_threads */ 0);
+}
+
+void dpdk_worker_thread_fn (void * arg)
+{
+ vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
+ vlib_worker_thread_init (w);
+ dpdk_worker_thread (w, "io", 0);
+}
+
+#if VIRL == 0
+VLIB_REGISTER_THREAD (worker_thread_reg, static) = {
+ .name = "workers",
+ .short_name = "wk",
+ .function = dpdk_worker_thread_fn,
+ .mheap_size = 256<<20,
+};
+#endif
+
+void dpdk_io_thread_fn (void * arg)
+{
+ vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
+ vlib_worker_thread_init (w);
+ dpdk_io_thread (w, 0, 0, "workers", 0);
+}
+
+#if VIRL == 0
+VLIB_REGISTER_THREAD (io_thread_reg, static) = {
+ .name = "io",
+ .short_name = "io",
+ .function = dpdk_io_thread_fn,
+ .mheap_size = 256<<20,
+};
+#endif
+
+static void vl_api_rpc_call_t_handler (vl_api_rpc_call_t * mp)
+{
+ vl_api_rpc_reply_t * rmp;
+ int (*fp)(void *);
+ i32 rv = 0;
+ vlib_main_t * vm = vlib_get_main();
+
+ if (mp->function == 0)
+ {
+ rv = -1;
+ clib_warning ("rpc NULL function pointer");
+ }
+
+ else
+ {
+ if (mp->need_barrier_sync)
+ vlib_worker_thread_barrier_sync (vm);
+
+ fp = (void *)(mp->function);
+ rv = (*fp)(mp->data);
+
+ if (mp->need_barrier_sync)
+ vlib_worker_thread_barrier_release (vm);
+ }
+
+ if (mp->send_reply)
+ {
+ unix_shared_memory_queue_t * q =
+ vl_api_client_index_to_input_queue (mp->client_index);
+ if (q)
+ {
+ rmp = vl_msg_api_alloc_as_if_client (sizeof (*rmp));
+ rmp->_vl_msg_id = ntohs (VL_API_RPC_REPLY);
+ rmp->context = mp->context;
+ rmp->retval = rv;
+ vl_msg_api_send_shmem (q, (u8 *)&rmp);
+ }
+ }
+ if (mp->multicast)
+ {
+ clib_warning ("multicast not yet implemented...");
+ }
+}
+
+static void vl_api_rpc_reply_t_handler (vl_api_rpc_reply_t * mp)
+{ clib_warning ("unimplemented"); }
+
+void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length)
+{
+ vl_api_rpc_call_t * mp;
+ api_main_t *am = &api_main;
+ vl_shmem_hdr_t *shmem_hdr = am->shmem_hdr;
+
+ mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + data_length);
+ memset (mp, 0, sizeof (*mp));
+ memcpy (mp->data, data, data_length);
+ mp->_vl_msg_id = ntohs (VL_API_RPC_CALL);
+ mp->function = (u64)fp;
+ mp->need_barrier_sync = 1;
+
+ /* Use the "normal" control-plane mechanism for the main thread */
+ vl_msg_api_send_shmem (shmem_hdr->vl_input_queue, (u8 *)&mp);
+}
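+
+/*
+ * Usage sketch (hypothetical): have a worker thread run a function on
+ * the main thread. The data blob is copied into the message; the callee
+ * must match the int (*)(void *) signature dispatched by
+ * vl_api_rpc_call_t_handler above.
+ */
+#if 0 /* illustration only */
+static int
+example_main_thread_fn (void *data)
+{
+  clib_warning ("main thread got: %s", (char *) data);
+  return 0;
+}
+
+static void
+example_rpc_from_worker (void)
+{
+  vl_api_rpc_call_main_thread (example_main_thread_fn,
+                               (u8 *) "hello", 6);
+}
+#endif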
+
+
+#define foreach_rpc_api_msg \
+_(RPC_CALL,rpc_call) \
+_(RPC_REPLY,rpc_reply)
+
+static clib_error_t *
+rpc_api_hookup (vlib_main_t *vm)
+{
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 0 /* do not trace */);
+ foreach_rpc_api_msg;
+#undef _
+ return 0;
+}
+
+VLIB_API_INIT_FUNCTION(rpc_api_hookup);
diff --git a/vnet/vnet/devices/dpdk/threads.h b/vnet/vnet/devices/dpdk/threads.h
new file mode 100644
index 00000000000..8f0fcbdb465
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/threads.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_dpdk_threads_h__
+#define __included_dpdk_threads_h__
+
+#include <vnet/vnet.h>
+
+void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length);
+
+typedef void (*dpdk_worker_thread_callback_t) (vlib_main_t *vm);
+
+void dpdk_worker_thread (vlib_worker_thread_t * w,
+ char *io_name,
+ dpdk_worker_thread_callback_t callback);
+
+int dpdk_frame_queue_dequeue (vlib_main_t *vm);
+
+#endif /* __included_dpdk_threads_h__ */
diff --git a/vnet/vnet/devices/dpdk/vhost_user.c b/vnet/vnet/devices/dpdk/vhost_user.c
new file mode 100644
index 00000000000..5ab4c22ed3e
--- /dev/null
+++ b/vnet/vnet/devices/dpdk/vhost_user.c
@@ -0,0 +1,1550 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <vnet/devices/virtio/vhost-user.h>
+
+#define VHOST_USER_DEBUG_SOCKET 0
+
+#if VHOST_USER_DEBUG_SOCKET == 1
+#define DBG_SOCK(args...) clib_warning(args);
+#else
+#define DBG_SOCK(args...)
+#endif
+
+/*
+ * DPDK vhost-user functions
+ */
+
+/* portions taken from dpdk
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
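+/*
+ * Translate a qemu process virtual address into an address valid in
+ * this process: locate the guest memory region containing the address
+ * and apply that region's mapping offset.
+ */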
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
+{
+ struct virtio_memory_regions *region;
+ uint64_t vhost_va = 0;
+ uint32_t regionidx = 0;
+
+ /* Find the region where the address lives. */
+ for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+ region = &dev->mem->regions[regionidx];
+ if ((qemu_va >= region->userspace_address) &&
+ (qemu_va <= region->userspace_address +
+ region->memory_size)) {
+ vhost_va = qemu_va + region->guest_phys_address +
+ region->address_offset -
+ region->userspace_address;
+ break;
+ }
+ }
+ return vhost_va;
+}
+
+static dpdk_device_t *
+dpdk_vhost_user_device_from_hw_if_index(u32 hw_if_index)
+{
+ vnet_main_t *vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ dpdk_device_t * xd = vec_elt_at_index (dm->devices, hi->dev_instance);
+
+ if (xd->dev_type != VNET_DPDK_DEV_VHOST_USER)
+ return 0;
+
+ return xd;
+}
+
+static dpdk_device_t *
+dpdk_vhost_user_device_from_sw_if_index(u32 sw_if_index)
+{
+ vnet_main_t *vnm = vnet_get_main();
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, sw_if_index);
+ ASSERT (sw->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+
+ return dpdk_vhost_user_device_from_hw_if_index(sw->hw_if_index);
+}
+
+static inline void * map_guest_mem(dpdk_device_t * xd, u64 addr)
+{
+ dpdk_vu_intf_t * vui = xd->vu_intf;
+ struct virtio_memory * mem = xd->vu_vhost_dev.mem;
+ int i;
+ for (i=0; i<mem->nregions; i++) {
+ if ((mem->regions[i].guest_phys_address <= addr) &&
+ ((mem->regions[i].guest_phys_address + mem->regions[i].memory_size) > addr)) {
+ return (void *) (vui->region_addr[i] + addr - mem->regions[i].guest_phys_address);
+ }
+ }
+ DBG_SOCK("failed to map guest mem addr %llx", addr);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_create_vhost_user_if_internal (u32 * hw_if_index, u32 if_id)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_main_t * vm = vlib_get_main();
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ vnet_sw_interface_t * sw;
+ clib_error_t * error;
+ dpdk_device_and_queue_t * dq;
+
+ dpdk_device_t * xd = NULL;
+ u8 addr[6];
+ int j;
+
+ vlib_worker_thread_barrier_sync (vm);
+
+ int inactive_cnt = vec_len(dm->vu_inactive_interfaces_device_index);
+ // if there are any inactive ifaces
+ if (inactive_cnt > 0) {
+ // take last
+ u32 vui_idx = dm->vu_inactive_interfaces_device_index[inactive_cnt - 1];
+ if (vec_len(dm->devices) > vui_idx) {
+ xd = vec_elt_at_index (dm->devices, vui_idx);
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER) {
+ DBG_SOCK("reusing inactive vhost-user interface sw_if_index %d", xd->vlib_sw_if_index);
+ } else {
+ clib_warning("error: inactive vhost-user interface sw_if_index %d not VHOST_USER type!",
+ xd->vlib_sw_if_index);
+ // reset so new interface is created
+ xd = NULL;
+ }
+ }
+ // "remove" from inactive list
+ _vec_len(dm->vu_inactive_interfaces_device_index) -= 1;
+ }
+
+ if (xd) {
+ // existing interface used - do not overwrite if_id if not needed
+ if (if_id != (u32)~0)
+ xd->vu_if_id = if_id;
+
+ // reset virtqueues
+ for (j = 0; j < VIRTIO_QNUM; j++)
+ {
+ memset(xd->vu_vhost_dev.virtqueue[j], 0, sizeof(struct vhost_virtqueue));
+ }
+ // reset lockp
+ memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES);
+
+ // reset tx vectors
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ // reset rx vector
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+ } else {
+ // vui was not retrieved from inactive ifaces - create new
+ vec_add2_aligned (dm->devices, xd, 1, CLIB_CACHE_LINE_BYTES);
+ xd->dev_type = VNET_DPDK_DEV_VHOST_USER;
+ xd->rx_q_used = 1;
+ vec_validate_aligned (xd->rx_vectors, xd->rx_q_used, CLIB_CACHE_LINE_BYTES);
+
+ if (if_id == (u32)~0)
+ xd->vu_if_id = dm->next_vu_if_id++;
+ else
+ xd->vu_if_id = if_id;
+
+ xd->device_index = xd - dm->devices;
+ xd->per_interface_next_index = ~0;
+ xd->vu_intf = NULL;
+
+ xd->vu_vhost_dev.mem = clib_mem_alloc (sizeof(struct virtio_memory) +
+ VHOST_MEMORY_MAX_NREGIONS *
+ sizeof(struct virtio_memory_regions));
+
+ for (j = 0; j < VIRTIO_QNUM; j++)
+ {
+ xd->vu_vhost_dev.virtqueue[j] = clib_mem_alloc (sizeof(struct vhost_virtqueue));
+ memset(xd->vu_vhost_dev.virtqueue[j], 0, sizeof(struct vhost_virtqueue));
+ }
+
+ xd->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) xd->lockp, 0, CLIB_CACHE_LINE_BYTES);
+
+ vec_validate_aligned (xd->tx_vectors, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ for (j = 0; j < tm->n_vlib_mains; j++)
+ {
+ vec_validate_ha (xd->tx_vectors[j], DPDK_TX_RING_SIZE,
+ sizeof(tx_ring_hdr_t), CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->tx_vectors[j]);
+ }
+
+ // reset rx vector
+ for (j = 0; j < xd->rx_q_used; j++)
+ {
+ vec_validate_aligned (xd->rx_vectors[j], VLIB_FRAME_SIZE-1,
+ CLIB_CACHE_LINE_BYTES);
+ vec_reset_length (xd->rx_vectors[j]);
+ }
+
+ vec_validate_aligned (xd->frames, tm->n_vlib_mains,
+ CLIB_CACHE_LINE_BYTES);
+
+ }
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (addr+2, &rnd, sizeof(rnd));
+ addr[0] = 2;
+ addr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (dm->vnet_main,
+ dpdk_device_class.index,
+ xd->device_index,
+ /* ethernet address */ addr,
+ &xd->vlib_hw_if_index,
+ 0);
+
+ if (error)
+ return error;
+
+ sw = vnet_get_hw_sw_interface (dm->vnet_main, xd->vlib_hw_if_index);
+ xd->vlib_sw_if_index = sw->sw_if_index;
+
+ if (!xd->vu_intf)
+ xd->vu_intf = clib_mem_alloc (sizeof(*(xd->vu_intf)));
+
+ *hw_if_index = xd->vlib_hw_if_index;
+
+ int cpu = (xd->device_index % dm->input_cpu_count) +
+ dm->input_cpu_first_index;
+
+ vec_add2(dm->devices_by_cpu[cpu], dq, 1);
+ dq->device = xd->device_index;
+ dq->queue_id = 0;
+
+ // start polling if it was not started yet (because of no phys ifaces)
+ if (tm->n_vlib_mains == 1 && dpdk_input_node.state != VLIB_NODE_STATE_POLLING)
+ vlib_node_set_state (vm, dpdk_input_node.index, VLIB_NODE_STATE_POLLING);
+
+ if (tm->n_vlib_mains > 1 && tm->main_thread_is_io_node)
+ vlib_node_set_state (vm, dpdk_io_input_node.index, VLIB_NODE_STATE_POLLING);
+
+ if (tm->n_vlib_mains > 1 && !tm->main_thread_is_io_node)
+ vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ vlib_worker_thread_barrier_release (vm);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_get_features(u32 hw_if_index, u64 * features)
+{
+ *features = rte_vhost_feature_get();
+
+ DBG_SOCK("supported features: 0x%x", *features);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_features(u32 hw_if_index, u64 features)
+{
+ dpdk_device_t * xd;
+ u16 hdr_len = sizeof(struct virtio_net_hdr);
+
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ xd->vu_vhost_dev.features = features;
+
+ if (xd->vu_vhost_dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+ hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+ xd->vu_vhost_dev.virtqueue[VIRTIO_RXQ]->vhost_hlen = hdr_len;
+ xd->vu_vhost_dev.virtqueue[VIRTIO_TXQ]->vhost_hlen = hdr_len;
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_mem_table(u32 hw_if_index, vhost_user_memory_t * vum, int fd[])
+{
+ struct virtio_memory * mem;
+ int i;
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui;
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vui = xd->vu_intf;
+ mem = xd->vu_vhost_dev.mem;
+
+ mem->nregions = vum->nregions;
+
+ for (i=0; i < mem->nregions; i++) {
+ u64 mapped_size, mapped_address;
+
+ mem->regions[i].guest_phys_address = vum->regions[i].guest_phys_addr;
+ mem->regions[i].guest_phys_address_end = vum->regions[i].guest_phys_addr +
+ vum->regions[i].memory_size;
+ mem->regions[i].memory_size = vum->regions[i].memory_size;
+ mem->regions[i].userspace_address = vum->regions[i].userspace_addr;
+
+ mapped_size = mem->regions[i].memory_size + vum->regions[i].mmap_offset;
+ mapped_address = (uint64_t)(uintptr_t)mmap(NULL, mapped_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd[i], 0);
+
+ if ((void *)mapped_address == MAP_FAILED)
+ {
+ clib_warning("mmap error");
+ return 0;
+ }
+
+ mapped_address += vum->regions[i].mmap_offset;
+ vui->region_addr[i] = mapped_address;
+ vui->region_fd[i] = fd[i];
+ mem->regions[i].address_offset = mapped_address - mem->regions[i].guest_phys_address;
+
+ if (vum->regions[i].guest_phys_addr == 0) {
+ mem->base_address = vum->regions[i].userspace_addr;
+ mem->mapped_address = mem->regions[i].address_offset;
+ }
+ }
+
+ xd->vu_is_running = 0;
+
+ DBG_SOCK("done");
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_num(u32 hw_if_index, u8 idx, u32 num)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u num %u", idx, num);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ vq->size = num;
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_addr(u32 hw_if_index, u8 idx, u64 desc, u64 used, u64 avail)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u desc 0x%x used 0x%x avail 0x%x", idx, desc, used, avail);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+
+ vq->desc = (struct vring_desc *) qva_to_vva(&xd->vu_vhost_dev, desc);
+ vq->used = (struct vring_used *) qva_to_vva(&xd->vu_vhost_dev, used);
+ vq->avail = (struct vring_avail *) qva_to_vva(&xd->vu_vhost_dev, avail);
+
+ if (!(vq->desc && vq->used && vq->avail)) {
+ clib_warning("falied to set vring addr");
+ }
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_get_vring_base(u32 hw_if_index, u8 idx, u32 * num)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ *num = vq->last_used_idx;
+
+ DBG_SOCK("idx %u num %u", idx, *num);
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_base(u32 hw_if_index, u8 idx, u32 num)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u num %u", idx, num);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ vq->last_used_idx = num;
+ vq->last_used_idx_res = num;
+
+ xd->vu_is_running = 0;
+
+ return 0;
+}
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_kick(u32 hw_if_index, u8 idx, int fd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq, *vq0, *vq1;
+
+ DBG_SOCK("idx %u fd %d", idx, fd);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ vq->kickfd = fd;
+
+ vq0 = xd->vu_vhost_dev.virtqueue[0];
+ vq1 = xd->vu_vhost_dev.virtqueue[1];
+
+ if (vq0->desc && vq0->avail && vq0->used &&
+ vq1->desc && vq1->avail && vq1->used) {
+ xd->vu_is_running = 1;
+ if (xd->admin_up)
+ vnet_hw_interface_set_flags (dm->vnet_main, xd->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP |
+ ETH_LINK_FULL_DUPLEX );
+ }
+
+ return 0;
+}
+
+
+static clib_error_t *
+dpdk_vhost_user_set_vring_call(u32 hw_if_index, u8 idx, int fd)
+{
+ dpdk_device_t * xd;
+ struct vhost_virtqueue *vq;
+
+ DBG_SOCK("idx %u fd %d", idx, fd);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_index))) {
+ clib_warning("not a vhost-user interface");
+ return 0;
+ }
+
+ vq = xd->vu_vhost_dev.virtqueue[idx];
+ /* reset callfd to force no interrupts */
+ vq->callfd = -1;
+
+ return 0;
+}
+
+u8
+dpdk_vhost_user_want_interrupt(dpdk_device_t *xd, int idx)
+{
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ ASSERT(vui != NULL);
+
+ if (PREDICT_FALSE(vui->num_vrings <= 0))
+ return 0;
+
+ dpdk_vu_vring *vring = &(vui->vrings[idx]);
+ struct vhost_virtqueue *vq = xd->vu_vhost_dev.virtqueue[idx];
+
+ /* return 1 if the VM wants an interrupt for this vring */
+ return (vring->callfd > 0) && !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+}
+
+void
+dpdk_vhost_user_send_interrupt(vlib_main_t * vm, dpdk_device_t * xd, int idx)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ ASSERT(vui != NULL);
+
+ if (PREDICT_FALSE(vui->num_vrings <= 0))
+ return;
+
+ dpdk_vu_vring *vring = &(vui->vrings[idx]);
+ struct vhost_virtqueue *vq = xd->vu_vhost_dev.virtqueue[idx];
+
+ /* if vm is interested in interrupts */
+ if((vring->callfd > 0) && !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+ u64 x = 1;
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = write(vring->callfd, &x, sizeof(x));
+ vring->n_since_last_int = 0;
+ vring->int_deadline = vlib_time_now(vm) + dm->vhost_coalesce_time;
+ }
+}
+
+/*
+ * vhost-user interface management functions
+ */
+
+// initialize vui with specified attributes
+static void
+dpdk_vhost_user_vui_init(vnet_main_t * vnm,
+ dpdk_device_t *xd, int sockfd,
+ const char * sock_filename,
+ u8 is_server, u64 feature_mask,
+ u32 * sw_if_index)
+{
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ memset(vui, 0, sizeof(*vui));
+
+ vui->unix_fd = sockfd;
+ vui->num_vrings = 2;
+ vui->sock_is_server = is_server;
+ strncpy(vui->sock_filename, sock_filename, ARRAY_LEN(vui->sock_filename)-1);
+ vui->sock_errno = 0;
+ vui->is_up = 0;
+ vui->feature_mask = feature_mask;
+ vui->active = 1;
+ vui->unix_file_index = ~0;
+
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+
+ if (sw_if_index)
+ *sw_if_index = xd->vlib_sw_if_index;
+}
+
+// register vui and start polling on it
+static void
+dpdk_vhost_user_vui_register(vlib_main_t * vm, dpdk_device_t *xd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+
+ hash_set (dm->vu_sw_if_index_by_listener_fd, vui->unix_fd,
+ xd->vlib_sw_if_index);
+}
+
+static inline void
+dpdk_vhost_user_if_disconnect(dpdk_device_t * xd)
+{
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_main_t * dm = &dpdk_main;
+
+ xd->admin_up = 0;
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, 0);
+
+ if (vui->unix_file_index != ~0) {
+ unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index);
+ vui->unix_file_index = ~0;
+ }
+
+ hash_unset(dm->vu_sw_if_index_by_sock_fd, vui->unix_fd);
+ hash_unset(dm->vu_sw_if_index_by_listener_fd, vui->unix_fd);
+ close(vui->unix_fd);
+ vui->unix_fd = -1;
+ vui->is_up = 0;
+
+ DBG_SOCK("interface ifindex %d disconnected", xd->vlib_sw_if_index);
+}
+
+static clib_error_t * dpdk_vhost_user_callfd_read_ready (unix_file_t * uf)
+{
+ __attribute__((unused)) int n;
+ u8 buff[8];
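+ /* drain the 8-byte eventfd counter; the value itself is not used */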
+ n = read(uf->file_descriptor, ((char*)&buff), 8);
+ return 0;
+}
+
+static clib_error_t * dpdk_vhost_user_socket_read (unix_file_t * uf)
+{
+ int n;
+ int fd, number_of_fds = 0;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_msg_t msg;
+ struct msghdr mh;
+ struct iovec iov[1];
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t *xd;
+ dpdk_vu_intf_t *vui;
+ struct cmsghdr *cmsg;
+ uword * p;
+ u8 q;
+ unix_file_t template = {0};
+ vnet_main_t * vnm = vnet_get_main();
+
+ p = hash_get (dm->vu_sw_if_index_by_sock_fd, uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("FD %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ xd = dpdk_vhost_user_device_from_sw_if_index(p[0]);
+
+ ASSERT(xd != NULL);
+ vui = xd->vu_intf;
+
+ char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))];
+
+ memset(&mh, 0, sizeof(mh));
+ memset(control, 0, sizeof(control));
+
+ /* set the payload */
+ iov[0].iov_base = (void *) &msg;
+ iov[0].iov_len = VHOST_USER_MSG_HDR_SZ;
+
+ mh.msg_iov = iov;
+ mh.msg_iovlen = 1;
+ mh.msg_control = control;
+ mh.msg_controllen = sizeof(control);
+
+ n = recvmsg(uf->file_descriptor, &mh, 0);
+
+ if (n != VHOST_USER_MSG_HDR_SZ)
+ goto close_socket;
+
+ if (mh.msg_flags & MSG_CTRUNC) {
+ goto close_socket;
+ }
+
+ cmsg = CMSG_FIRSTHDR(&mh);
+
+ if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS) &&
+ (cmsg->cmsg_len - CMSG_LEN(0) <= VHOST_MEMORY_MAX_NREGIONS * sizeof(int))) {
+ number_of_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+ memcpy(fds, CMSG_DATA(cmsg), number_of_fds * sizeof(int));
+ }
+
+ /* expect version 1 (flag bits 0-1) with the reply bit (bit 2) clear */
+ if ((msg.flags & 7) != 1) {
+ DBG_SOCK("malformed message received. closing socket");
+ goto close_socket;
+ }
+
+ {
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = read(uf->file_descriptor, ((char*)&msg) + n, msg.size);
+ }
+
+ switch (msg.request) {
+ case VHOST_USER_GET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_GET_FEATURES",
+ xd->vlib_hw_if_index);
+
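+ /* set the reply bit (bit 2); the response is sent at the end of this function */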
+ msg.flags |= 4;
+
+ dpdk_vhost_user_get_features(xd->vlib_hw_if_index, &msg.u64);
+ msg.u64 &= vui->feature_mask;
+ msg.size = sizeof(msg.u64);
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx",
+ xd->vlib_hw_if_index, msg.u64);
+
+ dpdk_vhost_user_set_features(xd->vlib_hw_if_index, msg.u64);
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d",
+ xd->vlib_hw_if_index, msg.memory.nregions);
+
+ if ((msg.memory.nregions < 1) ||
+ (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS)) {
+
+ DBG_SOCK("number of mem regions must be between 1 and %i",
+ VHOST_MEMORY_MAX_NREGIONS);
+
+ goto close_socket;
+ }
+
+ if (msg.memory.nregions != number_of_fds) {
+ DBG_SOCK("each memory region must have FD");
+ goto close_socket;
+ }
+
+ dpdk_vhost_user_set_mem_table(xd->vlib_hw_if_index, &msg.memory, fds);
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d",
+ xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+
+ if ((msg.state.num > 32768) || /* maximum ring size is 32768 */
+ (msg.state.num == 0) || /* it cannot be zero */
+ (msg.state.num & (msg.state.num - 1))) /* must be a power of 2 */
+ goto close_socket;
+
+ dpdk_vhost_user_set_vring_num(xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ADDR idx %d",
+ xd->vlib_hw_if_index, msg.state.index);
+
+ dpdk_vhost_user_set_vring_addr(xd->vlib_hw_if_index, msg.state.index,
+ msg.addr.desc_user_addr,
+ msg.addr.used_user_addr,
+ msg.addr.avail_user_addr);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_SET_OWNER",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_RESET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_RESET_OWNER",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_SET_VRING_CALL:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_CALL u64 %d",
+ xd->vlib_hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
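+ /* bit 8 of u64 set means the message carries no file descriptor */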
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ /* if there is an old callfd, delete its file handler first */
+ if (vui->vrings[q].callfd) {
+ unix_file_t * uf = pool_elt_at_index (unix_main.file_pool,
+ vui->vrings[q].callfd_idx);
+ unix_file_del (&unix_main, uf);
+ }
+ vui->vrings[q].callfd = fds[0];
+ template.read_function = dpdk_vhost_user_callfd_read_ready;
+ template.file_descriptor = fds[0];
+ vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template);
+ }
+ else
+ vui->vrings[q].callfd = -1;
+
+ dpdk_vhost_user_set_vring_call(xd->vlib_hw_if_index, q, vui->vrings[q].callfd);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_KICK u64 %d",
+ xd->vlib_hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ vui->vrings[q].kickfd = fds[0];
+ }
+ else
+ vui->vrings[q].kickfd = -1;
+
+ dpdk_vhost_user_set_vring_kick(xd->vlib_hw_if_index, q, vui->vrings[q].kickfd);
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ERR u64 %d",
+ xd->vlib_hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ fd = fds[0];
+ }
+ else
+ fd = -1;
+
+ vui->vrings[q].errfd = fd;
+ break;
+
+ case VHOST_USER_SET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+
+ dpdk_vhost_user_set_vring_base(xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ xd->vlib_hw_if_index, msg.state.index, msg.state.num);
+
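+ /* set the reply bit (bit 2) so the current state is sent back */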
+ msg.flags |= 4;
+ msg.size = sizeof(msg.state);
+
+ dpdk_vhost_user_get_vring_base(xd->vlib_hw_if_index, msg.state.index, &msg.state.num);
+ break;
+
+ case VHOST_USER_NONE:
+ DBG_SOCK("if %d msg VHOST_USER_NONE",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE",
+ xd->vlib_hw_if_index);
+ break;
+
+ case VHOST_USER_SET_LOG_FD:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD",
+ xd->vlib_hw_if_index);
+ break;
+
+ default:
+ DBG_SOCK("unknown vhost-user message %d received. closing socket",
+ msg.request);
+ goto close_socket;
+ }
+
+ /* if we have pointers to both descriptor tables, bring the interface up */
+ if (!vui->is_up &&
+ xd->vu_vhost_dev.virtqueue[VHOST_NET_VRING_IDX_TX]->desc &&
+ xd->vu_vhost_dev.virtqueue[VHOST_NET_VRING_IDX_RX]->desc) {
+
+ DBG_SOCK("interface %d connected", xd->vlib_sw_if_index);
+
+ vnet_hw_interface_set_flags (vnm, xd->vlib_hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP);
+ vui->is_up = 1;
+ }
+
+ /* if we need to reply */
+ if (msg.flags & 4)
+ {
+ n = send(uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
+ if (n != (msg.size + VHOST_USER_MSG_HDR_SZ))
+ goto close_socket;
+ }
+
+ return 0;
+
+close_socket:
+ DBG_SOCK("error: close_socket");
+ dpdk_vhost_user_if_disconnect(xd);
+ return 0;
+}
+
+static clib_error_t * dpdk_vhost_user_socket_error (unix_file_t * uf)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t *xd;
+ uword * p;
+
+ p = hash_get (dm->vu_sw_if_index_by_sock_fd, uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("FD %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ xd = dpdk_vhost_user_device_from_sw_if_index(p[0]);
+
+ dpdk_vhost_user_if_disconnect(xd);
+ return 0;
+}
+
+static clib_error_t * dpdk_vhost_user_socksvr_accept_ready (unix_file_t * uf)
+{
+ int client_fd;
+ socklen_t client_len;
+ struct sockaddr_un client;
+ unix_file_t template = {0};
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = NULL;
+ dpdk_vu_intf_t * vui;
+ uword * p;
+
+ p = hash_get (dm->vu_sw_if_index_by_listener_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("fd %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+
+ xd = dpdk_vhost_user_device_from_sw_if_index(p[0]);
+ ASSERT(xd != NULL);
+ vui = xd->vu_intf;
+
+ client_len = sizeof(client);
+ client_fd = accept (uf->file_descriptor,
+ (struct sockaddr *)&client,
+ (socklen_t *)&client_len);
+
+ if (client_fd < 0)
+ return clib_error_return_unix (0, "accept");
+
+ template.read_function = dpdk_vhost_user_socket_read;
+ template.error_function = dpdk_vhost_user_socket_error;
+ template.file_descriptor = client_fd;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+
+ vui->client_fd = client_fd;
+ hash_set (dm->vu_sw_if_index_by_sock_fd, vui->client_fd,
+ xd->vlib_sw_if_index);
+
+ return 0;
+}
+
+// init server socket on specified sock_filename
+static int dpdk_vhost_user_init_server_sock(const char * sock_filename, int *sockfd)
+{
+ int rv = 0, len;
+ struct sockaddr_un un;
+ int fd;
+ /* create listening socket */
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+ if (fd < 0) {
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ strncpy((char *) un.sun_path, (char *) sock_filename,
+ sizeof(un.sun_path) - 1);
+
+ /* remove the socket file if it already exists */
+ unlink( (char *) sock_filename);
+
+ len = sizeof(un.sun_family) + strlen((char *) un.sun_path);
+
+ if (bind(fd, (struct sockaddr *) &un, len) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_2;
+ goto error;
+ }
+
+ if (listen(fd, 1) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_3;
+ goto error;
+ }
+
+ unix_file_t template = {0};
+ template.read_function = dpdk_vhost_user_socksvr_accept_ready;
+ template.file_descriptor = fd;
+ unix_file_add (&unix_main, &template);
+ *sockfd = fd;
+ return rv;
+
+error:
+ close(fd);
+ return rv;
+}
+
+/*
+ * vhost-user interface control functions used from vpe api
+ */
+
+int dpdk_vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t *xd;
+ u32 hw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_create_if(vnm, vm, sock_filename, is_server,
+ sw_if_index, feature_mask, renumber, custom_dev_instance);
+ }
+
+ if (is_server) {
+ if ((rv = dpdk_vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ if (renumber) {
+ // set next vhost-user if id if custom one is higher or equal
+ if (custom_dev_instance >= dm->next_vu_if_id)
+ dm->next_vu_if_id = custom_dev_instance + 1;
+
+ dpdk_create_vhost_user_if_internal(&hw_if_idx, custom_dev_instance);
+ } else
+ dpdk_create_vhost_user_if_internal(&hw_if_idx, (u32)~0);
+ DBG_SOCK("dpdk vhost-user interface created hw_if_index %d", hw_if_idx);
+
+ xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_idx);
+ ASSERT(xd != NULL);
+
+ dpdk_vhost_user_vui_init (vnm, xd, sockfd, sock_filename, is_server,
+ feature_mask, sw_if_index);
+
+ dpdk_vhost_user_vui_register (vm, xd);
+ return rv;
+}
+
+int dpdk_vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui = NULL;
+ u32 sw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_modify_if(vnm, vm, sock_filename, is_server,
+ sw_if_index, feature_mask, renumber, custom_dev_instance);
+ }
+
+ xd = dpdk_vhost_user_device_from_sw_if_index(sw_if_index);
+
+ if (xd == NULL)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ vui = xd->vu_intf;
+
+ // interface is inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ dpdk_vhost_user_if_disconnect(xd);
+
+ if (is_server) {
+ if ((rv = dpdk_vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ dpdk_vhost_user_vui_init (vnm, xd, sockfd, sock_filename, is_server,
+ feature_mask, &sw_if_idx);
+
+ if (renumber) {
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+ }
+
+ dpdk_vhost_user_vui_register (vm, xd);
+
+ return rv;
+}
+
+int dpdk_vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd = NULL;
+ dpdk_vu_intf_t * vui;
+ int rv = 0;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_delete_if(vnm, vm, sw_if_index);
+ }
+
+ xd = dpdk_vhost_user_device_from_sw_if_index(sw_if_index);
+
+ if (xd == NULL)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ vui = xd->vu_intf;
+
+ // interface is inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ dpdk_vhost_user_if_disconnect(xd);
+ // add to inactive interface list
+ vec_add1 (dm->vu_inactive_interfaces_device_index, xd->device_index);
+
+ ethernet_delete_interface (vnm, xd->vlib_hw_if_index);
+ DBG_SOCK ("deleted (deactivated) vhost-user interface sw_if_index %d", sw_if_index);
+
+ return rv;
+}
+
+int dpdk_vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm, vhost_user_intf_details_t **out_vuids)
+{
+ int rv = 0;
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui;
+ struct virtio_net * vhost_dev;
+ vhost_user_intf_details_t * r_vuids = NULL;
+ vhost_user_intf_details_t * vuid = NULL;
+ u32 * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ u8 *s = NULL;
+ int i;
+
+ if (!out_vuids)
+ return -1;
+
+ // using virtio vhost user?
+ if (dm->use_virtio_vhost) {
+ return vhost_user_dump_ifs(vnm, vm, out_vuids);
+ }
+
+ vec_foreach (xd, dm->devices) {
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER &&
+ xd->vu_intf->active)
+ vec_add1(hw_if_indices, xd->vlib_hw_if_index);
+ }
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_indices[i]);
+ if (!xd) {
+ clib_warning("invalid vhost-user interface hw_if_index %d", hw_if_indices[i]);
+ continue;
+ }
+
+ vui = xd->vu_intf;
+ ASSERT(vui != NULL);
+ vhost_dev = &xd->vu_vhost_dev;
+ u32 virtio_net_hdr_sz = (vui->num_vrings > 0 ?
+ vhost_dev->virtqueue[0]->vhost_hlen : 0);
+
+ vec_add2(r_vuids, vuid, 1);
+ vuid->sw_if_index = xd->vlib_sw_if_index;
+ vuid->virtio_net_hdr_sz = virtio_net_hdr_sz;
+ vuid->features = vhost_dev->features;
+ vuid->is_server = vui->sock_is_server;
+ vuid->num_regions = (vhost_dev->mem != NULL ? vhost_dev->mem->nregions : 0);
+ vuid->sock_errno = vui->sock_errno;
+ strncpy((char *)vuid->sock_filename, (char *)vui->sock_filename,
+ ARRAY_LEN(vuid->sock_filename)-1);
+
+ s = format (s, "%v%c", hi->name, 0);
+
+ strncpy((char *)vuid->if_name, (char *)s,
+ ARRAY_LEN(vuid->if_name)-1);
+ _vec_len(s) = 0;
+ }
+
+ vec_free (s);
+ vec_free (hw_if_indices);
+
+ *out_vuids = r_vuids;
+
+ return rv;
+}
+
+/*
+ * Processing functions called from dpdk process fn
+ */
+
+typedef struct {
+ struct sockaddr_un sun;
+ int sockfd;
+ unix_file_t template;
+ uword *event_data;
+} dpdk_vu_process_state;
+
+void dpdk_vhost_user_process_init (void **ctx)
+{
+ dpdk_vu_process_state *state = clib_mem_alloc (sizeof(dpdk_vu_process_state));
+ memset(state, 0, sizeof(*state));
+ state->sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ state->sun.sun_family = AF_UNIX;
+ state->template.read_function = dpdk_vhost_user_socket_read;
+ state->template.error_function = dpdk_vhost_user_socket_error;
+ state->event_data = 0;
+ *ctx = state;
+}
+
+void dpdk_vhost_user_process_cleanup (void *ctx)
+{
+ clib_mem_free(ctx);
+}
+
+uword dpdk_vhost_user_process_if (vlib_main_t *vm, dpdk_device_t *xd, void *ctx)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ dpdk_vu_process_state *state = (dpdk_vu_process_state *)ctx;
+ dpdk_vu_intf_t *vui = xd->vu_intf;
+
+ if (vui->sock_is_server || !vui->active)
+ return 0;
+
+ if (vui->unix_fd == -1) {
+ /* try to connect */
+ strncpy(state->sun.sun_path, (char *) vui->sock_filename, sizeof(state->sun.sun_path) - 1);
+
+ if (connect(state->sockfd, (struct sockaddr *) &(state->sun), sizeof(struct sockaddr_un)) == 0) {
+ vui->sock_errno = 0;
+ vui->unix_fd = state->sockfd;
+ state->template.file_descriptor = state->sockfd;
+ vui->unix_file_index = unix_file_add (&unix_main, &(state->template));
+ hash_set (dm->vu_sw_if_index_by_sock_fd, state->sockfd, xd->vlib_sw_if_index);
+
+ state->sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (state->sockfd < 0)
+ return -1;
+ } else {
+ vui->sock_errno = errno;
+ }
+ } else {
+ /* check if socket is alive */
+ int error = 0;
+ socklen_t len = sizeof (error);
+ int retval = getsockopt(vui->unix_fd, SOL_SOCKET, SO_ERROR, &error, &len);
+
+ if (retval)
+ dpdk_vhost_user_if_disconnect(xd);
+ }
+ return 0;
+}
+
+/*
+ * CLI functions
+ */
+
+static clib_error_t *
+dpdk_vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * sock_filename = NULL;
+ u32 sw_if_index;
+ u8 is_server = 0;
+ u64 feature_mask = (u64)~0;
+ u8 renumber = 0;
+ u32 custom_dev_instance = ~0;
+
+ if (dm->use_virtio_vhost) {
+ return vhost_user_connect_command_fn(vm, input, cmd);
+ }
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "socket %s", &sock_filename))
+ ;
+ else if (unformat (line_input, "server"))
+ is_server = 1;
+ else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
+ ;
+ else if (unformat (line_input, "renumber %d", &custom_dev_instance)) {
+ renumber = 1;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main();
+ if (sock_filename == NULL)
+ return clib_error_return (0, "missing socket file");
+
+ dpdk_vhost_user_create_if(vnm, vm, (char *)sock_filename,
+ is_server, &sw_if_index, feature_mask,
+ renumber, custom_dev_instance);
+
+ vec_free(sock_filename);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dpdk_vhost_user_connect_command, static) = {
+ .path = "create vhost-user",
+ .short_help = "create vhost-user socket <socket-filename> [server] [feature-mask <hex>] [renumber <dev_instance>]",
+ .function = dpdk_vhost_user_connect_command_fn,
+};
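+
+/*
+ * Usage example (socket path is illustrative):
+ *   create vhost-user socket /tmp/vhost-sock1 server
+ */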
+
+static clib_error_t *
+dpdk_vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ clib_error_t * error = 0;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 sw_if_index = ~0;
+
+ if (dm->use_virtio_vhost) {
+ return vhost_user_delete_command_fn(vm, input, cmd);
+ }
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (sw_if_index == ~0) {
+ error = clib_error_return (0, "invalid sw_if_index",
+ format_unformat_error, input);
+ return error;
+ }
+
+ vnet_main_t *vnm = vnet_get_main();
+
+ dpdk_vhost_user_delete_if(vnm, vm, sw_if_index);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dpdk_vhost_user_delete_command, static) = {
+ .path = "delete vhost-user",
+ .short_help = "delete vhost-user sw_if_index <nn>",
+ .function = dpdk_vhost_user_delete_command_fn,
+};
+
+#define foreach_dpdk_vhost_feature \
+ _ (VIRTIO_NET_F_MRG_RXBUF) \
+ _ (VIRTIO_NET_F_CTRL_VQ) \
+ _ (VIRTIO_NET_F_CTRL_RX)
+
+static clib_error_t *
+show_dpdk_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ dpdk_main_t * dm = &dpdk_main;
+ vnet_main_t * vnm = vnet_get_main();
+ dpdk_device_t * xd;
+ dpdk_vu_intf_t * vui;
+ struct virtio_net * vhost_dev;
+ u32 hw_if_index, * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ int i, j, q;
+ int show_descr = 0;
+ struct virtio_memory * mem;
+ struct feat_struct { u8 bit; char *str;};
+ struct feat_struct *feat_entry;
+
+ static struct feat_struct feat_array[] = {
+#define _(f) { .str = #f, .bit = f, },
+ foreach_dpdk_vhost_feature
+#undef _
+ { .str = NULL }
+ };
+
+ if (dm->use_virtio_vhost) {
+ return show_vhost_user_command_fn(vm, input, cmd);
+ }
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) {
+ vec_add1 (hw_if_indices, hw_if_index);
+ vlib_cli_output(vm, "add %d", hw_if_index);
+ }
+ else if (unformat (input, "descriptors") || unformat (input, "desc") )
+ show_descr = 1;
+ else {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+ if (vec_len (hw_if_indices) == 0) {
+ vec_foreach (xd, dm->devices) {
+ if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER && xd->vu_intf->active)
+ vec_add1(hw_if_indices, xd->vlib_hw_if_index);
+ }
+ }
+
+ vlib_cli_output (vm, "DPDK vhost-user interfaces");
+ vlib_cli_output (vm, "Global:\n coalesce frames %d time %e\n\n",
+ dm->vhost_coalesce_frames, dm->vhost_coalesce_time);
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+
+ if (!(xd = dpdk_vhost_user_device_from_hw_if_index(hw_if_indices[i]))) {
+ error = clib_error_return (0, "not dpdk vhost-user interface: '%s'",
+ hi->name);
+ goto done;
+ }
+ vui = xd->vu_intf;
+ vhost_dev = &xd->vu_vhost_dev;
+ mem = vhost_dev->mem;
+ u32 virtio_net_hdr_sz = (vui->num_vrings > 0 ?
+ vhost_dev->virtqueue[0]->vhost_hlen : 0);
+
+ vlib_cli_output (vm, "Interface: %s (ifindex %d)",
+ hi->name, hw_if_indices[i]);
+
+ vlib_cli_output (vm, "virtio_net_hdr_sz %d\n features (0x%llx): \n",
+ virtio_net_hdr_sz, xd->vu_vhost_dev.features);
+
+ feat_entry = (struct feat_struct *) &feat_array;
+ while(feat_entry->str) {
+ if (xd->vu_vhost_dev.features & (1 << feat_entry->bit))
+ vlib_cli_output (vm, " %s (%d)", feat_entry->str, feat_entry->bit);
+ feat_entry++;
+ }
+
+ vlib_cli_output (vm, "\n");
+
+ vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n",
+ vui->sock_filename, vui->sock_is_server ? "server" : "client",
+ strerror(vui->sock_errno));
+
+ vlib_cli_output (vm, " Memory regions (total %d)\n", mem->nregions);
+
+ if (mem->nregions){
+ vlib_cli_output(vm, " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n");
+ vlib_cli_output(vm, " ====== ===== ================== ================== ================== ================== ==================\n");
+ }
+ for (j = 0; j < mem->nregions; j++) {
+ vlib_cli_output(vm, " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", j,
+ vui->region_fd[j],
+ mem->regions[j].guest_phys_address,
+ mem->regions[j].memory_size,
+ mem->regions[j].userspace_address,
+ mem->regions[j].address_offset,
+ vui->region_addr[j]);
+ }
+ for (q = 0; q < vui->num_vrings; q++) {
+ struct vhost_virtqueue *vq = vhost_dev->virtqueue[q];
+
+ vlib_cli_output(vm, "\n Virtqueue %d\n", q);
+
+ vlib_cli_output(vm, " qsz %d last_used_idx %d last_used_idx_res %d\n",
+ vq->size, vq->last_used_idx, vq->last_used_idx_res);
+
+ if (vq->avail && vq->used)
+ vlib_cli_output(vm, " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vq->avail->flags, vq->avail->idx, vq->used->flags, vq->used->idx);
+
+ vlib_cli_output(vm, " kickfd %d callfd %d errfd %d\n",
+ vui->vrings[q].kickfd,
+ vui->vrings[q].callfd,
+ vui->vrings[q].errfd);
+
+ if (show_descr) {
+ vlib_cli_output(vm, "\n descriptor table:\n");
+ vlib_cli_output(vm, " id addr len flags next user_addr\n");
+ vlib_cli_output(vm, " ===== ================== ===== ====== ===== ==================\n");
+ for(j = 0; j < vq->size; j++) {
+ vlib_cli_output(vm, " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
+ j,
+ vq->desc[j].addr,
+ vq->desc[j].len,
+ vq->desc[j].flags,
+ vq->desc[j].next,
+ (u64) map_guest_mem(xd, vq->desc[j].addr));}
+ }
+ }
+ vlib_cli_output (vm, "\n");
+ }
+done:
+ vec_free (hw_if_indices);
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_vhost_user_command, static) = {
+ .path = "show vhost-user",
+ .short_help = "show vhost-user interface",
+ .function = show_dpdk_vhost_user_command_fn,
+};
+
diff --git a/vnet/vnet/devices/ssvm/node.c b/vnet/vnet/devices/ssvm/node.c
new file mode 100644
index 00000000000..fe53d1199a2
--- /dev/null
+++ b/vnet/vnet/devices/ssvm/node.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ssvm_eth.h"
+
+vlib_node_registration_t ssvm_eth_input_node;
+
+typedef struct {
+ u32 next_index;
+ u32 sw_if_index;
+} ssvm_eth_input_trace_t;
+
+/* packet trace format function */
+static u8 * format_ssvm_eth_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ssvm_eth_input_trace_t * t = va_arg (*args, ssvm_eth_input_trace_t *);
+
+ s = format (s, "SSVM_ETH_INPUT: sw_if_index %d, next index %d",
+ t->sw_if_index, t->next_index);
+ return s;
+}
+
+#define foreach_ssvm_eth_input_error \
+_(NO_BUFFERS, "Rx packet drops (no buffers)")
+
+typedef enum {
+#define _(sym,str) SSVM_ETH_INPUT_ERROR_##sym,
+ foreach_ssvm_eth_input_error
+#undef _
+ SSVM_ETH_INPUT_N_ERROR,
+} ssvm_eth_input_error_t;
+
+static char * ssvm_eth_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ssvm_eth_input_error
+#undef _
+};
+
+typedef enum {
+ SSVM_ETH_INPUT_NEXT_DROP,
+ SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT,
+ SSVM_ETH_INPUT_NEXT_IP4_INPUT,
+ SSVM_ETH_INPUT_NEXT_IP6_INPUT,
+ SSVM_ETH_INPUT_NEXT_MPLS_INPUT,
+ SSVM_ETH_INPUT_N_NEXT,
+} ssvm_eth_input_next_t;
+
+static inline uword
+ssvm_eth_device_input (ssvm_eth_main_t * em,
+ ssvm_private_t * intfc,
+ vlib_node_runtime_t * node)
+{
+ ssvm_shared_header_t * sh = intfc->sh;
+ vlib_main_t * vm = em->vlib_main;
+ unix_shared_memory_queue_t * q;
+ ssvm_eth_queue_elt_t * elt, * elts;
+ u32 elt_index;
+ u32 my_pid = intfc->my_pid;
+ int rx_queue_index;
+ u32 n_to_alloc = VLIB_FRAME_SIZE * 2;
+ u32 n_allocated, n_present_in_cache;
+#if DPDK > 0
+ u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
+#else
+ u32 next_index = 0;
+#endif
+ vlib_buffer_free_list_t * fl;
+ u32 n_left_to_next, * to_next;
+ u32 next0;
+ u32 n_buffers;
+ u32 n_available;
+ u32 bi0, saved_bi0;
+ vlib_buffer_t * b0, * prev;
+ u32 saved_cache_size = 0;
+ ethernet_header_t * eh0;
+ u16 type0;
+ u32 n_rx_bytes = 0, l3_offset0;
+ u32 cpu_index = os_get_cpu_number();
+ u32 trace_cnt __attribute__((unused)) = vlib_get_trace_count (vm, node);
+ volatile u32 * lock;
+ u32 * elt_indices;
+
+ /* Either side down? buh-bye... */
+ if ((u64)(sh->opaque [MASTER_ADMIN_STATE_INDEX]) == 0 ||
+ (u64)(sh->opaque [SLAVE_ADMIN_STATE_INDEX]) == 0)
+ return 0;
+
+ if (intfc->i_am_master)
+ q = (unix_shared_memory_queue_t *)(sh->opaque [TO_MASTER_Q_INDEX]);
+ else
+ q = (unix_shared_memory_queue_t *)(sh->opaque [TO_SLAVE_Q_INDEX]);
+
+ /* Nothing to do? */
+ if (q->cursize == 0)
+ return 0;
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ vec_reset_length (intfc->rx_queue);
+
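+ /* the first u32 of the shared queue doubles as a spin-lock word */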
+ lock = (u32 *) q;
+ while (__sync_lock_test_and_set (lock, 1))
+ ;
+ while (q->cursize > 0)
+ {
+ unix_shared_memory_queue_sub_raw (q, (u8 *)&elt_index);
+ ASSERT(elt_index < 2048);
+ vec_add1 (intfc->rx_queue, elt_index);
+ }
+ CLIB_MEMORY_BARRIER();
+ *lock = 0;
+
+ n_present_in_cache = vec_len (em->buffer_cache);
+
+ if (vec_len (em->buffer_cache) < vec_len (intfc->rx_queue) * 2)
+ {
+ vec_validate (em->buffer_cache,
+ n_to_alloc + vec_len (em->buffer_cache) - 1);
+ n_allocated =
+ vlib_buffer_alloc (vm, &em->buffer_cache [n_present_in_cache],
+ n_to_alloc);
+
+ n_present_in_cache += n_allocated;
+ _vec_len (em->buffer_cache) = n_present_in_cache;
+ }
+
+ elts = (ssvm_eth_queue_elt_t *) (sh->opaque [CHUNK_POOL_INDEX]);
+
+ n_buffers = vec_len (intfc->rx_queue);
+ rx_queue_index = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_buffers > 0 && n_left_to_next > 0)
+ {
+ elt = elts + intfc->rx_queue[rx_queue_index];
+
+ saved_cache_size = n_present_in_cache;
+ if (PREDICT_FALSE(saved_cache_size == 0))
+ {
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ goto out;
+ }
+ saved_bi0 = bi0 = em->buffer_cache [--n_present_in_cache];
+ b0 = vlib_get_buffer (vm, bi0);
+ prev = 0;
+
+ while (1)
+ {
+ vlib_buffer_init_for_free_list (b0, fl);
+ b0->clone_count = 0;
+
+ b0->current_data = elt->current_data_hint;
+ b0->current_length = elt->length_this_buffer;
+ b0->total_length_not_including_first_buffer =
+ elt->total_length_not_including_first_buffer;
+
+ memcpy (b0->data + b0->current_data, elt->data,
+ b0->current_length);
+
+ if (PREDICT_FALSE(prev != 0))
+ {
+ prev->next_buffer = bi0;
+ prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ }
+
+ if (PREDICT_FALSE(elt->flags & SSVM_BUFFER_NEXT_PRESENT))
+ {
+ prev = b0;
+ if (PREDICT_FALSE(n_present_in_cache == 0))
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ goto out;
+ }
+ bi0 = em->buffer_cache [--n_present_in_cache];
+ b0 = vlib_get_buffer (vm, bi0);
+ /* follow the chunk chain on the next pass */
+ elt = elts + elt->next_index;
+ }
+ else
+ break;
+ }
+
+ saved_cache_size = n_present_in_cache;
+
+ to_next[0] = saved_bi0;
+ to_next++;
+ n_left_to_next--;
+
+ b0 = vlib_get_buffer (vm, saved_bi0);
+ eh0 = vlib_buffer_get_current (b0);
+
+ type0 = clib_net_to_host_u16 (eh0->type);
+
+ next0 = SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT;
+
+ if (type0 == ETHERNET_TYPE_IP4)
+ next0 = SSVM_ETH_INPUT_NEXT_IP4_INPUT;
+ else if (type0 == ETHERNET_TYPE_IP6)
+ next0 = SSVM_ETH_INPUT_NEXT_IP6_INPUT;
+ else if (type0 == ETHERNET_TYPE_MPLS_UNICAST)
+ next0 = SSVM_ETH_INPUT_NEXT_MPLS_INPUT;
+
+ l3_offset0 = ((next0 == SSVM_ETH_INPUT_NEXT_IP4_INPUT ||
+ next0 == SSVM_ETH_INPUT_NEXT_IP6_INPUT ||
+ next0 == SSVM_ETH_INPUT_NEXT_MPLS_INPUT) ?
+ sizeof (ethernet_header_t) : 0);
+
+ n_rx_bytes += b0->current_length
+ + b0->total_length_not_including_first_buffer;
+
+ b0->current_data += l3_offset0;
+ b0->current_length -= l3_offset0;
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = intfc->vlib_hw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See main.c...
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
+
+ /* $$$$ tracing */
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ n_buffers--;
+ rx_queue_index++;
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ out:
+ if (em->buffer_cache)
+ _vec_len (em->buffer_cache) = saved_cache_size;
+ else
+ ASSERT (saved_cache_size == 0);
+
+ ssvm_lock (sh, my_pid, 2);
+
+ ASSERT(vec_len(intfc->rx_queue) > 0);
+
+ n_available = (u32)(u64)(sh->opaque[CHUNK_POOL_NFREE]);
+ elt_indices = (u32 *)(sh->opaque[CHUNK_POOL_FREELIST_INDEX]);
+
+ memcpy (&elt_indices[n_available], intfc->rx_queue,
+ vec_len (intfc->rx_queue) * sizeof (u32));
+
+ n_available += vec_len (intfc->rx_queue);
+ sh->opaque[CHUNK_POOL_NFREE] = (void *) (u64) n_available;
+
+ ssvm_unlock (sh);
+
+ vlib_error_count (vm, node->node_index, SSVM_ETH_INPUT_ERROR_NO_BUFFERS,
+ n_buffers);
+
+ vlib_increment_combined_counter
+ (vnet_get_main()->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX, cpu_index,
+ intfc->vlib_hw_if_index,
+ rx_queue_index, n_rx_bytes);
+
+ return rx_queue_index;
+}
+
+static uword
+ssvm_eth_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ ssvm_private_t * intfc;
+ uword n_rx_packets = 0;
+
+ vec_foreach (intfc, em->intfcs)
+ {
+ n_rx_packets += ssvm_eth_device_input (em, intfc, node);
+ }
+
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (ssvm_eth_input_node) = {
+ .function = ssvm_eth_input_node_fn,
+ .name = "ssvm_eth_input",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ssvm_eth_input_trace,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .n_errors = ARRAY_LEN(ssvm_eth_input_error_strings),
+ .error_strings = ssvm_eth_input_error_strings,
+
+ .n_next_nodes = SSVM_ETH_INPUT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [SSVM_ETH_INPUT_NEXT_DROP] = "error-drop",
+ [SSVM_ETH_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [SSVM_ETH_INPUT_NEXT_IP4_INPUT] = "ip4-input",
+ [SSVM_ETH_INPUT_NEXT_IP6_INPUT] = "ip6-input",
+ [SSVM_ETH_INPUT_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ },
+};
+
diff --git a/vnet/vnet/devices/ssvm/ssvm_eth.c b/vnet/vnet/devices/ssvm/ssvm_eth.c
new file mode 100644
index 00000000000..aad63f02bba
--- /dev/null
+++ b/vnet/vnet/devices/ssvm/ssvm_eth.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ssvm_eth.h"
+
+ssvm_eth_main_t ssvm_eth_main;
+
+#define foreach_ssvm_eth_tx_func_error \
+_(RING_FULL, "Tx packet drops (ring full)") \
+_(NO_BUFFERS, "Tx packet drops (no buffers)") \
+_(ADMIN_DOWN, "Tx packet drops (admin down)")
+
+typedef enum {
+#define _(f,s) SSVM_ETH_TX_ERROR_##f,
+ foreach_ssvm_eth_tx_func_error
+#undef _
+ SSVM_ETH_TX_N_ERROR,
+} ssvm_eth_tx_func_error_t;
+
+static u32 ssvm_eth_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 flags);
+
+int ssvm_eth_create (ssvm_eth_main_t * em, u8 * name, int is_master)
+{
+ ssvm_private_t * intfc;
+ void * oldheap;
+ clib_error_t * e;
+ unix_shared_memory_queue_t * q;
+ ssvm_shared_header_t * sh;
+ ssvm_eth_queue_elt_t * elts;
+ u32 * elt_indices;
+ u8 enet_addr[6];
+ int i, rv;
+
+ vec_add2 (em->intfcs, intfc, 1);
+
+ intfc->ssvm_size = em->segment_size;
+ intfc->i_am_master = 1;
+ intfc->name = name;
+ if (is_master == 0)
+ {
+ rv = ssvm_slave_init (intfc, 20 /* timeout in seconds */);
+ if (rv < 0)
+ return rv;
+ goto create_vnet_interface;
+ }
+
+ intfc->requested_va = em->next_base_va;
+ em->next_base_va += em->segment_size;
+ rv = ssvm_master_init (intfc, intfc - em->intfcs /* master index */);
+
+ if (rv < 0)
+ return rv;
+
+ /* OK, segment created, set up queues and so forth. */
+
+ sh = intfc->sh;
+ oldheap = ssvm_push_heap (sh);
+
+ q = unix_shared_memory_queue_init (em->queue_elts, sizeof (u32),
+ 0 /* consumer pid not interesting */,
+ 0 /* signal not sent */);
+ sh->opaque [TO_MASTER_Q_INDEX] = (void *)q;
+ q = unix_shared_memory_queue_init (em->queue_elts, sizeof (u32),
+ 0 /* consumer pid not interesting */,
+ 0 /* signal not sent */);
+ sh->opaque [TO_SLAVE_Q_INDEX] = (void *)q;
+
+ /*
+ * Preallocate the requested number of buffer chunks
+ * There must be a better way to do this, etc.
+ * Add some slop to avoid pool reallocation, which will not go well
+ */
+ elts = 0;
+ elt_indices = 0;
+
+ vec_validate_aligned (elts, em->nbuffers - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (elt_indices, em->nbuffers - 1, CLIB_CACHE_LINE_BYTES);
+
+ for (i = 0; i < em->nbuffers; i++)
+ elt_indices[i] = i;
+
+ sh->opaque [CHUNK_POOL_INDEX] = (void *) elts;
+ sh->opaque [CHUNK_POOL_FREELIST_INDEX] = (void *) elt_indices;
+ sh->opaque [CHUNK_POOL_NFREE] = (void *) em->nbuffers;
+
+ ssvm_pop_heap (oldheap);
+
+ create_vnet_interface:
+
+ sh = intfc->sh;
+
+ memset (enet_addr, 0, sizeof (enet_addr));
+ enet_addr[0] = 2;
+ enet_addr[1] = 0xFE;
+ enet_addr[2] = is_master;
+ enet_addr[5] = sh->master_index;
+
+ e = ethernet_register_interface
+ (em->vnet_main, ssvm_eth_device_class.index,
+ intfc - em->intfcs,
+ /* ethernet address */ enet_addr,
+ &intfc->vlib_hw_if_index,
+ ssvm_eth_flag_change);
+
+ if (e)
+ {
+ clib_error_report (e);
+ /* $$$$ unmap offending region? */
+ return VNET_API_ERROR_INVALID_INTERFACE;
+ }
+
+ /* Declare link up */
+ vnet_hw_interface_set_flags (em->vnet_main, intfc->vlib_hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ /* Let the games begin... */
+ if (is_master)
+ sh->ready = 1;
+ return 0;
+}
+
+static clib_error_t *
+ssvm_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ u8 * name;
+ int is_master = 1;
+ int i, rv;
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "base-va %llx", &em->next_base_va))
+ ;
+ else if (unformat (input, "segment-size %lld", &em->segment_size))
+ em->segment_size = 1ULL << (max_log2 (em->segment_size));
+ else if (unformat (input, "nbuffers %lld", &em->nbuffers))
+ ;
+ else if (unformat (input, "queue-elts %lld", &em->queue_elts))
+ ;
+ else if (unformat (input, "slave"))
+ is_master = 0;
+ else if (unformat (input, "%s", &name))
+ vec_add1 (em->names, name);
+ else
+ break;
+ }
+
+ /* No configured instances, we're done... */
+ if (vec_len (em->names) == 0)
+ return 0;
+
+ for (i = 0; i < vec_len (em->names); i++)
+ {
+ rv = ssvm_eth_create (em, em->names[i], is_master);
+ if (rv < 0)
+ return clib_error_return (0, "ssvm_eth_create '%s' failed, error %d",
+ em->names[i], rv);
+ }
+
+ vlib_node_set_state (vm, ssvm_eth_input_node.index, VLIB_NODE_STATE_POLLING);
+
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (ssvm_config, "ssvm_eth");
+
+
+static clib_error_t * ssvm_eth_init (vlib_main_t * vm)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+
+ if (((sizeof(ssvm_eth_queue_elt_t) / CLIB_CACHE_LINE_BYTES)
+ * CLIB_CACHE_LINE_BYTES) != sizeof(ssvm_eth_queue_elt_t))
+ clib_warning ("ssvm_eth_queue_elt_t size %d not a multiple of %d",
+ sizeof(ssvm_eth_queue_elt_t), CLIB_CACHE_LINE_BYTES);
+
+ em->vlib_main = vm;
+ em->vnet_main = vnet_get_main();
+ em->elog_main = &vm->elog_main;
+
+ /* default config param values... */
+
+ em->next_base_va = 0x600000000ULL;
+ /*
+ * Allocate 2 full superframes in each dir (256 x 2 x 2 x 2048 bytes),
+ * 2mb; quadruple that so we have plenty of space... 8mb
+ */
+ em->segment_size = 8<<20;
+ em->nbuffers = 1024;
+ em->queue_elts = 512;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ssvm_eth_init);
+
+static char * ssvm_eth_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_ssvm_eth_tx_func_error
+#undef _
+};
+
+static u8 * format_ssvm_eth_device_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+
+ s = format (s, "ssvmEthernet%d", i);
+ return s;
+}
+
+static u8 * format_ssvm_eth_device (u8 * s, va_list * args)
+{
+ s = format (s, "SSVM Ethernet");
+ return s;
+}
+
+static u8 * format_ssvm_eth_tx_trace (u8 * s, va_list * args)
+{
+ s = format (s, "Unimplemented...");
+ return s;
+}
+
+
+static uword
+ssvm_eth_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ ssvm_private_t * intfc = vec_elt_at_index (em->intfcs, rd->dev_instance);
+ ssvm_shared_header_t * sh = intfc->sh;
+ unix_shared_memory_queue_t * q;
+ u32 * from;
+ u32 n_left;
+ ssvm_eth_queue_elt_t * elts, * elt, * prev_elt;
+ u32 my_pid = intfc->my_pid;
+ vlib_buffer_t * b0;
+ u32 bi0;
+ u32 size_this_buffer;
+ u32 chunks_this_buffer;
+ u8 i_am_master = intfc->i_am_master;
+ u32 elt_index, first_elt_index;
+ int is_ring_full, interface_down;
+ int i;
+ volatile u32 *queue_lock;
+ u32 n_to_alloc = VLIB_FRAME_SIZE;
+ u32 n_allocated, n_present_in_cache, n_available;
+ u32 * elt_indices;
+
+ if (i_am_master)
+ q = (unix_shared_memory_queue_t *)sh->opaque [TO_SLAVE_Q_INDEX];
+ else
+ q = (unix_shared_memory_queue_t *)sh->opaque [TO_MASTER_Q_INDEX];
+
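+ /* as on the rx side, the first u32 of the queue is used as a spin-lock word */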
+ queue_lock = (u32 *) q;
+
+ from = vlib_frame_vector_args (f);
+ n_left = f->n_vectors;
+ is_ring_full = 0;
+ interface_down = 0;
+
+ n_present_in_cache = vec_len (em->chunk_cache);
+
+ /* admin / link up/down check */
+ if ((u64)(sh->opaque [MASTER_ADMIN_STATE_INDEX]) == 0 ||
+ (u64)(sh->opaque [SLAVE_ADMIN_STATE_INDEX]) == 0)
+ {
+ interface_down = 1;
+ goto out;
+ }
+
+ ssvm_lock (sh, my_pid, 1);
+
+ elts = (ssvm_eth_queue_elt_t *) (sh->opaque [CHUNK_POOL_INDEX]);
+ elt_indices = (u32 *) (sh->opaque [CHUNK_POOL_FREELIST_INDEX]);
+ n_available = (u32) (u64) (sh->opaque [CHUNK_POOL_NFREE]);
+
+ if (n_present_in_cache < n_left*2)
+ {
+ vec_validate (em->chunk_cache,
+ n_to_alloc + n_present_in_cache - 1);
+
+ n_allocated = n_to_alloc < n_available ? n_to_alloc : n_available;
+
+ if (PREDICT_TRUE(n_allocated > 0))
+ {
+ memcpy (&em->chunk_cache[n_present_in_cache],
+ &elt_indices[n_available - n_allocated],
+ sizeof(u32) * n_allocated);
+ }
+
+ n_present_in_cache += n_allocated;
+ n_available -= n_allocated;
+ sh->opaque [CHUNK_POOL_NFREE] = (void *) (u64) n_available;
+ _vec_len (em->chunk_cache) = n_present_in_cache;
+ }
+
+ ssvm_unlock (sh);
+
+ while (n_left)
+ {
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+
+ size_this_buffer = vlib_buffer_length_in_chain (vm, b0);
+ chunks_this_buffer = (size_this_buffer + (SSVM_BUFFER_SIZE - 1))
+ / SSVM_BUFFER_SIZE;
+
+ /* If we're not going to be able to enqueue the buffer, tail drop. */
+ if (q->cursize >= q->maxsize)
+ {
+ is_ring_full = 1;
+ break;
+ }
+
+ prev_elt = 0;
+ elt_index = first_elt_index = ~0;
+ for (i = 0; i < chunks_this_buffer; i++)
+ {
+ if (PREDICT_FALSE (n_present_in_cache == 0))
+ goto out;
+
+ elt_index = em->chunk_cache[--n_present_in_cache];
+ if (i == 0)
+ first_elt_index = elt_index; /* remember the head chunk of this chain */
+ elt = elts + elt_index;
+
+ elt->type = SSVM_PACKET_TYPE;
+ elt->flags = 0;
+ elt->total_length_not_including_first_buffer =
+ b0->total_length_not_including_first_buffer;
+ elt->length_this_buffer = b0->current_length;
+ elt->current_data_hint = b0->current_data;
+ elt->owner = !i_am_master;
+ elt->tag = 1;
+
+ memcpy (elt->data, b0->data + b0->current_data, b0->current_length);
+
+ if (PREDICT_FALSE (prev_elt != 0))
+ prev_elt->next_index = elt - elts;
+
+ if (PREDICT_FALSE(i < (chunks_this_buffer-1)))
+ {
+ elt->flags = SSVM_BUFFER_NEXT_PRESENT;
+ ASSERT (b0->flags & VLIB_BUFFER_NEXT_PRESENT);
+ b0 = vlib_get_buffer (vm, b0->next_buffer);
+ }
+ prev_elt = elt;
+ }
+
+ while (__sync_lock_test_and_set (queue_lock, 1))
+ ;
+
+ unix_shared_memory_queue_add_raw (q, (u8 *)&elt_index);
+ CLIB_MEMORY_BARRIER();
+ *queue_lock = 0;
+
+ from++;
+ n_left--;
+ }
+
+ out:
+ if (PREDICT_FALSE(n_left))
+ {
+ if (is_ring_full)
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_RING_FULL,
+ n_left);
+ else if (interface_down)
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_ADMIN_DOWN,
+ n_left);
+ else
+ vlib_error_count (vm, node->node_index, SSVM_ETH_TX_ERROR_NO_BUFFERS,
+ n_left);
+
+ vlib_buffer_free (vm, from, n_left);
+ }
+ else
+ vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors);
+
+ if (PREDICT_TRUE(vec_len(em->chunk_cache)))
+ _vec_len(em->chunk_cache) = n_present_in_cache;
+
+ return f->n_vectors;
+}
+
+static void ssvm_eth_clear_hw_interface_counters (u32 instance)
+{
+ /* Nothing for now */
+}
+
+static clib_error_t *
+ssvm_eth_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ ssvm_private_t * intfc = vec_elt_at_index (em->intfcs, hif->dev_instance);
+ ssvm_shared_header_t * sh;
+
+ /* publish link-state in shared-memory, to discourage buffer-wasting */
+ sh = intfc->sh;
+ if (intfc->i_am_master)
+ sh->opaque [MASTER_ADMIN_STATE_INDEX] = (void *) is_up;
+ else
+ sh->opaque [SLAVE_ADMIN_STATE_INDEX] = (void *) is_up;
+
+ return 0;
+}
+
+static clib_error_t *
+ssvm_eth_subif_add_del_function (vnet_main_t * vnm,
+ u32 hw_if_index,
+ struct vnet_sw_interface_t * st,
+ int is_add)
+{
+ /* Nothing for now */
+ return 0;
+}
+
+/*
+ * Dynamically redirect all pkts from a specific interface
+ * to the specified node
+ */
+static void
+ssvm_eth_set_interface_next_node (vnet_main_t *vnm, u32 hw_if_index,
+ u32 node_index)
+{
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+ ssvm_private_t * intfc = pool_elt_at_index (em->intfcs, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ intfc->per_interface_next_index = node_index;
+ return;
+ }
+
+ intfc->per_interface_next_index =
+ vlib_node_add_next (em->vlib_main, ssvm_eth_input_node.index, node_index);
+}
+
+static u32 ssvm_eth_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 flags)
+{
+ /* nothing for now */
+ return 0;
+}
+
+VNET_DEVICE_CLASS (ssvm_eth_device_class) = {
+ .name = "ssvm-eth",
+ .tx_function = ssvm_eth_interface_tx,
+ .tx_function_n_errors = SSVM_ETH_TX_N_ERROR,
+ .tx_function_error_strings = ssvm_eth_tx_func_error_strings,
+ .format_device_name = format_ssvm_eth_device_name,
+ .format_device = format_ssvm_eth_device,
+ .format_tx_trace = format_ssvm_eth_tx_trace,
+ .clear_counters = ssvm_eth_clear_hw_interface_counters,
+ .admin_up_down_function = ssvm_eth_interface_admin_up_down,
+ .subif_add_del_function = ssvm_eth_subif_add_del_function,
+ .rx_redirect_to_node = ssvm_eth_set_interface_next_node,
+ .no_flatten_output_chains = 1,
+};
diff --git a/vnet/vnet/devices/ssvm/ssvm_eth.h b/vnet/vnet/devices/ssvm/ssvm_eth.h
new file mode 100644
index 00000000000..1b077220305
--- /dev/null
+++ b/vnet/vnet/devices/ssvm/ssvm_eth.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ssvm_eth_h__
+#define __included_ssvm_eth_h__
+
+#include <vnet/vnet.h>
+
+#include <vppinfra/elog.h>
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/vec.h>
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+#include <vlibmemory/unix_shared_memory_queue.h>
+
+#include <ssvm.h>
+
+vnet_device_class_t ssvm_eth_device_class;
+vlib_node_registration_t ssvm_eth_input_node;
+
+#define SSVM_BUFFER_SIZE \
+ (VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES + VLIB_BUFFER_PRE_DATA_SIZE)
+#define SSVM_PACKET_TYPE 1
+
+typedef struct {
+ /* Type of queue element */
+ u8 type;
+ u8 flags;
+#define SSVM_BUFFER_NEXT_PRESENT (1<<0)
+ u8 owner;
+ u8 tag;
+ i16 current_data_hint;
+ u16 length_this_buffer;
+ u16 total_length_not_including_first_buffer;
+ u16 pad;
+ u32 next_index;
+ /* offset 16 */
+ u8 data [SSVM_BUFFER_SIZE];
+ /* pad to an even multiple of 64 octets */
+ u8 pad2[CLIB_CACHE_LINE_BYTES - 16];
+} ssvm_eth_queue_elt_t;
+
+typedef struct {
+ /* vector of point-to-point connections */
+ ssvm_private_t * intfcs;
+
+ u32 * buffer_cache;
+ u32 * chunk_cache;
+
+ /* Configurable parameters */
+ /* base address for next placement */
+ u64 next_base_va;
+ u64 segment_size;
+ u64 nbuffers;
+ u64 queue_elts;
+
+ /* Segment names */
+ u8 ** names;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+ elog_main_t * elog_main;
+} ssvm_eth_main_t;
+
+ssvm_eth_main_t ssvm_eth_main;
+
+typedef enum {
+ CHUNK_POOL_FREELIST_INDEX = 0,
+ CHUNK_POOL_INDEX,
+ CHUNK_POOL_NFREE,
+ TO_MASTER_Q_INDEX,
+ TO_SLAVE_Q_INDEX,
+ MASTER_ADMIN_STATE_INDEX,
+ SLAVE_ADMIN_STATE_INDEX,
+} ssvm_eth_opaque_index_t;
+
+/*
+ * debug scaffolding.
+ */
+static inline void ssvm_eth_validate_freelists (int need_lock)
+{
+#if CLIB_DEBUG > 0
+ ssvm_eth_main_t * em = &ssvm_eth_main;
+ ssvm_private_t * intfc;
+ ssvm_shared_header_t * sh;
+ u32 * elt_indices;
+ u32 n_available;
+ int i, j;
+
+ for (i = 0; i < vec_len (em->intfcs); i++)
+ {
+ intfc = em->intfcs + i;
+ sh = intfc->sh;
+ u32 my_pid = intfc->my_pid;
+
+ if (need_lock)
+ ssvm_lock (sh, my_pid, 15);
+
+ elt_indices = (u32 *) (sh->opaque [CHUNK_POOL_FREELIST_INDEX]);
+ n_available = (u32) (u64) (sh->opaque [CHUNK_POOL_NFREE]);
+
+ for (j = 0; j < n_available; j++)
+ ASSERT (elt_indices[j] < 2048);
+
+ if (need_lock)
+ ssvm_unlock (sh);
+ }
+#endif
+}
+
+#endif /* __included_ssvm_eth_h__ */
diff --git a/vnet/vnet/devices/virtio/vhost-user.c b/vnet/vnet/devices/virtio/vhost-user.c
new file mode 100644
index 00000000000..4df025c21b6
--- /dev/null
+++ b/vnet/vnet/devices/virtio/vhost-user.c
@@ -0,0 +1,1957 @@
+/*
+ *------------------------------------------------------------------
+ * vhost.c - vhost-user
+ *
+ * Copyright (c) 2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <fcntl.h> /* for open */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h> /* for iovec */
+#include <netinet/in.h>
+#include <sys/vfs.h>
+
+#include <linux/if_arp.h>
+#include <linux/if_tun.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/devices/virtio/vhost-user.h>
+
+#define VHOST_USER_DEBUG_SOCKET 0
+#define VHOST_USER_DEBUG_VQ 0
+
+/* Set to get virtio_net_hdr in buffer pre-data
+ details will be shown in packet trace */
+#define VHOST_USER_COPY_TX_HDR 0
+
+#if VHOST_USER_DEBUG_SOCKET == 1
+#define DBG_SOCK(args...) clib_warning(args);
+#else
+#define DBG_SOCK(args...)
+#endif
+
+#if VHOST_USER_DEBUG_VQ == 1
+#define DBG_VQ(args...) clib_warning(args);
+#else
+#define DBG_VQ(args...)
+#endif
+
+vlib_node_registration_t vhost_user_input_node;
+
+#define foreach_vhost_user_tx_func_error \
+ _(PKT_DROP_NOBUF, "tx packet drops (no available descriptors)") \
+ _(MMAP_FAIL, "mmap failure")
+
+typedef enum {
+#define _(f,s) VHOST_USER_TX_FUNC_ERROR_##f,
+ foreach_vhost_user_tx_func_error
+#undef _
+ VHOST_USER_TX_FUNC_N_ERROR,
+} vhost_user_tx_func_error_t;
+
+static char * vhost_user_tx_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_vhost_user_tx_func_error
+#undef _
+};
+
+#define foreach_vhost_user_input_func_error \
+ _(NO_ERROR, "no error") \
+ _(UNDERSIZED_FRAME, "undersized ethernet frame received (< 14 bytes)")
+
+typedef enum {
+#define _(f,s) VHOST_USER_INPUT_FUNC_ERROR_##f,
+ foreach_vhost_user_input_func_error
+#undef _
+ VHOST_USER_INPUT_FUNC_N_ERROR,
+} vhost_user_input_func_error_t;
+
+static char * vhost_user_input_func_error_strings[] = {
+#define _(n,s) s,
+ foreach_vhost_user_input_func_error
+#undef _
+};
+
+static vhost_user_main_t vhost_user_main = {
+ .mtu_bytes = 1518,
+};
+
+VNET_HW_INTERFACE_CLASS (vhost_interface_class, static) = {
+ .name = "vhost-user",
+};
+
+static u8 * format_vhost_user_interface_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ if (i < vec_len (vum->show_dev_instance_by_real_dev_instance))
+ show_dev_instance = vum->show_dev_instance_by_real_dev_instance[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ s = format (s, "VirtualEthernet0/0/%d", i);
+ return s;
+}
+
+static int vhost_user_name_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ vec_validate_init_empty (vum->show_dev_instance_by_real_dev_instance,
+ hi->dev_instance, ~0);
+
+ vum->show_dev_instance_by_real_dev_instance [hi->dev_instance] =
+ new_dev_instance;
+
+ DBG_SOCK("renumbered vhost-user interface dev_instance %d to %d",
+ hi->dev_instance, new_dev_instance);
+
+ return 0;
+}
+
+
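+/*
+ * Address translation helpers. The vhost-user master shares guest memory
+ * as a set of regions, each described by a guest-physical base, a
+ * master-process virtual base, a size and an mmap()-able fd. Descriptor
+ * entries carry guest-physical addresses, while the vring base addresses
+ * arrive as master-process virtual addresses, hence the two lookup
+ * flavors below. A hit resolves into our own mapping of the region:
+ *
+ *   local = region_mmap_addr[i] + (addr - regions[i].guest_phys_addr);
+ */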
+static inline void * map_guest_mem(vhost_user_intf_t * vui, u64 addr)
+{
+ int i;
+ for (i=0; i<vui->nregions; i++) {
+ if ((vui->regions[i].guest_phys_addr <= addr) &&
+ ((vui->regions[i].guest_phys_addr + vui->regions[i].memory_size) > addr)) {
+ return (void *) (vui->region_mmap_addr[i] + addr - vui->regions[i].guest_phys_addr);
+ }
+ }
+ DBG_VQ("failed to map guest mem addr %llx", addr);
+ return 0;
+}
+
+static inline void * map_user_mem(vhost_user_intf_t * vui, u64 addr)
+{
+ int i;
+ for (i=0; i<vui->nregions; i++) {
+ if ((vui->regions[i].userspace_addr <= addr) &&
+ ((vui->regions[i].userspace_addr + vui->regions[i].memory_size) > addr)) {
+ return (void *) (vui->region_mmap_addr[i] + addr - vui->regions[i].userspace_addr);
+ }
+ }
+ return 0;
+}
+
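+/*
+ * Regions are expected to be backed by hugetlbfs; fstatfs() on the region
+ * fd reports that filesystem's block size, i.e. the huge page size, which
+ * the map/unmap code uses to round mapping sizes up to a page multiple.
+ */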
+static long get_huge_page_size(int fd)
+{
+ struct statfs s;
+ fstatfs(fd, &s);
+ return s.f_bsize;
+}
+
+static void unmap_all_mem_regions(vhost_user_intf_t * vui)
+{
+ int i,r;
+ for (i=0; i<vui->nregions; i++) {
+ if (vui->region_mmap_addr[i] != (void *) -1) {
+
+ long page_sz = get_huge_page_size(vui->region_mmap_fd[i]);
+
+ ssize_t map_sz = (vui->regions[i].memory_size +
+ vui->regions[i].mmap_offset + page_sz) & ~(page_sz - 1);
+
+ r = munmap(vui->region_mmap_addr[i] - vui->regions[i].mmap_offset, map_sz);
+
+ DBG_SOCK("unmap memory region %d addr 0x%lx len 0x%lx page_sz 0x%x", i,
+ vui->region_mmap_addr[i], map_sz, page_sz);
+
+ vui->region_mmap_addr[i]= (void *) -1;
+
+ if (r == -1) {
+ clib_warning("failed to unmap memory region (errno %d)", errno);
+ }
+ close(vui->region_mmap_fd[i]);
+ }
+ }
+ vui->nregions = 0;
+}
+
+
+static clib_error_t * vhost_user_callfd_read_ready (unix_file_t * uf)
+{
+ __attribute__((unused)) int n;
+ u8 buff[8];
+ n = read(uf->file_descriptor, ((char*)&buff), 8);
+ return 0;
+}
+
+static inline void vhost_user_if_disconnect(vhost_user_intf_t * vui)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vnet_main_t * vnm = vnet_get_main();
+ int q;
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+
+ if (vui->unix_file_index != ~0) {
+ unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index);
+ vui->unix_file_index = ~0;
+ }
+
+ hash_unset(vum->vhost_user_interface_index_by_sock_fd, vui->unix_fd);
+ hash_unset(vum->vhost_user_interface_index_by_listener_fd, vui->unix_fd);
+ close(vui->unix_fd);
+ vui->unix_fd = -1;
+ vui->is_up = 0;
+ for (q = 0; q < vui->num_vrings; q++) {
+ vui->vrings[q].desc = NULL;
+ vui->vrings[q].avail = NULL;
+ vui->vrings[q].used = NULL;
+ }
+
+ unmap_all_mem_regions(vui);
+ DBG_SOCK("interface ifindex %d disconnected", vui->sw_if_index);
+}
+
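+/*
+ * Slave side of the vhost-user protocol. Every message starts with a
+ * 12-byte header (request u32, flags u32, size u32), optionally followed
+ * by 'size' bytes of payload; the low 2 bits of 'flags' carry the
+ * protocol version (always 1) and bit 2 marks a reply. File descriptors
+ * (memory region fds, kick/call/err eventfds) travel out-of-band as
+ * SCM_RIGHTS ancillary data, which is why the header is read with
+ * recvmsg() rather than a plain read().
+ */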
+static clib_error_t * vhost_user_socket_read (unix_file_t * uf)
+{
+ int n, i;
+ int fd, number_of_fds = 0;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_msg_t msg;
+ struct msghdr mh;
+ struct iovec iov[1];
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ struct cmsghdr *cmsg;
+ uword * p;
+ u8 q;
+ unix_file_t template = {0};
+ vnet_main_t * vnm = vnet_get_main();
+
+ p = hash_get (vum->vhost_user_interface_index_by_sock_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("FD %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+
+ char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))];
+
+ memset(&mh, 0, sizeof(mh));
+ memset(control, 0, sizeof(control));
+
+ /* set the payload */
+ iov[0].iov_base = (void *) &msg;
+ iov[0].iov_len = VHOST_USER_MSG_HDR_SZ;
+
+ mh.msg_iov = iov;
+ mh.msg_iovlen = 1;
+ mh.msg_control = control;
+ mh.msg_controllen = sizeof(control);
+
+ n = recvmsg(uf->file_descriptor, &mh, 0);
+
+ if (n != VHOST_USER_MSG_HDR_SZ)
+ goto close_socket;
+
+ if (mh.msg_flags & MSG_CTRUNC) {
+ goto close_socket;
+ }
+
+ cmsg = CMSG_FIRSTHDR(&mh);
+
+ if (cmsg && (cmsg->cmsg_len > 0) && (cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS) &&
+ (cmsg->cmsg_len - CMSG_LEN(0) <= VHOST_MEMORY_MAX_NREGIONS * sizeof(int))) {
+ number_of_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+ memcpy(fds, CMSG_DATA(cmsg), number_of_fds * sizeof(int));
+ }
+
+  /* the message must be protocol version 1 with the reply bit clear */
+ if ((msg.flags & 7) != 1) {
+ DBG_SOCK("malformed message received. closing socket");
+ goto close_socket;
+ }
+
+ {
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = read(uf->file_descriptor, ((char*)&msg) + n, msg.size);
+ }
+
+ switch (msg.request) {
+ case VHOST_USER_GET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_GET_FEATURES",
+ vui->hw_if_index);
+
+    msg.flags |= 4; /* set the reply bit */
+ msg.u64 = (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF) |
+ (1 << FEAT_VIRTIO_F_ANY_LAYOUT);
+ msg.u64 &= vui->feature_mask;
+
+ msg.size = sizeof(msg.u64);
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ DBG_SOCK("if %d msg VHOST_USER_SET_FEATURES features 0x%016llx",
+ vui->hw_if_index, msg.u64);
+
+ vui->features = msg.u64;
+ if (vui->features & (1 << FEAT_VIRTIO_NET_F_MRG_RXBUF))
+ vui->virtio_net_hdr_sz = 12;
+ else
+ vui->virtio_net_hdr_sz = 10;
+
+ vui->is_any_layout = (vui->features & (1 << FEAT_VIRTIO_F_ANY_LAYOUT)) ? 1 : 0;
+
+ ASSERT (vui->virtio_net_hdr_sz < VLIB_BUFFER_PRE_DATA_SIZE);
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+ vui->is_up = 0;
+
+ for (q = 0; q < 2; q++) {
+ vui->vrings[q].desc = 0;
+ vui->vrings[q].avail = 0;
+ vui->vrings[q].used = 0;
+ }
+
+ DBG_SOCK("interface %d disconnected", vui->sw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_MEM_TABLE nregions %d",
+ vui->hw_if_index, msg.memory.nregions);
+
+ if ((msg.memory.nregions < 1) ||
+ (msg.memory.nregions > VHOST_MEMORY_MAX_NREGIONS)) {
+
+ DBG_SOCK("number of mem regions must be between 1 and %i",
+ VHOST_MEMORY_MAX_NREGIONS);
+
+ goto close_socket;
+ }
+
+ if (msg.memory.nregions != number_of_fds) {
+      DBG_SOCK("each memory region must come with its own FD");
+ goto close_socket;
+ }
+ unmap_all_mem_regions(vui);
+ for(i=0; i < msg.memory.nregions; i++) {
+ memcpy(&(vui->regions[i]), &msg.memory.regions[i],
+ sizeof(vhost_user_memory_region_t));
+
+ long page_sz = get_huge_page_size(fds[i]);
+
+      /* round the size up to a multiple of this region's huge page size */
+ ssize_t map_sz = (vui->regions[i].memory_size +
+ vui->regions[i].mmap_offset + page_sz) & ~(page_sz - 1);
+
+ vui->region_mmap_addr[i] = mmap(0, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fds[i], 0);
+
+ DBG_SOCK("map memory region %d addr 0 len 0x%lx fd %d mapped 0x%lx "
+ "page_sz 0x%x", i, map_sz, fds[i], vui->region_mmap_addr[i], page_sz);
+
+ if (vui->region_mmap_addr[i] == MAP_FAILED) {
+ clib_warning("failed to map memory. errno is %d", errno);
+ goto close_socket;
+ }
+ vui->region_mmap_addr[i] += vui->regions[i].mmap_offset;
+ vui->region_mmap_fd[i] = fds[i];
+ }
+ vui->nregions = msg.memory.nregions;
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_NUM idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ if ((msg.state.num > 32768) || /* maximum ring size is 32768 */
+ (msg.state.num == 0) || /* it cannot be zero */
+          (msg.state.num & (msg.state.num - 1))) /* must be a power of 2 */
+ goto close_socket;
+ vui->vrings[msg.state.index].qsz = msg.state.num;
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ADDR idx %d",
+ vui->hw_if_index, msg.state.index);
+
+ vui->vrings[msg.state.index].desc = (vring_desc_t *)
+ map_user_mem(vui, msg.addr.desc_user_addr);
+ vui->vrings[msg.state.index].used = (vring_used_t *)
+ map_user_mem(vui, msg.addr.used_user_addr);
+ vui->vrings[msg.state.index].avail = (vring_avail_t *)
+ map_user_mem(vui, msg.addr.avail_user_addr);
+
+ if ((vui->vrings[msg.state.index].desc == NULL) ||
+ (vui->vrings[msg.state.index].used == NULL) ||
+ (vui->vrings[msg.state.index].avail == NULL)) {
+ DBG_SOCK("failed to map user memory for hw_if_index %d",
+ vui->hw_if_index);
+ goto close_socket;
+ }
+
+ vui->vrings[msg.state.index].last_used_idx =
+ vui->vrings[msg.state.index].used->idx;
+
+    /* set VRING_USED_F_NO_NOTIFY: tell the driver not to kick us, we poll */
+ vui->vrings[msg.state.index].used->flags |= 1;
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_SET_OWNER",
+ vui->hw_if_index);
+ break;
+
+ case VHOST_USER_RESET_OWNER:
+ DBG_SOCK("if %d msg VHOST_USER_RESET_OWNER",
+ vui->hw_if_index);
+ break;
+
+ case VHOST_USER_SET_VRING_CALL:
+    DBG_SOCK("if %d msg VHOST_USER_SET_VRING_CALL u64 0x%llx",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ /* if there is old fd, delete it */
+ if (vui->vrings[q].callfd) {
+ unix_file_t * uf = pool_elt_at_index (unix_main.file_pool,
+ vui->vrings[q].callfd_idx);
+ unix_file_del (&unix_main, uf);
+ }
+ vui->vrings[q].callfd = fds[0];
+ template.read_function = vhost_user_callfd_read_ready;
+ template.file_descriptor = fds[0];
+ vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template);
+ }
+ else
+ vui->vrings[q].callfd = -1;
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+    DBG_SOCK("if %d msg VHOST_USER_SET_VRING_KICK u64 0x%llx",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ vui->vrings[q].kickfd = fds[0];
+ }
+ else
+ vui->vrings[q].kickfd = -1;
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+    DBG_SOCK("if %d msg VHOST_USER_SET_VRING_ERR u64 0x%llx",
+ vui->hw_if_index, msg.u64);
+
+ q = (u8) (msg.u64 & 0xFF);
+
+ if (!(msg.u64 & 0x100))
+ {
+ if (number_of_fds != 1)
+ goto close_socket;
+
+ fd = fds[0];
+ }
+ else
+ fd = -1;
+
+ vui->vrings[q].errfd = fd;
+ break;
+
+ case VHOST_USER_SET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_VRING_BASE idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ vui->vrings[msg.state.index].last_avail_idx = msg.state.num;
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_GET_VRING_BASE idx %d num %d",
+ vui->hw_if_index, msg.state.index, msg.state.num);
+
+ msg.state.num = vui->vrings[msg.state.index].last_used_idx;
+ msg.flags |= 4;
+ msg.size = sizeof(msg.state);
+ break;
+
+ case VHOST_USER_NONE:
+ DBG_SOCK("if %d msg VHOST_USER_NONE",
+ vui->hw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_BASE",
+ vui->hw_if_index);
+
+ break;
+
+ case VHOST_USER_SET_LOG_FD:
+ DBG_SOCK("if %d msg VHOST_USER_SET_LOG_FD",
+ vui->hw_if_index);
+
+ break;
+
+ default:
+ DBG_SOCK("unknown vhost-user message %d received. closing socket",
+ msg.request);
+ goto close_socket;
+ }
+
+  /* once we have descriptor table pointers for both rings, bring the link up */
+ if (!vui->is_up &&
+ vui->vrings[VHOST_NET_VRING_IDX_TX].desc &&
+ vui->vrings[VHOST_NET_VRING_IDX_RX].desc) {
+
+ DBG_SOCK("interface %d connected", vui->sw_if_index);
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP);
+ vui->is_up = 1;
+
+ }
+
+ /* if we need to reply */
+ if (msg.flags & 4)
+ {
+ n = send(uf->file_descriptor, &msg, VHOST_USER_MSG_HDR_SZ + msg.size, 0);
+ if (n != (msg.size + VHOST_USER_MSG_HDR_SZ))
+ goto close_socket;
+ }
+
+ return 0;
+
+close_socket:
+ vhost_user_if_disconnect(vui);
+ return 0;
+}
+
+static clib_error_t * vhost_user_socket_error (unix_file_t * uf)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ uword * p;
+
+ p = hash_get (vum->vhost_user_interface_index_by_sock_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("fd %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+
+ vhost_user_if_disconnect(vui);
+ return 0;
+}
+
+static clib_error_t * vhost_user_socksvr_accept_ready (unix_file_t * uf)
+{
+ int client_fd, client_len;
+ struct sockaddr_un client;
+ unix_file_t template = {0};
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ uword * p;
+
+ p = hash_get (vum->vhost_user_interface_index_by_listener_fd,
+ uf->file_descriptor);
+ if (p == 0) {
+ DBG_SOCK ("fd %d doesn't belong to any interface",
+ uf->file_descriptor);
+ return 0;
+ }
+ else
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+
+ client_len = sizeof(client);
+ client_fd = accept (uf->file_descriptor,
+ (struct sockaddr *)&client,
+ (socklen_t *)&client_len);
+
+ if (client_fd < 0)
+ return clib_error_return_unix (0, "accept");
+
+ template.read_function = vhost_user_socket_read;
+ template.error_function = vhost_user_socket_error;
+ template.file_descriptor = client_fd;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+
+ vui->client_fd = client_fd;
+ hash_set (vum->vhost_user_interface_index_by_sock_fd, vui->client_fd,
+ vui - vum->vhost_user_interfaces);
+
+ return 0;
+}
+
+static clib_error_t *
+vhost_user_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+ vhost_user_main_t * vum = &vhost_user_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ error = vlib_call_init_function (vm, ip4_init);
+ if (error)
+ return error;
+
+ vum->vhost_user_interface_index_by_listener_fd = hash_create (0, sizeof (uword));
+ vum->vhost_user_interface_index_by_sock_fd = hash_create (0, sizeof (uword));
+ vum->vhost_user_interface_index_by_sw_if_index = hash_create (0, sizeof (uword));
+ vum->coalesce_frames = 32;
+ vum->coalesce_time = 1e-3;
+
+ vec_validate_aligned (vum->rx_buffers, tm->n_vlib_mains - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (vhost_user_init);
+
+static clib_error_t *
+vhost_user_exit (vlib_main_t * vm)
+{
+ /* TODO cleanup */
+ return 0;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (vhost_user_exit);
+
+enum {
+ VHOST_USER_RX_NEXT_ETHERNET_INPUT,
+ VHOST_USER_RX_NEXT_DROP,
+ VHOST_USER_RX_N_NEXT,
+};
+
+
+typedef struct {
+ u16 virtqueue;
+ u16 device_index;
+#if VHOST_USER_COPY_TX_HDR == 1
+ virtio_net_hdr_t hdr;
+#endif
+} vhost_user_input_trace_t;
+
+static u8 * format_vhost_user_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ CLIB_UNUSED (vnet_main_t * vnm) = vnet_get_main();
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_input_trace_t * t = va_arg (*va, vhost_user_input_trace_t *);
+ vhost_user_intf_t * vui = vec_elt_at_index (vum->vhost_user_interfaces,
+ t->device_index);
+
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, vui->sw_if_index);
+
+#if VHOST_USER_COPY_TX_HDR == 1
+ uword indent = format_get_indent (s);
+#endif
+
+ s = format (s, "%U virtqueue %d",
+ format_vnet_sw_interface_name, vnm, sw,
+ t->virtqueue);
+
+#if VHOST_USER_COPY_TX_HDR == 1
+ s = format (s, "\n%Uvirtio_net_hdr flags 0x%02x gso_type %u hdr_len %u",
+ format_white_space, indent,
+ t->hdr.flags,
+ t->hdr.gso_type,
+ t->hdr.hdr_len);
+#endif
+
+ return s;
+}
+
+void vhost_user_rx_trace (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vhost_user_intf_t *vui,
+ i16 virtqueue)
+{
+ u32 * b, n_left;
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ u32 next_index = VHOST_USER_RX_NEXT_ETHERNET_INPUT;
+
+ n_left = vec_len(vui->d_trace_buffers);
+ b = vui->d_trace_buffers;
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ vhost_user_input_trace_t * t0;
+
+ bi0 = b[0];
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ vlib_trace_buffer (vm, node, next_index, b0, /* follow_chain */ 0);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->virtqueue = virtqueue;
+ t0->device_index = vui - vum->vhost_user_interfaces;
+#if VHOST_USER_COPY_TX_HDR == 1
+ rte_memcpy(&t0->hdr, b0->pre_data, sizeof(virtio_net_hdr_t));
+#endif
+
+ b+=1;
+ }
+}
+
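+/*
+ * Signal the guest by writing an 8-byte count to the call eventfd that
+ * arrived with VHOST_USER_SET_VRING_CALL. Interrupts are coalesced:
+ * callers accumulate n_since_last_int and fire only once the frame or
+ * time thresholds (coalesce_frames / coalesce_time) are crossed.
+ */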
+static inline void vhost_user_send_call(vlib_main_t * vm, vhost_user_vring_t * vq)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ u64 x = 1;
+ int rv __attribute__((unused));
+ /* $$$$ pay attention to rv */
+ rv = write(vq->callfd, &x, sizeof(x));
+ vq->n_since_last_int = 0;
+ vq->int_deadline = vlib_time_now(vm) + vum->coalesce_time;
+}
+
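+/*
+ * Per-interface receive path. Naming caveat: vring indices are named from
+ * the guest's point of view, so this device-side RX routine drains the
+ * guest's TX ring (VHOST_NET_VRING_IDX_TX). Each avail entry heads a
+ * descriptor chain; the chain payload (minus the leading virtio_net
+ * header) is copied into a freshly allocated vlib buffer, the chain head
+ * is returned on the used ring, and the buffer is handed to
+ * ethernet-input.
+ */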
+static u32 vhost_user_if_input ( vlib_main_t * vm,
+ vhost_user_main_t * vum,
+ vhost_user_intf_t * vui,
+ vlib_node_runtime_t * node)
+{
+ vhost_user_vring_t * txvq = &vui->vrings[VHOST_NET_VRING_IDX_TX];
+ vhost_user_vring_t * rxvq = &vui->vrings[VHOST_NET_VRING_IDX_RX];
+ uword n_rx_packets = 0;
+ uword n_left;
+ u32 bi;
+ u32 n_left_to_next, * to_next;
+ u32 next_index = VHOST_USER_RX_NEXT_ETHERNET_INPUT;
+ uword n_rx_bytes = 0;
+ uword n_trace = vlib_get_trace_count (vm, node);
+ u16 qsz_mask;
+ f64 now = vlib_time_now (vm);
+ u32 cpu_index;
+
+ vec_reset_length (vui->d_trace_buffers);
+ u32 free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+
+ /* no descriptor ptr - bail out */
+ if (PREDICT_FALSE(!txvq->desc))
+ return 0;
+
+  /* do we have pending interrupts? */
+ if ((txvq->n_since_last_int) && (txvq->int_deadline < now))
+ vhost_user_send_call(vm, txvq);
+
+ if ((rxvq->n_since_last_int) && (rxvq->int_deadline < now))
+ vhost_user_send_call(vm, rxvq);
+
+ /* only bit 0 of avail.flags is used so we don't want to deal with this
+ interface if any other bit is set */
+ if (PREDICT_FALSE(txvq->avail->flags & 0xFFFE))
+ return 0;
+
+ /* nothing to do */
+ if (txvq->avail->idx == txvq->last_avail_idx)
+ return 0;
+
+ cpu_index = os_get_cpu_number();
+
+  /* u16 modular arithmetic yields the correct count even when the
+     avail index has wrapped around */
+  n_left = (u16) (txvq->avail->idx - txvq->last_avail_idx);
+
+ if (PREDICT_FALSE(!vui->admin_up)) {
+ /* if intf is admin down, just drop all packets waiting in the ring */
+ txvq->last_avail_idx = txvq->last_used_idx = txvq->avail->idx;
+ CLIB_MEMORY_BARRIER();
+ txvq->used->idx = txvq->last_used_idx;
+ vhost_user_send_call(vm, txvq);
+
+ return 0;
+ }
+
+  if (PREDICT_FALSE(n_left > txvq->qsz)) { /* inconsistent avail index; bail out */
+ return 0;
+ }
+
+ if (PREDICT_FALSE(n_left > VLIB_FRAME_SIZE))
+ n_left = VLIB_FRAME_SIZE;
+
+ /* Make sure we have some RX buffers. */
+ {
+ uword l = vec_len (vum->rx_buffers[cpu_index]);
+ uword n_alloc;
+
+ if (l < n_left)
+ {
+ if (! vum->rx_buffers[cpu_index]) {
+ vec_alloc (vum->rx_buffers[cpu_index], 2 * VLIB_FRAME_SIZE );
+ }
+
+ n_alloc = vlib_buffer_alloc_from_free_list
+ (vm, vum->rx_buffers[cpu_index] + l, 2 * VLIB_FRAME_SIZE - l,
+ free_list_index);
+ if (n_alloc == 0)
+ return 0;
+ _vec_len (vum->rx_buffers[cpu_index]) = l + n_alloc;
+ }
+ }
+
+ qsz_mask = txvq->qsz - 1;
+
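+  /* qsz is a power of 2 (enforced in VHOST_USER_SET_VRING_NUM), so the
+     free-running u16 ring indices are mapped to slots by masking,
+     e.g. slot = idx & qsz_mask, and wrap-around needs no special case. */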
+ while (n_left > 0) {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left > 0 && n_left_to_next > 0) {
+ vlib_buffer_t * b;
+ u16 desc_chain_head = txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
+ u16 desc_current = desc_chain_head;
+ uword i_rx = vec_len (vum->rx_buffers[cpu_index]) - 1;
+
+ bi = vum->rx_buffers[cpu_index][i_rx];
+ b = vlib_get_buffer (vm, bi);
+
+ vlib_prefetch_buffer_with_index (vm, vum->rx_buffers[cpu_index][i_rx-1], STORE);
+
+ uword offset;
+ if (PREDICT_TRUE(vui->is_any_layout))
+ offset = vui->virtio_net_hdr_sz;
+ else if (!(txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT))
+        /* no ANY_LAYOUT, but header and data share this single descriptor */
+ offset = vui->virtio_net_hdr_sz;
+ else
+        /* no ANY_LAYOUT: first descriptor carries only the virtio header, skip it whole */
+ offset = txvq->desc[desc_current].len;
+
+      uword ptr=0; /* running count of payload bytes copied into the vlib buffer */
+
+ while(1) {
+ void * buffer_addr = map_guest_mem(vui, txvq->desc[desc_current].addr);
+ CLIB_PREFETCH (&txvq->desc[txvq->desc[desc_current].next], sizeof (vring_desc_t), READ);
+
+#if VHOST_USER_COPY_TX_HDR == 1
+ if (PREDICT_TRUE(offset)) {
+ rte_memcpy(b->pre_data, buffer_addr, sizeof(virtio_net_hdr_t)); /* 12 byte hdr is not used on tx */
+ }
+#endif
+
+ if (txvq->desc[desc_current].len > offset) {
+ u16 len = txvq->desc[desc_current].len - offset;
+
+ if (PREDICT_FALSE(len > VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES))
+ len = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+
+ rte_memcpy(vlib_buffer_get_current (b) + ptr,
+ buffer_addr + offset, len);
+ }
+ ptr += txvq->desc[desc_current].len - offset;
+ offset = 0;
+
+ /* if next flag is set, take next desc in the chain */
+ if (txvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT )
+ desc_current = txvq->desc[desc_current].next;
+ else
+ break;
+ }
+
+ txvq->last_avail_idx++;
+
+ /* returning buffer */
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].id = desc_chain_head;
+ txvq->used->ring[txvq->last_used_idx & qsz_mask].len = ptr + vui->virtio_net_hdr_sz;
+
+ txvq->last_used_idx++;
+
+ b->current_length = ptr;
+
+      if(PREDICT_FALSE(b->current_length < 14)) { /* runt: shorter than an ethernet header */
+ vlib_error_count(vm, vhost_user_input_node.index,
+ VHOST_USER_INPUT_FUNC_ERROR_UNDERSIZED_FRAME, 1);
+ goto skip_frame;
+ }
+
+      b->current_data = 0;
+      b->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ n_rx_bytes += ptr;
+ _vec_len (vum->rx_buffers[cpu_index]) = i_rx;
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See .../vlib/vlib/buffer.h
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b);
+
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = vui->sw_if_index;
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32)~0;
+ b->error = node->errors[0];
+
+ to_next[0] = bi;
+ to_next++;
+ n_left_to_next--;
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi, next_index);
+
+ if (PREDICT_FALSE (n_trace > n_rx_packets))
+ vec_add1 (vui->d_trace_buffers, bi);
+
+ n_rx_packets++;
+skip_frame:
+ n_left--;
+ }
+
+ /* give buffers back to driver */
+ CLIB_MEMORY_BARRIER();
+ txvq->used->idx = txvq->last_used_idx;
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (PREDICT_FALSE (vec_len (vui->d_trace_buffers) > 0))
+ {
+ vhost_user_rx_trace (vm, node, vui, VHOST_NET_VRING_IDX_TX);
+ vlib_set_trace_count (vm, node, n_trace - vec_len (vui->d_trace_buffers));
+ }
+
+ /* if no packets received we're done */
+ if(!n_rx_packets)
+ return 0;
+
+ /* interrupt (call) handling */
+ if((txvq->callfd > 0) && !(txvq->avail->flags & 1)) {
+ txvq->n_since_last_int += n_rx_packets;
+
+ if(txvq->n_since_last_int > vum->coalesce_frames)
+ vhost_user_send_call(vm, txvq);
+ }
+
+ /* increase rx counters */
+ vlib_increment_combined_counter
+ (vnet_main.interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ vui->sw_if_index,
+ n_rx_packets, n_rx_bytes);
+
+ return n_rx_packets;
+}
+
+static uword
+vhost_user_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ dpdk_main_t * dm = &dpdk_main;
+ vhost_user_intf_t * vui;
+ uword n_rx_packets = 0;
+ u32 cpu_index = os_get_cpu_number();
+ int i;
+
+ for(i = 0; i < vec_len(vum->vhost_user_interfaces); i++ )
+ {
+ vui = vec_elt_at_index(vum->vhost_user_interfaces, i);
+      /* skip interfaces that are down or polled by another input cpu */
+      if (!vui->is_up ||
+          (i % dm->input_cpu_count) != (cpu_index - dm->input_cpu_first_index))
+ continue;
+ n_rx_packets +=
+ vhost_user_if_input (vm, vum, vui, node);
+ }
+ return n_rx_packets;
+}
+
+VLIB_REGISTER_NODE (vhost_user_input_node) = {
+ .function = vhost_user_input,
+ .type = VLIB_NODE_TYPE_INPUT,
+ .name = "vhost-user-input",
+
+ /* Will be enabled if/when hardware is detected. */
+ .state = VLIB_NODE_STATE_DISABLED,
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_vhost_user_input_trace,
+
+ .n_errors = VHOST_USER_INPUT_FUNC_N_ERROR,
+ .error_strings = vhost_user_input_func_error_strings,
+
+ .n_next_nodes = VHOST_USER_RX_N_NEXT,
+ .next_nodes = {
+ [VHOST_USER_RX_NEXT_DROP] = "error-drop",
+ [VHOST_USER_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ },
+};
+
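+/*
+ * Interface TX: the mirror image of the input path. Frames from the vlib
+ * frame are copied into descriptor chains taken from the guest's RX ring,
+ * each prefixed by a zeroed virtio_net header (10 bytes, or 12 with
+ * MRG_RXBUF, in which case num_buffers is set to 1 because a frame never
+ * spans multiple used entries here). Packets are handled two at a time,
+ * with a single-packet loop for the remainder.
+ */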
+static uword
+vhost_user_intfc_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ u32 n_left = 0;
+ u16 used_index;
+ vhost_user_main_t * vum = &vhost_user_main;
+ uword n_packets = 0;
+ uword n_avail_desc;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ vhost_user_intf_t * vui = vec_elt_at_index (vum->vhost_user_interfaces, rd->dev_instance);
+ vhost_user_vring_t * rxvq = &vui->vrings[VHOST_NET_VRING_IDX_RX];
+ u16 qsz_mask;
+
+ if (PREDICT_FALSE(!vui->is_up))
+ goto done2;
+
+ if (PREDICT_FALSE(!rxvq->desc))
+ goto done2;
+
+ if (PREDICT_FALSE(vui->lockp != 0))
+ {
+ while (__sync_lock_test_and_set (vui->lockp, 1))
+ ;
+ }
+
+
+ /* only bit 0 of avail.flags is used so we don't want to deal with this
+ interface if any other bit is set */
+ if (PREDICT_FALSE(rxvq->avail->flags & 0xFFFE))
+ goto done2;
+
+ if (PREDICT_FALSE((rxvq->avail->idx == rxvq->last_avail_idx) ||
+ vui->sock_errno != 0)) {
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+ vlib_increment_simple_counter (cm, os_get_cpu_number(),
+ 0, frame->n_vectors);
+
+ vlib_error_count (vm, node->node_index,
+ VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF,
+ frame->n_vectors);
+ goto done2;
+ }
+
+  /* u16 modular arithmetic handles wrap of the avail index */
+  n_avail_desc = (u16) (rxvq->avail->idx - rxvq->last_avail_idx);
+
+ DBG_VQ("rxvq->avail->idx %d rxvq->last_avail_idx %d n_avail_desc %d",
+ rxvq->avail->idx, rxvq->last_avail_idx, n_avail_desc);
+
+ n_left = n_packets = frame->n_vectors;
+ if (PREDICT_FALSE(n_packets > n_avail_desc)) {
+ vlib_simple_counter_main_t * cm;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+ vlib_increment_simple_counter (cm, os_get_cpu_number(),
+ 0, frame->n_vectors);
+
+ vlib_error_count (vm, node->node_index,
+ VHOST_USER_TX_FUNC_ERROR_PKT_DROP_NOBUF,
+ n_packets - n_avail_desc);
+ n_left = n_packets = n_avail_desc;
+ }
+
+ used_index = rxvq->used->idx;
+ qsz_mask = rxvq->qsz - 1; /* qsz is always power of 2 */
+
+ while (n_left >= 4)
+ {
+ vlib_buffer_t * b0, * b1;
+ u16 desc_chain_head0,desc_chain_head1;
+ u16 desc_current0,desc_current1;
+ uword offset0, offset1;
+ u16 bytes_left0, bytes_left1;
+ void *buffer_addr0, *buffer_addr1;
+
+ vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD);
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ buffers+=2;
+ n_left-=2;
+
+ desc_current0 = desc_chain_head0 = rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
+ desc_current1 = desc_chain_head1 = rxvq->avail->ring[(rxvq->last_avail_idx+1) & qsz_mask];
+
+ offset0 = vui->virtio_net_hdr_sz;
+
+ offset1 = vui->virtio_net_hdr_sz;
+
+ bytes_left0 = b0->current_length;
+ bytes_left1 = b1->current_length;
+
+ buffer_addr0 = map_guest_mem(vui, rxvq->desc[desc_current0].addr);
+ buffer_addr1 = map_guest_mem(vui, rxvq->desc[desc_current1].addr);
+
+ if (PREDICT_FALSE(!buffer_addr0)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ if (PREDICT_FALSE(!buffer_addr1)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+
+ virtio_net_hdr_mrg_rxbuf_t * hdr0 = (virtio_net_hdr_mrg_rxbuf_t *) buffer_addr0;
+ virtio_net_hdr_mrg_rxbuf_t * hdr1 = (virtio_net_hdr_mrg_rxbuf_t *) buffer_addr1;
+ hdr0->hdr.flags = 0;
+ hdr1->hdr.flags = 0;
+ hdr0->hdr.gso_type = 0;
+ hdr1->hdr.gso_type = 0;
+
+ if (vui->virtio_net_hdr_sz == 12) {
+ hdr0->num_buffers = 1;
+ hdr1->num_buffers = 1;
+ }
+
+ buffer_addr0 += offset0;
+ buffer_addr1 += offset1;
+
+ if (PREDICT_FALSE(!vui->is_any_layout && rxvq->desc[desc_current0].flags & VIRTQ_DESC_F_NEXT))
+ rxvq->desc[desc_current0].len = vui->virtio_net_hdr_sz;
+
+ if (PREDICT_FALSE(!vui->is_any_layout && rxvq->desc[desc_current1].flags & VIRTQ_DESC_F_NEXT))
+ rxvq->desc[desc_current1].len = vui->virtio_net_hdr_sz;
+
+ while(1) {
+ if (rxvq->desc[desc_current0].len - offset0 > 0 ) {
+ u16 bytes_to_copy = bytes_left0 > (rxvq->desc[desc_current0].len - offset0) ? (rxvq->desc[desc_current0].len - offset0) : bytes_left0;
+ rte_memcpy(buffer_addr0, vlib_buffer_get_current (b0) + b0->current_length - bytes_left0, bytes_to_copy);
+ bytes_left0 -= bytes_to_copy;
+ }
+
+ if (rxvq->desc[desc_current0].flags & VIRTQ_DESC_F_NEXT ) {
+ offset0 = 0;
+        desc_current0 = rxvq->desc[desc_current0].next;
+ buffer_addr0 = map_guest_mem(vui, rxvq->desc[desc_current0].addr);
+ if (PREDICT_FALSE(!buffer_addr0)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ }
+ else
+ break;
+ }
+
+ while(1) {
+ if (rxvq->desc[desc_current1].len - offset1 > 0 ) {
+ u16 bytes_to_copy = bytes_left1 > (rxvq->desc[desc_current1].len - offset1) ? (rxvq->desc[desc_current1].len - offset1) : bytes_left1;
+ rte_memcpy(buffer_addr1, vlib_buffer_get_current (b1) + b1->current_length - bytes_left1, bytes_to_copy);
+ bytes_left1 -= bytes_to_copy;
+ }
+
+ if (rxvq->desc[desc_current1].flags & VIRTQ_DESC_F_NEXT ) {
+ offset1 = 0;
+ desc_current1 = rxvq->desc[desc_current1].next;
+ buffer_addr1 = map_guest_mem(vui, rxvq->desc[desc_current1].addr);
+ if (PREDICT_FALSE(!buffer_addr1)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ }
+ else
+ break;
+ }
+
+ rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head0;
+ rxvq->used->ring[used_index & qsz_mask].len = b0->current_length + vui->virtio_net_hdr_sz;
+ used_index+=1;
+ rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head1;
+ rxvq->used->ring[used_index & qsz_mask].len = b1->current_length + vui->virtio_net_hdr_sz;
+ used_index+=1;
+ rxvq->last_avail_idx+=2;
+ }
+
+ while (n_left > 0)
+ {
+ vlib_buffer_t * b0;
+ u16 desc_chain_head;
+ u16 desc_current;
+ void *buffer_addr;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers++;
+ n_left--;
+
+ desc_chain_head = rxvq->avail->ring[rxvq->last_avail_idx & qsz_mask];
+ desc_current = desc_chain_head;
+
+ uword offset = vui->virtio_net_hdr_sz;
+
+ u16 bytes_left = b0->current_length;
+ buffer_addr = map_guest_mem(vui, rxvq->desc[desc_current].addr);
+ if (PREDICT_FALSE(!buffer_addr)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+
+ virtio_net_hdr_mrg_rxbuf_t * hdr = (virtio_net_hdr_mrg_rxbuf_t *) buffer_addr;
+ hdr->hdr.flags = 0;
+ hdr->hdr.gso_type = 0;
+
+ if (vui->virtio_net_hdr_sz == 12) {
+ hdr->num_buffers = 1;
+ }
+
+ buffer_addr += offset;
+
+ if (PREDICT_FALSE(!vui->is_any_layout && rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT))
+ rxvq->desc[desc_current].len = vui->virtio_net_hdr_sz;
+
+ while(1) {
+ if (rxvq->desc[desc_current].len - offset > 0 ) {
+ u16 bytes_to_copy = bytes_left > (rxvq->desc[desc_current].len - offset) ? (rxvq->desc[desc_current].len - offset) : bytes_left;
+ rte_memcpy(buffer_addr, vlib_buffer_get_current (b0) + b0->current_length - bytes_left, bytes_to_copy);
+ bytes_left -= bytes_to_copy;
+ }
+
+ if (rxvq->desc[desc_current].flags & VIRTQ_DESC_F_NEXT ) {
+ offset = 0;
+ desc_current = rxvq->desc[desc_current].next;
+ buffer_addr = map_guest_mem(vui, rxvq->desc[desc_current].addr);
+ if (PREDICT_FALSE(!buffer_addr)) {
+ vlib_error_count (vm, node->node_index, VHOST_USER_TX_FUNC_ERROR_MMAP_FAIL, 1);
+ goto done;
+ }
+ }
+ else
+ break;
+ }
+
+ rxvq->used->ring[used_index & qsz_mask].id = desc_chain_head;
+ rxvq->used->ring[used_index & qsz_mask].len = b0->current_length + vui->virtio_net_hdr_sz;
+
+ used_index++;
+ rxvq->last_avail_idx++;
+ }
+
+done:
+ CLIB_MEMORY_BARRIER();
+ rxvq->used->idx = used_index;
+
+ /* interrupt (call) handling */
+ if((rxvq->callfd > 0) && !(rxvq->avail->flags & 1)) {
+ rxvq->n_since_last_int += n_packets - n_left;
+
+ if(rxvq->n_since_last_int > vum->coalesce_frames)
+ vhost_user_send_call(vm, rxvq);
+ }
+
+done2:
+
+ if (PREDICT_FALSE(vui->lockp != 0))
+ *vui->lockp = 0;
+
+ vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
+ return frame->n_vectors;
+}
+
+static clib_error_t *
+vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ vnet_hw_interface_t * hif = vnet_get_hw_interface (vnm, hw_if_index);
+ uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui = vec_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance);
+
+ vui->admin_up = is_up;
+
+ if (is_up)
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+
+ return /* no error */ 0;
+}
+
+VNET_DEVICE_CLASS (vhost_user_dev_class,static) = {
+ .name = "vhost-user",
+ .tx_function = vhost_user_intfc_tx,
+ .tx_function_n_errors = VHOST_USER_TX_FUNC_N_ERROR,
+ .tx_function_error_strings = vhost_user_tx_func_error_strings,
+ .format_device_name = format_vhost_user_interface_name,
+ .name_renumber = vhost_user_name_renumber,
+ .admin_up_down_function = vhost_user_interface_admin_up_down,
+};
+
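+/*
+ * Housekeeping process node. For client-mode interfaces it retries the
+ * connect() to the master's unix socket every 3 seconds and, once
+ * connected, uses getsockopt(SO_ERROR) as a liveness probe, tearing the
+ * interface down when the socket dies. Server-mode sockets are handled
+ * by the accept() path instead.
+ */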
+static uword
+vhost_user_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ struct sockaddr_un sun;
+ int sockfd;
+ unix_file_t template = {0};
+ f64 timeout = 3153600000.0 /* 100 years */;
+ uword *event_data = 0;
+
+ sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ sun.sun_family = AF_UNIX;
+ template.read_function = vhost_user_socket_read;
+ template.error_function = vhost_user_socket_error;
+
+
+ if (sockfd < 0)
+ return 0;
+
+ while (1) {
+ vlib_process_wait_for_event_or_clock (vm, timeout);
+ vlib_process_get_events (vm, &event_data);
+ vec_reset_length (event_data);
+
+ timeout = 3.0;
+
+ vec_foreach (vui, vum->vhost_user_interfaces) {
+
+ if (vui->sock_is_server || !vui->active)
+ continue;
+
+ if (vui->unix_fd == -1) {
+ /* try to connect */
+
+ strncpy(sun.sun_path, (char *) vui->sock_filename, sizeof(sun.sun_path) - 1);
+
+ if (connect(sockfd, (struct sockaddr *) &sun, sizeof(struct sockaddr_un)) == 0) {
+ vui->sock_errno = 0;
+ vui->unix_fd = sockfd;
+ template.file_descriptor = sockfd;
+ vui->unix_file_index = unix_file_add (&unix_main, &template);
+ hash_set (vum->vhost_user_interface_index_by_sock_fd, sockfd, vui - vum->vhost_user_interfaces);
+
+ sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (sockfd < 0)
+ return 0;
+ }
+ else {
+ vui->sock_errno = errno;
+ }
+ } else {
+ /* check if socket is alive */
+ int error = 0;
+ socklen_t len = sizeof (error);
+ int retval = getsockopt(vui->unix_fd, SOL_SOCKET, SO_ERROR, &error, &len);
+
+ if (retval)
+ vhost_user_if_disconnect(vui);
+ }
+ }
+ }
+ return 0;
+}
+
+VLIB_REGISTER_NODE (vhost_user_process_node,static) = {
+ .function = vhost_user_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "vhost-user-process",
+};
+
+int vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ uword *p = NULL;
+ int rv = 0;
+
+ p = hash_get (vum->vhost_user_interface_index_by_sw_if_index,
+ sw_if_index);
+ if (p == 0) {
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+ } else {
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+ }
+
+ // interface is inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ vhost_user_if_disconnect(vui);
+ // add to inactive interface list
+ vec_add1 (vum->vhost_user_inactive_interfaces_index, p[0]);
+
+ // reset renumbered iface
+ if (p[0] < vec_len (vum->show_dev_instance_by_real_dev_instance))
+ vum->show_dev_instance_by_real_dev_instance[p[0]] = ~0;
+
+ ethernet_delete_interface (vnm, vui->hw_if_index);
+ DBG_SOCK ("deleted (deactivated) vhost-user interface instance %d", p[0]);
+
+ return rv;
+}
+
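+// In server mode we create, bind and listen on the unix socket and the
+// master (typically qemu) connects to us; in client mode the
+// vhost-user-process node above dials a socket created by the master.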
+// init server socket on specified sock_filename
+static int vhost_user_init_server_sock(const char * sock_filename, int *sockfd)
+{
+ int rv = 0, len;
+ struct sockaddr_un un;
+ int fd;
+ /* create listening socket */
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+ if (fd < 0) {
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ }
+
+ un.sun_family = AF_UNIX;
+  strncpy((char *) un.sun_path, (char *) sock_filename, sizeof(un.sun_path) - 1);
+  un.sun_path[sizeof(un.sun_path) - 1] = 0;
+
+ /* remove if exists */
+ unlink( (char *) sock_filename);
+
+  len = strlen((char *) un.sun_path) + sizeof(un.sun_family);
+
+ if (bind(fd, (struct sockaddr *) &un, len) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_2;
+ goto error;
+ }
+
+ if (listen(fd, 1) == -1) {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_3;
+ goto error;
+ }
+
+ unix_file_t template = {0};
+ template.read_function = vhost_user_socksvr_accept_ready;
+ template.file_descriptor = fd;
+ unix_file_add (&unix_main, &template);
+ *sockfd = fd;
+ return rv;
+
+error:
+ close(fd);
+ return rv;
+}
+
+// get new vhost_user_intf_t from inactive interfaces or create new one
+static vhost_user_intf_t *vhost_user_vui_new()
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui = NULL;
+ int inactive_cnt = vec_len(vum->vhost_user_inactive_interfaces_index);
+ // if there are any inactive ifaces
+ if (inactive_cnt > 0) {
+ // take last
+ u32 vui_idx = vum->vhost_user_inactive_interfaces_index[inactive_cnt - 1];
+ if (vec_len(vum->vhost_user_interfaces) > vui_idx) {
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, vui_idx);
+ DBG_SOCK("reusing inactive vhost-user interface index %d", vui_idx);
+ }
+ // "remove" from inactive list
+ _vec_len(vum->vhost_user_inactive_interfaces_index) -= 1;
+ }
+
+ // vui was not retrieved from inactive ifaces - create new
+ if (!vui)
+ vec_add2 (vum->vhost_user_interfaces, vui, 1);
+ return vui;
+}
+
+// create ethernet interface for vhost user intf
+static void vhost_user_create_ethernet(vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_t *vui)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ u8 hwaddr[6];
+ clib_error_t * error;
+
+ /* create hw and sw interface */
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (hwaddr+2, &rnd, sizeof(rnd));
+ hwaddr[0] = 2;
+ hwaddr[1] = 0xfe;
+ }
+
+ error = ethernet_register_interface
+ (vnm,
+ vhost_user_dev_class.index,
+ vui - vum->vhost_user_interfaces /* device instance */,
+ hwaddr /* ethernet address */,
+ &vui->hw_if_index,
+ 0 /* flag change */);
+ if (error)
+ clib_error_report (error);
+}
+
+// initialize vui with specified attributes
+static void vhost_user_vui_init(vnet_main_t * vnm,
+ vhost_user_intf_t *vui, int sockfd,
+ const char * sock_filename,
+ u8 is_server, u64 feature_mask,
+ u32 * sw_if_index)
+{
+ vnet_sw_interface_t * sw;
+ sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index);
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ vui->unix_fd = sockfd;
+ vui->sw_if_index = sw->sw_if_index;
+ vui->num_vrings = 2;
+ vui->sock_is_server = is_server;
+ strncpy(vui->sock_filename, sock_filename, ARRAY_LEN(vui->sock_filename)-1);
+ vui->sock_errno = 0;
+ vui->is_up = 0;
+ vui->feature_mask = feature_mask;
+ vui->active = 1;
+ vui->unix_file_index = ~0;
+
+ vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0);
+
+ if (sw_if_index)
+ *sw_if_index = vui->sw_if_index;
+
+ if (tm->n_vlib_mains > 1)
+ {
+ vui->lockp = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+ memset ((void *) vui->lockp, 0, CLIB_CACHE_LINE_BYTES);
+ }
+}
+
+// register vui and start polling on it
+static void vhost_user_vui_register(vlib_main_t * vm, vhost_user_intf_t *vui)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ dpdk_main_t * dm = &dpdk_main;
+ int cpu_index;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ hash_set (vum->vhost_user_interface_index_by_listener_fd, vui->unix_fd,
+ vui - vum->vhost_user_interfaces);
+ hash_set (vum->vhost_user_interface_index_by_sw_if_index, vui->sw_if_index,
+ vui - vum->vhost_user_interfaces);
+
+ /* start polling */
+ cpu_index = dm->input_cpu_first_index +
+ (vui - vum->vhost_user_interfaces) % dm->input_cpu_count;
+
+ if (tm->n_vlib_mains == 1)
+ vlib_node_set_state (vm, vhost_user_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+ else if (!dm->have_io_threads)
+ vlib_node_set_state (vlib_mains[cpu_index], vhost_user_input_node.index,
+ VLIB_NODE_STATE_POLLING);
+
+ /* tell process to start polling for sockets */
+ vlib_process_signal_event(vm, vhost_user_process_node.index, 0, 0);
+}
+
+int vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 * sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ vhost_user_intf_t * vui = NULL;
+ dpdk_main_t * dm = &dpdk_main;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+ u32 sw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+
+ if (tm->n_vlib_mains > 1 && dm->have_io_threads)
+ {
+ clib_warning("vhost-user interfaces are not supported with multiple io threads");
+ return -1;
+ }
+
+ if (is_server) {
+ if ((rv = vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ vui = vhost_user_vui_new ();
+ ASSERT(vui != NULL);
+
+ vhost_user_create_ethernet (vnm, vm, vui);
+ vhost_user_vui_init (vnm, vui, sockfd, sock_filename, is_server,
+ feature_mask, &sw_if_idx);
+
+ if (renumber) {
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+ }
+
+ vhost_user_vui_register (vm, vui);
+
+ if (sw_if_index)
+ *sw_if_index = sw_if_idx;
+
+ return rv;
+}
+
+int vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename,
+ u8 is_server,
+ u32 sw_if_index,
+ u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui = NULL;
+ u32 sw_if_idx = ~0;
+ int sockfd = -1;
+ int rv = 0;
+ uword *p = NULL;
+
+ p = hash_get (vum->vhost_user_interface_index_by_sw_if_index,
+ sw_if_index);
+ if (p == 0) {
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+ } else {
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, p[0]);
+ }
+
+ // interface is inactive
+ vui->active = 0;
+ // disconnect interface sockets
+ vhost_user_if_disconnect(vui);
+
+ if (is_server) {
+ if ((rv = vhost_user_init_server_sock (sock_filename, &sockfd)) != 0) {
+ return rv;
+ }
+ }
+
+ vhost_user_vui_init (vnm, vui, sockfd, sock_filename, is_server,
+ feature_mask, &sw_if_idx);
+
+ if (renumber) {
+ vnet_interface_name_renumber (sw_if_idx, custom_dev_instance);
+ }
+
+ vhost_user_vui_register (vm, vui);
+
+ return rv;
+}
+
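+/*
+ * CLI handler for creating a vhost-user interface. Argument syntax, as
+ * parsed below:
+ *
+ *   socket <path> [server] [feature-mask 0x<hex>] [renumber <instance>]
+ *
+ * The command itself is registered from the dpdk CLI code (see the
+ * "CLI commands to be used from dpdk" note in vhost-user.h).
+ */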
+clib_error_t *
+vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * sock_filename = NULL;
+ u32 sw_if_index;
+ u8 is_server = 0;
+ u64 feature_mask = (u64)~0;
+ u8 renumber = 0;
+ u32 custom_dev_instance = ~0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "socket %s", &sock_filename))
+ ;
+ else if (unformat (line_input, "server"))
+ is_server = 1;
+ else if (unformat (line_input, "feature-mask 0x%llx", &feature_mask))
+ ;
+ else if (unformat (line_input, "renumber %d", &custom_dev_instance)) {
+ renumber = 1;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main();
+
+ vhost_user_create_if(vnm, vm, (char *)sock_filename,
+ is_server, &sw_if_index, feature_mask,
+ renumber, custom_dev_instance);
+
+ vec_free(sock_filename);
+
+ return 0;
+}
+
+clib_error_t *
+vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 sw_if_index = ~0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "sw_if_index %d", &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ vnet_main_t *vnm = vnet_get_main();
+
+ vhost_user_delete_if(vnm, vm, sw_if_index);
+
+ return 0;
+}
+
+int vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm, vhost_user_intf_details_t **out_vuids)
+{
+ int rv = 0;
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ vhost_user_intf_details_t * r_vuids = NULL;
+ vhost_user_intf_details_t * vuid = NULL;
+ u32 * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ u8 *s = NULL;
+ int i;
+
+ if (!out_vuids)
+ return -1;
+
+ vec_foreach (vui, vum->vhost_user_interfaces) {
+ if (vui->active)
+ vec_add1(hw_if_indices, vui->hw_if_index);
+ }
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance);
+
+ vec_add2(r_vuids, vuid, 1);
+ vuid->sw_if_index = vui->sw_if_index;
+ vuid->virtio_net_hdr_sz = vui->virtio_net_hdr_sz;
+ vuid->features = vui->features;
+ vuid->is_server = vui->sock_is_server;
+ vuid->num_regions = vui->nregions;
+ vuid->sock_errno = vui->sock_errno;
+ strncpy((char *)vuid->sock_filename, (char *)vui->sock_filename,
+ ARRAY_LEN(vuid->sock_filename)-1);
+
+ s = format (s, "%v%c", hi->name, 0);
+
+ strncpy((char *)vuid->if_name, (char *)s,
+ ARRAY_LEN(vuid->if_name)-1);
+ _vec_len(s) = 0;
+ }
+
+ vec_free (s);
+ vec_free (hw_if_indices);
+
+ *out_vuids = r_vuids;
+
+ return rv;
+}
+
+clib_error_t *
+show_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+ u32 hw_if_index, * hw_if_indices = 0;
+ vnet_hw_interface_t * hi;
+ int i, j, q;
+ int show_descr = 0;
+ struct feat_struct { u8 bit; char *str;};
+ struct feat_struct *feat_entry;
+
+ static struct feat_struct feat_array[] = {
+#define _(s,b) { .str = #s, .bit = b, },
+ foreach_virtio_net_feature
+#undef _
+ { .str = NULL }
+ };
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) {
+ vec_add1 (hw_if_indices, hw_if_index);
+ vlib_cli_output(vm, "add %d", hw_if_index);
+ }
+ else if (unformat (input, "descriptors") || unformat (input, "desc") )
+ show_descr = 1;
+ else {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+ if (vec_len (hw_if_indices) == 0) {
+ vec_foreach (vui, vum->vhost_user_interfaces) {
+ if (vui->active)
+ vec_add1(hw_if_indices, vui->hw_if_index);
+ }
+ }
+ vlib_cli_output (vm, "Virtio vhost-user interfaces");
+ vlib_cli_output (vm, "Global:\n coalesce frames %d time %e\n\n",
+ vum->coalesce_frames, vum->coalesce_time);
+
+ for (i = 0; i < vec_len (hw_if_indices); i++) {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vui = vec_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance);
+ vlib_cli_output (vm, "Interface: %s (ifindex %d)",
+ hi->name, hw_if_indices[i]);
+
+ vlib_cli_output (vm, "virtio_net_hdr_sz %d\n features (0x%llx): \n",
+ vui->virtio_net_hdr_sz, vui->features);
+
+ feat_entry = (struct feat_struct *) &feat_array;
+ while(feat_entry->str) {
+ if (vui->features & (1 << feat_entry->bit))
+ vlib_cli_output (vm, " %s (%d)", feat_entry->str, feat_entry->bit);
+ feat_entry++;
+ }
+
+ vlib_cli_output (vm, "\n");
+
+
+ vlib_cli_output (vm, " socket filename %s type %s errno \"%s\"\n\n",
+ vui->sock_filename, vui->sock_is_server ? "server" : "client",
+ strerror(vui->sock_errno));
+
+ vlib_cli_output (vm, " Memory regions (total %d)\n", vui->nregions);
+
+ if (vui->nregions){
+ vlib_cli_output(vm, " region fd guest_phys_addr memory_size userspace_addr mmap_offset mmap_addr\n");
+ vlib_cli_output(vm, " ====== ===== ================== ================== ================== ================== ==================\n");
+ }
+ for (j = 0; j < vui->nregions; j++) {
+ vlib_cli_output(vm, " %d %-5d 0x%016lx 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", j,
+ vui->region_mmap_fd[j],
+ vui->regions[j].guest_phys_addr,
+ vui->regions[j].memory_size,
+ vui->regions[j].userspace_addr,
+ vui->regions[j].mmap_offset,
+ (u64) vui->region_mmap_addr[j]);
+ }
+ for (q = 0; q < vui->num_vrings; q++) {
+ vlib_cli_output(vm, "\n Virtqueue %d\n", q);
+
+ vlib_cli_output(vm, " qsz %d last_avail_idx %d last_used_idx %d\n",
+ vui->vrings[q].qsz,
+ vui->vrings[q].last_avail_idx,
+ vui->vrings[q].last_used_idx);
+
+ if (vui->vrings[q].avail && vui->vrings[q].used)
+ vlib_cli_output(vm, " avail.flags %x avail.idx %d used.flags %x used.idx %d\n",
+ vui->vrings[q].avail->flags,
+ vui->vrings[q].avail->idx,
+ vui->vrings[q].used->flags,
+ vui->vrings[q].used->idx);
+
+ vlib_cli_output(vm, " kickfd %d callfd %d errfd %d\n",
+ vui->vrings[q].kickfd,
+ vui->vrings[q].callfd,
+ vui->vrings[q].errfd);
+
+ if (show_descr) {
+ vlib_cli_output(vm, "\n descriptor table:\n");
+ vlib_cli_output(vm, " id addr len flags next user_addr\n");
+ vlib_cli_output(vm, " ===== ================== ===== ====== ===== ==================\n");
+ for(j = 0; j < vui->vrings[q].qsz; j++) {
+ vlib_cli_output(vm, " %-5d 0x%016lx %-5d 0x%04x %-5d 0x%016lx\n",
+ j,
+ vui->vrings[q].desc[j].addr,
+ vui->vrings[q].desc[j].len,
+ vui->vrings[q].desc[j].flags,
+ vui->vrings[q].desc[j].next,
+ (u64) map_guest_mem(vui, vui->vrings[q].desc[j].addr));}
+ }
+ }
+ vlib_cli_output (vm, "\n");
+ }
+done:
+ vec_free (hw_if_indices);
+ return error;
+}
+
+static clib_error_t *
+vhost_user_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "coalesce-frames %d", &vum->coalesce_frames))
+ ;
+ else if (unformat (input, "coalesce-time %f", &vum->coalesce_time))
+ ;
+ else if (unformat (input, "dont-dump-memory"))
+ vum->dont_dump_vhost_user_memory = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return 0;
+}
+
+/* vhost-user { ... } configuration. */
+VLIB_CONFIG_FUNCTION (vhost_user_config, "vhost-user");
+
+void
+vhost_user_unmap_all (void)
+{
+ vhost_user_main_t * vum = &vhost_user_main;
+ vhost_user_intf_t * vui;
+
+ if (vum->dont_dump_vhost_user_memory)
+ {
+ vec_foreach (vui, vum->vhost_user_interfaces)
+ {
+ unmap_all_mem_regions(vui);
+ }
+ }
+}
diff --git a/vnet/vnet/devices/virtio/vhost-user.h b/vnet/vnet/devices/virtio/vhost-user.h
new file mode 100644
index 00000000000..3b57bcbfc16
--- /dev/null
+++ b/vnet/vnet/devices/virtio/vhost-user.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __VIRTIO_VHOST_USER_H__
+#define __VIRTIO_VHOST_USER_H__
+/* vhost-user data structures */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+#define VHOST_USER_MSG_HDR_SZ 12
+#define VHOST_VRING_MAX_SIZE 32768
+#define VHOST_NET_VRING_IDX_RX 0
+#define VHOST_NET_VRING_IDX_TX 1
+#define VHOST_NET_VRING_NUM 2
+
+#define VIRTQ_DESC_F_NEXT 1
+
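+/* Feature bits we may offer during negotiation: MRG_RXBUF switches the
+   per-packet virtio_net header from 10 to 12 bytes (adding num_buffers);
+   ANY_LAYOUT lets the header and packet data share a single descriptor. */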
+#define foreach_virtio_net_feature \
+ _ (VIRTIO_NET_F_MRG_RXBUF, 15) \
+ _ (VIRTIO_F_ANY_LAYOUT, 27)
+
+typedef enum {
+#define _(f,n) FEAT_##f = (n),
+ foreach_virtio_net_feature
+#undef _
+} virtio_net_feature_t;
+
+int vhost_user_create_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename, u8 is_server,
+ u32 * sw_if_index, u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int vhost_user_modify_if(vnet_main_t * vnm, vlib_main_t * vm,
+ const char * sock_filename, u8 is_server,
+ u32 sw_if_index, u64 feature_mask,
+ u8 renumber, u32 custom_dev_instance);
+int vhost_user_delete_if(vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index);
+
+typedef struct vhost_user_memory_region {
+ u64 guest_phys_addr;
+ u64 memory_size;
+ u64 userspace_addr;
+ u64 mmap_offset;
+} vhost_user_memory_region_t;
+
+typedef struct vhost_user_memory {
+ u32 nregions;
+ u32 padding;
+ vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
+} vhost_user_memory_t;
+
+typedef struct vhost_vring_state {
+ unsigned int index, num;
+} vhost_vring_state_t;
+
+typedef struct vhost_vring_addr {
+ unsigned int index, flags;
+ u64 desc_user_addr, used_user_addr, avail_user_addr, log_guest_addr;
+} vhost_vring_addr_t;
+
+typedef enum vhost_user_req {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_MAX
+} vhost_user_req_t;
+
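+// Ring layout (legacy virtio): the driver posts descriptor chain heads on
+// the avail ring, the device consumes them and returns each head with the
+// number of bytes written on the used ring. Chains are linked through the
+// 'next' index for as long as VIRTQ_DESC_F_NEXT is set.
+//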
+// vring_desc I/O buffer descriptor
+typedef struct {
+ uint64_t addr; // packet data buffer address
+ uint32_t len; // packet data buffer size
+  uint16_t flags; // VIRTQ_DESC_F_* flags (e.g. VIRTQ_DESC_F_NEXT)
+  uint16_t next;  // index of the next descriptor, valid when F_NEXT is set
+} __attribute ((packed)) vring_desc_t;
+
+typedef struct {
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[VHOST_VRING_MAX_SIZE];
+} __attribute ((packed)) vring_avail_t;
+
+typedef struct {
+ uint16_t flags;
+ uint16_t idx;
+ struct /* vring_used_elem */ {
+ uint32_t id;
+ uint32_t len;
+ } ring[VHOST_VRING_MAX_SIZE];
+} __attribute ((packed)) vring_used_t;
+
+typedef struct {
+ u8 flags;
+ u8 gso_type;
+ u16 hdr_len;
+ u16 gso_size;
+ u16 csum_start;
+ u16 csum_offset;
+} __attribute ((packed)) virtio_net_hdr_t;
+
+typedef struct {
+ virtio_net_hdr_t hdr;
+ u16 num_buffers;
+} __attribute ((packed)) virtio_net_hdr_mrg_rxbuf_t;
+
+typedef struct vhost_user_msg {
+ vhost_user_req_t request;
+ u32 flags;
+ u32 size;
+ union {
+ u64 u64;
+ vhost_vring_state_t state;
+ vhost_vring_addr_t addr;
+ vhost_user_memory_t memory;
+ };
+} __attribute ((packed)) vhost_user_msg_t;
+
+typedef struct {
+ u32 qsz;
+ u16 last_avail_idx;
+ u16 last_used_idx;
+ vring_desc_t *desc;
+ vring_avail_t *avail;
+ vring_used_t *used;
+ int callfd;
+ int kickfd;
+ int errfd;
+ u32 callfd_idx;
+ u32 n_since_last_int;
+ f64 int_deadline;
+} vhost_user_vring_t;
+
+typedef struct {
+ CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
+ volatile u32 * lockp;
+ u32 is_up;
+ u32 admin_up;
+ u32 unix_fd;
+ u32 unix_file_index;
+ u32 client_fd;
+ char sock_filename[256];
+ int sock_errno;
+ u8 sock_is_server;
+ u32 hw_if_index, sw_if_index;
+ u8 active;
+
+ u32 nregions;
+ u64 features;
+ u64 feature_mask;
+ u32 num_vrings;
+ vhost_user_memory_region_t regions[VHOST_MEMORY_MAX_NREGIONS];
+ void * region_mmap_addr[VHOST_MEMORY_MAX_NREGIONS];
+ u32 region_mmap_fd[VHOST_MEMORY_MAX_NREGIONS];
+ vhost_user_vring_t vrings[2];
+ int virtio_net_hdr_sz;
+ int is_any_layout;
+ u32 * d_trace_buffers;
+} vhost_user_intf_t;
+
+typedef struct {
+ u32 ** rx_buffers;
+ u32 mtu_bytes;
+ vhost_user_intf_t * vhost_user_interfaces;
+ u32 * vhost_user_inactive_interfaces_index;
+ uword * vhost_user_interface_index_by_listener_fd;
+ uword * vhost_user_interface_index_by_sock_fd;
+ uword * vhost_user_interface_index_by_sw_if_index;
+ u32 * show_dev_instance_by_real_dev_instance;
+ u32 coalesce_frames;
+ f64 coalesce_time;
+ int dont_dump_vhost_user_memory;
+} vhost_user_main_t;
+
+typedef struct {
+ u8 if_name[64];
+ u32 sw_if_index;
+ u32 virtio_net_hdr_sz;
+ u64 features;
+ u8 is_server;
+ u8 sock_filename[256];
+ u32 num_regions;
+ int sock_errno;
+} vhost_user_intf_details_t;
+
+int vhost_user_dump_ifs(vnet_main_t * vnm, vlib_main_t * vm,
+ vhost_user_intf_details_t **out_vuids);
+
+// CLI commands to be used from dpdk
+clib_error_t *
+vhost_user_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+clib_error_t *
+vhost_user_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+clib_error_t *
+show_vhost_user_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd);
+
+#endif
diff --git a/vnet/vnet/dhcp/client.c b/vnet/vnet/dhcp/client.c
new file mode 100644
index 00000000000..727b5165bad
--- /dev/null
+++ b/vnet/vnet/dhcp/client.c
@@ -0,0 +1,960 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vnet/dhcp/proxy.h>
+
+dhcp_client_main_t dhcp_client_main;
+static u8 * format_dhcp_client_state (u8 * s, va_list * va);
+static vlib_node_registration_t dhcp_client_process_node;
+
+void __attribute__((weak))
+api_config_default_ip_route (u8 is_ipv6, u8 is_add, u32 vrf_id,
+ u32 sw_if_index, u8 *next_hop_addr)
+{
+ /* dummy function */
+ return;
+}
+
+static void
+dhcp_client_acquire_address (dhcp_client_main_t * dcm, dhcp_client_t * c)
+{
+ /*
+ * Install any/all info gleaned from dhcp, right here
+ */
+ ip4_add_del_interface_address (dcm->vlib_main, c->sw_if_index,
+ (void *) &c->leased_address,
+ c->subnet_mask_width, 0 /*is_del*/);
+}
+
+static void
+dhcp_client_release_address (dhcp_client_main_t * dcm, dhcp_client_t * c)
+{
+ /*
+ * Remove any/all info gleaned from dhcp, right here. Caller(s)
+ * have not wiped out the info yet.
+ */
+
+ ip4_add_del_interface_address (dcm->vlib_main, c->sw_if_index,
+ (void *) &c->leased_address,
+ c->subnet_mask_width, 1 /*is_del*/);
+}
+
+static void set_l2_rewrite (dhcp_client_main_t * dcm, dhcp_client_t * c)
+{
+ vnet_main_t * vnm = dcm->vnet_main;
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, c->sw_if_index);
+ vnet_hw_interface_class_t * hc =
+ vnet_get_hw_interface_class (vnm, hw->hw_class_index);
+ u32 n_rw;
+
+ /* Acquire the L2 rewrite string for the indicated sw_if_index */
+ vec_validate (c->l2_rewrite, 32);
+ ASSERT (hc->set_rewrite);
+ n_rw = hc->set_rewrite (dcm->vnet_main, c->sw_if_index,
+ VNET_L3_PACKET_TYPE_IP4,
+ 0 /* broadcast */, c->l2_rewrite,
+ vec_len(c->l2_rewrite));
+
+ _vec_len (c->l2_rewrite) = n_rw;
+}
+
+/*
+ * dhcp_client_for_us - server-to-client callback.
+ * Called from proxy_node.c:dhcp_proxy_to_client_input().
+ * This function first decides whether the packet in question is
+ * actually for the dhcp client code, since we may also be acting as
+ * a dhcp proxy. Ay caramba, what a folly!
+ */
+int dhcp_client_for_us (u32 bi, vlib_buffer_t * b,
+ ip4_header_t * ip,
+ udp_header_t * udp,
+ dhcp_header_t * dhcp)
+{
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+ vlib_main_t * vm = dcm->vlib_main;
+ dhcp_client_t * c;
+ uword * p;
+ f64 now = vlib_time_now (dcm->vlib_main);
+ u8 dhcp_message_type = 0;
+ dhcp_option_t * o;
+
+ /*
+ * Doing dhcp client on this interface?
+ * Presumably we will always receive dhcp client for-us pkts on
+ * the interface that's asking for an address.
+ */
+ p = hash_get (dcm->client_by_sw_if_index,
+ vnet_buffer(b)->sw_if_index [VLIB_RX]);
+ if (p == 0)
+ return 0; /* no */
+
+ c = pool_elt_at_index (dcm->clients, p[0]);
+
+ /* Mixing dhcp relay and dhcp proxy? DGMS... */
+ if (c->state == DHCP_BOUND && c->retry_count == 0)
+ return 0;
+
+ /* parse through the packet, learn what we can */
+ if (dhcp->your_ip_address.as_u32)
+ c->leased_address.as_u32 = dhcp->your_ip_address.as_u32;
+
+ o = (dhcp_option_t *) dhcp->options;
+
+ while (o->option != 0xFF /* end of options */ &&
+ (u8 *) o < (b->data + b->current_data + b->current_length))
+ {
+ switch (o->option)
+ {
+ case 53: /* dhcp message type */
+ dhcp_message_type = o->data[0];
+ break;
+
+ case 51: /* lease time */
+ {
+ u32 lease_time_in_seconds =
+ clib_host_to_net_u32 (o->data_as_u32[0]);
+ c->lease_expires = now + (f64) lease_time_in_seconds;
+ c->lease_lifetime = lease_time_in_seconds;
+ /* Set a sensible default, in case we don't get opt 58 */
+ c->lease_renewal_interval = lease_time_in_seconds / 2;
+ }
+ break;
+
+ case 58: /* lease renew time in seconds */
+ {
+ u32 lease_renew_time_in_seconds =
+ clib_host_to_net_u32 (o->data_as_u32[0]);
+ c->lease_renewal_interval = lease_renew_time_in_seconds;
+ }
+ break;
+
+ case 54: /* dhcp server address */
+ c->dhcp_server.as_u32 = o->data_as_u32[0];
+ break;
+
+ case 1: /* subnet mask */
+ {
+ u32 subnet_mask =
+ clib_host_to_net_u32 (o->data_as_u32[0]);
+ c->subnet_mask_width = count_set_bits (subnet_mask);
+ }
+ break;
+ case 3: /* router address */
+ {
+ u32 router_address = o->data_as_u32[0];
+ c->router_address.as_u32 = router_address;
+ }
+ break;
+
+ case 12: /* hostname */
+ {
+ /* Replace the existing hostname if necessary */
+ vec_free (c->hostname);
+ vec_validate (c->hostname, o->length - 1);
+ memcpy (c->hostname, o->data, o->length);
+ }
+ break;
+
+ /* $$$$ Your message in this space, parse more options */
+ default:
+ break;
+ }
+
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+ }
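+
+  /*
+   * Each option parsed above is a {code, length, data[length]} TLV,
+   * hence the length + 2 advance. For example, a one-hour lease via
+   * option 51 arrives as the octets: 0x33 0x04 0x00 0x00 0x0e 0x10.
+   */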
+
+ switch (c->state)
+ {
+ case DHCP_DISCOVER:
+ if (dhcp_message_type != DHCP_PACKET_OFFER)
+ {
+ clib_warning ("sw_if_index %d state %U message type %d",
+ c->sw_if_index, format_dhcp_client_state,
+ c->state, dhcp_message_type);
+ c->next_transmit = now + 5.0;
+ break;
+ }
+ /* Received an offer, go send a request */
+ c->state = DHCP_REQUEST;
+ c->retry_count = 0;
+ c->next_transmit = 0; /* send right now... */
+ /* Poke the client process, which will send the request */
+ vlib_process_signal_event (vm, dhcp_client_process_node.index,
+ EVENT_DHCP_CLIENT_WAKEUP, c - dcm->clients);
+ break;
+
+ case DHCP_BOUND:
+ case DHCP_REQUEST:
+ if (dhcp_message_type != DHCP_PACKET_ACK)
+ {
+ clib_warning ("sw_if_index %d state %U message type %d",
+ c->sw_if_index, format_dhcp_client_state,
+ c->state, dhcp_message_type);
+ c->next_transmit = now + 5.0;
+ break;
+ }
+ /* OK, we own the address (etc), add to the routing table(s) */
+ if (c->state == DHCP_REQUEST)
+ {
+ void (*fp)(u32, u32, u8 *, u8, u8 *, u8 *, u8 *) = c->event_callback;
+
+ dhcp_client_acquire_address (dcm, c);
+
+ /*
+ * Configure default IP route:
+ * - vrf_id is 0 by default.
+ */
+ if (c->router_address.as_u32)
+ api_config_default_ip_route (0 /* is_ipv6 */,
+ 1 /* is_add */,
+ 0 /* vrf_id */,
+ c->sw_if_index,
+ (u8 *)&c->router_address);
+
+ /*
+ * Call the user's event callback to report DHCP information
+ */
+ if (fp)
+ (*fp) (c->client_index, /* client index */
+ c->pid,
+ c->hostname,
+ 0, /* is_ipv6 */
+ (u8 *)&c->leased_address, /* host IP address */
+ (u8 *)&c->router_address, /* router IP address */
+ (u8 *)(c->l2_rewrite + 6));/* host MAC address */
+ }
+
+ c->state = DHCP_BOUND;
+ c->retry_count = 0;
+ c->next_transmit = now + (f64) c->lease_renewal_interval;
+ c->lease_expires = now + (f64) c->lease_lifetime;
+ break;
+
+ default:
+ clib_warning ("client %d bogus state %d",
+ c - dcm->clients, c->state);
+ break;
+ }
+
+ /* drop the pkt, return 1 */
+ vlib_buffer_free (vm, &bi, 1);
+ return 1;
+}
+
+static void
+send_dhcp_pkt (dhcp_client_main_t * dcm, dhcp_client_t * c,
+ dhcp_packet_type_t type, int is_broadcast)
+{
+ vlib_main_t * vm = dcm->vlib_main;
+ vnet_main_t * vnm = dcm->vnet_main;
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, c->sw_if_index);
+ vnet_sw_interface_t * sup_sw
+ = vnet_get_sup_sw_interface (vnm, c->sw_if_index);
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, c->sw_if_index);
+ vlib_buffer_t * b;
+ u32 bi;
+ ip4_header_t * ip;
+ udp_header_t * udp;
+ dhcp_header_t * dhcp;
+ u32 * to_next;
+ vlib_frame_t * f;
+ dhcp_option_t * o;
+ u16 udp_length, ip_length;
+
+ /* Interface(s) down? */
+ if ((hw->flags & VNET_HW_INTERFACE_FLAG_LINK_UP) == 0)
+ return;
+ if ((sup_sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) == 0)
+ return;
+ if ((sw->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) == 0)
+ return;
+
+ if (vlib_buffer_alloc (vm, &bi, 1) != 1) {
+ clib_warning ("buffer allocation failure");
+ c->next_transmit = 0;
+ return;
+ }
+
+ /* Build a dhcpv4 pkt from whole cloth */
+ b = vlib_get_buffer (vm, bi);
+
+ ASSERT (b->current_data == 0);
+
+ vnet_buffer(b)->sw_if_index[VLIB_RX] = c->sw_if_index;
+ if (is_broadcast)
+ {
+ f = vlib_get_frame_to_node (vm, hw->output_node_index);
+ vnet_buffer(b)->sw_if_index[VLIB_TX] = c->sw_if_index;
+ memcpy (b->data, c->l2_rewrite, vec_len(c->l2_rewrite));
+ ip = (void *)
+ (((u8 *)vlib_buffer_get_current (b)) + vec_len (c->l2_rewrite));
+ }
+ else
+ {
+ f = vlib_get_frame_to_node (vm, ip4_lookup_node.index);
+ vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0; /* use interface VRF */
+ ip = vlib_buffer_get_current (b);
+ }
+
+ /* Enqueue the packet right now */
+ to_next = vlib_frame_vector_args (f);
+ to_next[0] = bi;
+ f->n_vectors = 1;
+
+ if (is_broadcast)
+ vlib_put_frame_to_node (vm, hw->output_node_index, f);
+ else
+ vlib_put_frame_to_node (vm, ip4_lookup_node.index, f);
+
+ udp = (udp_header_t *)(ip+1);
+ dhcp = (dhcp_header_t *)(udp+1);
+
+ /* $$$ optimize, maybe */
+ memset (ip, 0, sizeof (*ip) + sizeof (*udp) + sizeof (*dhcp));
+
+ ip->ip_version_and_header_length = 0x45;
+ ip->ttl = 128;
+ ip->protocol = IP_PROTOCOL_UDP;
+
+ if (is_broadcast)
+ {
+ /* src = 0.0.0.0, dst = 255.255.255.255 */
+ ip->dst_address.as_u32 = ~0;
+ }
+ else
+ {
+ /* Renewing an active lease, plain old ip4 src/dst */
+ ip->src_address.as_u32 = c->leased_address.as_u32;
+ ip->dst_address.as_u32 = c->dhcp_server.as_u32;
+ }
+
+ udp->src_port = clib_host_to_net_u16 (UDP_DST_PORT_dhcp_to_client);
+ udp->dst_port = clib_host_to_net_u16 (UDP_DST_PORT_dhcp_to_server);
+
+ /* Send the interface MAC address */
+ memcpy (dhcp->client_hardware_address, c->l2_rewrite + 6, 6);
+
+ /* Lease renewal, set up client_ip_address */
+ if (is_broadcast == 0)
+ dhcp->client_ip_address.as_u32 = c->leased_address.as_u32;
+
+ dhcp->opcode = 1; /* request, all we send */
+ dhcp->hardware_type = 1; /* ethernet */
+ dhcp->hardware_address_length = 6;
+ dhcp->transaction_identifier = c->transaction_id;
+ dhcp->flags = clib_host_to_net_u16(is_broadcast ? DHCP_FLAG_BROADCAST : 0);
+ dhcp->magic_cookie.as_u32 = DHCP_MAGIC;
+
+ o = (dhcp_option_t * )dhcp->options;
+
+ /* Send option 53, the DHCP message type */
+ o->option = 53;
+ o->length = 1;
+ o->data[0] = type;
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+
+ /* Send option 57, max msg length */
+ if (0 /* not needed, apparently */)
+ {
+ o->option = 57;
+ o->length = 2;
+ {
+ u16 *o2 = (u16 *) o->data;
+ *o2 = clib_host_to_net_u16 (1152);
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+ }
+ }
+
+ /*
+ * If server ip address is available with non-zero value,
+ * option 54 (DHCP Server Identifier) is sent.
+ */
+ if (c->dhcp_server.as_u32)
+ {
+ o->option = 54;
+ o->length = 4;
+ memcpy (o->data, &c->dhcp_server.as_u32, 4);
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+ }
+
+ /* send option 50, requested IP address */
+ if (c->leased_address.as_u32)
+ {
+ o->option = 50;
+ o->length = 4;
+ memcpy (o->data, &c->leased_address.as_u32, 4);
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+ }
+
+ /* send option 12, host name */
+ if (vec_len (c->hostname))
+ {
+ o->option = 12;
+ o->length = vec_len (c->hostname);
+ memcpy (o->data, c->hostname, vec_len (c->hostname));
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+ }
+
+ /* $$ maybe send the client s/w version if anyone cares */
+
+ /*
+ * send option 55, parameter request list
+ * The current list (see below) matches the Linux dhcp client's list.
+ * Any specific dhcp server config and/or dhcp server may or may
+ * not yield specific options.
+ */
+ o->option = 55;
+ o->length = vec_len (c->option_55_data);
+ memcpy (o->data, c->option_55_data, vec_len(c->option_55_data));
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+
+ /* End of list */
+ o->option = 0xff;
+ o->length = 0;
+ o++;
+
+ b->current_length = ((u8 *)o) - b->data;
+
+ /* fix ip length, checksum and udp length */
+ ip_length = vlib_buffer_length_in_chain (vm, b);
+ if (is_broadcast)
+ ip_length -= vec_len (c->l2_rewrite);
+
+ ip->length = clib_host_to_net_u16(ip_length);
+ ip->checksum = ip4_header_checksum(ip);
+
+ udp_length = ip_length - (sizeof (*ip));
+ udp->length = clib_host_to_net_u16 (udp_length);
+}
+
+static int
+dhcp_discover_state (dhcp_client_main_t * dcm, dhcp_client_t * c, f64 now)
+{
+ /*
+ * State machine "DISCOVER" state. Send a dhcp discover packet,
+ * eventually back off the retry rate.
+ */
+ send_dhcp_pkt (dcm, c, DHCP_PACKET_DISCOVER, 1 /* is_broadcast */);
+
+ c->retry_count++;
+ if (c->retry_count > 10)
+ c->next_transmit = now + 5.0;
+ else
+ c->next_transmit = now + 1.0;
+ return 0;
+}
+
+static int
+dhcp_request_state (dhcp_client_main_t * dcm, dhcp_client_t * c, f64 now)
+{
+ /*
+ * State machine "REQUEST" state. Send a dhcp request packet,
+ * eventually drop back to the discover state.
+ */
+ send_dhcp_pkt (dcm, c, DHCP_PACKET_REQUEST, 1 /* is_broadcast */);
+
+ c->retry_count++;
+ if (c->retry_count > 7 /* lucky you */)
+ {
+ c->state = DHCP_DISCOVER;
+ c->next_transmit = now;
+ c->retry_count = 0;
+ return 1;
+ }
+ c->next_transmit = now + 1.0;
+ return 0;
+}
+
+static int
+dhcp_bound_state (dhcp_client_main_t * dcm, dhcp_client_t * c, f64 now)
+{
+ /*
+ * State machine "BOUND" state. Send a dhcp request packet,
+ * eventually, when the lease expires, forget the dhcp data
+ * and go back to the stone age.
+ */
+ send_dhcp_pkt (dcm, c, DHCP_PACKET_REQUEST, 0 /* is_broadcast */);
+
+ c->retry_count++;
+ if (c->retry_count > 10)
+ c->next_transmit = now + 5.0;
+ else
+ c->next_transmit = now + 1.0;
+
+ if (now > c->lease_expires)
+ {
+ if (c->router_address.as_u32)
+ api_config_default_ip_route (0 /* is_ipv6 */,
+ 0 /* is_add */,
+ 0 /* vrf_id */,
+ c->sw_if_index,
+ (u8 *)&c->router_address);
+
+ dhcp_client_release_address (dcm, c);
+ c->state = DHCP_DISCOVER;
+ c->next_transmit = now;
+ c->retry_count = 0;
+ /* Wipe out any memory of the address we had... */
+ c->leased_address.as_u32 = 0;
+ c->subnet_mask_width = 0;
+ c->router_address.as_u32 = 0;
+ c->lease_renewal_interval = 0;
+ c->dhcp_server.as_u32 = 0;
+ return 1;
+ }
+ return 0;
+}
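+
+/*
+ * Client state machine, as implemented above and in
+ * dhcp_client_for_us():
+ *
+ *   DISCOVER -> (offer received) -> REQUEST
+ *   REQUEST  -> (ack received)   -> BOUND
+ *   REQUEST  -> (8 retries)      -> DISCOVER
+ *   BOUND    -> (lease expired)  -> DISCOVER
+ *
+ * While BOUND, the client unicasts a renewal REQUEST every
+ * lease_renewal_interval seconds.
+ */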
+
+static f64 dhcp_client_sm (f64 now, f64 timeout, uword pool_index)
+{
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+ dhcp_client_t * c;
+
+ /* deleted, pooched, yadda yadda yadda */
+ if (pool_is_free_index (dcm->clients, pool_index))
+ return timeout;
+
+ c = pool_elt_at_index (dcm->clients, pool_index);
+
+ /* Time for us to do something with this client? */
+ if (now < c->next_transmit)
+ return timeout;
+
+ again:
+ switch (c->state)
+ {
+ case DHCP_DISCOVER: /* send a discover */
+ if (dhcp_discover_state (dcm, c, now))
+ goto again;
+ break;
+
+ case DHCP_REQUEST: /* send a request */
+ if (dhcp_request_state (dcm, c, now))
+ goto again;
+ break;
+
+ case DHCP_BOUND: /* bound, renew needed? */
+ if (dhcp_bound_state (dcm, c, now))
+ goto again;
+ break;
+
+ default:
+ clib_warning ("dhcp client %d bogus state %d",
+ c - dcm->clients, c->state);
+ break;
+ }
+
+ if (c->next_transmit < now + timeout)
+ return c->next_transmit - now;
+
+ return timeout;
+}
+
+static uword
+dhcp_client_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ f64 timeout = 100.0;
+ f64 now;
+ uword event_type;
+ uword * event_data = 0;
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+ dhcp_client_t * c;
+ int i;
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, timeout);
+
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ now = vlib_time_now (vm);
+
+ switch (event_type)
+ {
+ case EVENT_DHCP_CLIENT_WAKEUP:
+ for (i = 0; i < vec_len (event_data); i++)
+ timeout = dhcp_client_sm (now, timeout, event_data[i]);
+ break;
+
+ case ~0:
+ pool_foreach (c, dcm->clients,
+ ({
+ timeout = dhcp_client_sm (now, timeout,
+ (uword)(c - dcm->clients));
+ }));
+ if (pool_elts (dcm->clients) == 0)
+ timeout = 100.0;
+ break;
+ }
+
+ vec_reset_length (event_data);
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+
+VLIB_REGISTER_NODE (dhcp_client_process_node, static) = {
+ .function = dhcp_client_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "dhcp-client-process",
+ .process_log2_n_stack_bytes = 16,
+};
+
+static u8 * format_dhcp_client_state (u8 * s, va_list * va)
+{
+ dhcp_client_state_t state = va_arg (*va, dhcp_client_state_t);
+ char * str = "BOGUS!";
+
+ switch (state)
+ {
+#define _(a) \
+ case a: \
+ str = #a; \
+ break;
+ foreach_dhcp_client_state;
+#undef _
+ default:
+ break;
+ }
+
+ s = format (s, "%s", str);
+ return s;
+}
+
+static u8 * format_dhcp_client (u8 * s, va_list * va)
+{
+ dhcp_client_main_t * dcm = va_arg (*va, dhcp_client_main_t *);
+ dhcp_client_t * c = va_arg (*va, dhcp_client_t *);
+ int verbose = va_arg (*va, int);
+
+ s = format (s, "[%d] %U state %U ", c - dcm->clients,
+ format_vnet_sw_if_index_name, dcm->vnet_main, c->sw_if_index,
+ format_dhcp_client_state, c->state);
+
+ if (c->leased_address.as_u32)
+ s = format (s, "addr %U/%d gw %U\n",
+ format_ip4_address, &c->leased_address,
+ c->subnet_mask_width, format_ip4_address, &c->router_address);
+ else
+ s = format (s, "no address\n");
+
+ if (verbose)
+ {
+ s = format (s, "retry count %d, next xmt %.2f",
+ c->retry_count, c->next_transmit);
+ }
+ return s;
+}
+
+static clib_error_t *
+show_dhcp_client_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+ dhcp_client_t * c;
+ int verbose = 0;
+ u32 sw_if_index = ~0;
+ uword * p;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, dcm->vnet_main,
+ &sw_if_index))
+ ;
+ else if (unformat (input, "verbose"))
+ verbose = 1;
+ else
+ break;
+ }
+
+ if (sw_if_index != ~0)
+ {
+ p = hash_get (dcm->client_by_sw_if_index, sw_if_index);
+ if (p == 0)
+ return clib_error_return (0, "dhcp client not configured");
+ c = pool_elt_at_index (dcm->clients, p[0]);
+ vlib_cli_output (vm, "%U", format_dhcp_client, dcm, c, verbose);
+ return 0;
+ }
+
+ pool_foreach (c, dcm->clients,
+ ({
+ vlib_cli_output (vm, "%U", format_dhcp_client, dcm, c, verbose);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_dhcp_client_command, static) = {
+ .path = "show dhcp client",
+ .short_help = "show dhcp client [intfc <intfc>][verbose]",
+ .function = show_dhcp_client_command_fn,
+};
+
+
+int dhcp_client_add_del (dhcp_client_add_del_args_t * a)
+{
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+ vlib_main_t * vm = dcm->vlib_main;
+ dhcp_client_t * c;
+ uword * p;
+
+ p = hash_get (dcm->client_by_sw_if_index, a->sw_if_index);
+
+ if ((p && a->is_add) || (!p && a->is_add == 0))
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ if (a->is_add)
+ {
+ pool_get (dcm->clients, c);
+ memset (c, 0, sizeof (*c));
+ c->state = DHCP_DISCOVER;
+ c->sw_if_index = a->sw_if_index;
+ c->client_index = a->client_index;
+ c->pid = a->pid;
+ c->event_callback = a->event_callback;
+ c->option_55_data = a->option_55_data;
+ c->hostname = a->hostname;
+ c->client_identifier = a->client_identifier;
+ do {
+ c->transaction_id = random_u32 (&dcm->seed);
+ } while (c->transaction_id == 0);
+ set_l2_rewrite (dcm, c);
+ hash_set (dcm->client_by_sw_if_index, a->sw_if_index, c - dcm->clients);
+ vlib_process_signal_event (vm, dhcp_client_process_node.index,
+ EVENT_DHCP_CLIENT_WAKEUP, c - dcm->clients);
+ }
+ else
+ {
+ c = pool_elt_at_index (dcm->clients, p[0]);
+
+ if (c->router_address.as_u32)
+ api_config_default_ip_route (0 /* is_ipv6 */,
+ 0 /* is_add */,
+ 0 /* vrf_id */,
+ c->sw_if_index,
+ (u8 *)&c->router_address);
+ vec_free (c->option_55_data);
+ vec_free (c->hostname);
+ vec_free (c->client_identifier);
+ vec_free (c->l2_rewrite);
+ hash_unset (dcm->client_by_sw_if_index, c->sw_if_index);
+ pool_put (dcm->clients, c);
+ }
+ return 0;
+}
+
+int
+dhcp_client_config (vlib_main_t * vm,
+ u32 sw_if_index,
+ u8 * hostname,
+ u32 is_add,
+ u32 client_index,
+ void * event_callback,
+ u32 pid)
+{
+ dhcp_client_add_del_args_t _a, *a = &_a;
+ int rv;
+
+ memset (a, 0, sizeof (*a));
+ a->is_add = is_add;
+ a->sw_if_index = sw_if_index;
+ a->client_index = client_index;
+ a->pid = pid;
+ a->event_callback = event_callback;
+ vec_validate(a->hostname, strlen((char *)hostname) - 1);
+ strncpy((char *)a->hostname, (char *)hostname, vec_len(a->hostname));
+ a->client_identifier = format (0, "vpe 1.0%c", 0);
+ /*
+ * Option 55 request list. These data precisely match
+ * the Ubuntu dhcp client. YMMV.
+ */
+
+ /* Subnet Mask */
+ vec_add1 (a->option_55_data, 1);
+ /* Broadcast address */
+ vec_add1 (a->option_55_data, 28);
+ /* time offset */
+ vec_add1 (a->option_55_data, 2);
+ /* Router */
+ vec_add1 (a->option_55_data, 3);
+ /* Domain Name */
+ vec_add1 (a->option_55_data, 15);
+ /* DNS */
+ vec_add1 (a->option_55_data, 6);
+ /* Domain search */
+ vec_add1 (a->option_55_data, 119);
+ /* Host name */
+ vec_add1 (a->option_55_data, 12);
+ /* NetBIOS name server */
+ vec_add1 (a->option_55_data, 44);
+ /* NetBIOS Scope */
+ vec_add1 (a->option_55_data, 47);
+ /* MTU */
+ vec_add1 (a->option_55_data, 26);
+ /* Classless static route */
+ vec_add1 (a->option_55_data, 121);
+ /* NTP servers */
+ vec_add1 (a->option_55_data, 42);
+
+ rv = dhcp_client_add_del (a);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_VALUE:
+
+ vec_free (a->hostname);
+ vec_free (a->client_identifier);
+ vec_free (a->option_55_data);
+
+ if (is_add)
+ clib_warning ("dhcp client already enabled on intf_idx %d",
+ sw_if_index);
+ else
+ clib_warning ("dhcp client not enabled on on intf_idx %d",
+ sw_if_index);
+ break;
+
+ default:
+ clib_warning ("dhcp_client_add_del returned %d", rv);
+ }
+
+ return rv;
+}
+
+static clib_error_t *
+dhcp_client_set_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+ u32 sw_if_index;
+ u8 * hostname = 0;
+ u8 sw_if_index_set = 0;
+ int is_add = 1;
+ dhcp_client_add_del_args_t _a, *a = &_a;
+ int rv;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, dcm->vnet_main,
+ &sw_if_index))
+ sw_if_index_set = 1;
+ else if (unformat (input, "hostname %v", &hostname))
+ ;
+ else if (unformat (input, "del"))
+ is_add = 0;
+ else
+ break;
+ }
+
+ if (sw_if_index_set == 0)
+ return clib_error_return (0, "interface not specified");
+
+ memset (a, 0, sizeof (*a));
+ a->is_add = is_add;
+ a->sw_if_index = sw_if_index;
+ a->hostname = hostname;
+ a->client_identifier = format (0, "vpe 1.0%c", 0);
+
+ /*
+ * Option 55 request list. These data precisely match
+ * the Ubuntu dhcp client. YMMV.
+ */
+
+ /* Subnet Mask */
+ vec_add1 (a->option_55_data, 1);
+ /* Broadcast address */
+ vec_add1 (a->option_55_data, 28);
+ /* time offset */
+ vec_add1 (a->option_55_data, 2);
+ /* Router */
+ vec_add1 (a->option_55_data, 3);
+ /* Domain Name */
+ vec_add1 (a->option_55_data, 15);
+ /* DNS */
+ vec_add1 (a->option_55_data, 6);
+ /* Domain search */
+ vec_add1 (a->option_55_data, 119);
+ /* Host name */
+ vec_add1 (a->option_55_data, 12);
+ /* NetBIOS name server */
+ vec_add1 (a->option_55_data, 44);
+ /* NetBIOS Scope */
+ vec_add1 (a->option_55_data, 47);
+ /* MTU */
+ vec_add1 (a->option_55_data, 26);
+ /* Classless static route */
+ vec_add1 (a->option_55_data, 121);
+ /* NTP servers */
+ vec_add1 (a->option_55_data, 42);
+
+ rv = dhcp_client_add_del (a);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_VALUE:
+
+ vec_free (a->hostname);
+ vec_free (a->client_identifier);
+ vec_free (a->option_55_data);
+ if (is_add)
+ return clib_error_return (0, "dhcp client already enabled on %U",
+ format_vnet_sw_if_index_name,
+ dcm->vnet_main, sw_if_index);
+ else
+ return clib_error_return (0, "dhcp client not enabled on %U",
+ format_vnet_sw_if_index_name,
+ dcm->vnet_main, sw_if_index);
+ break;
+
+ default:
+ vlib_cli_output (vm, "dhcp_client_add_del returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcp_client_set_command, static) = {
+ .path = "set dhcp client",
+ .short_help = "set dhcp client [del] intfc <interface> [hostname <name>]",
+ .function = dhcp_client_set_command_fn,
+};
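+
+/*
+ * Example usage (the interface name is illustrative):
+ *
+ *   set dhcp client intfc GigabitEthernet2/0/0 hostname vpp-host
+ *   set dhcp client del intfc GigabitEthernet2/0/0
+ */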
+
+static clib_error_t *
+dhcp_client_init (vlib_main_t * vm)
+{
+ dhcp_client_main_t * dcm = &dhcp_client_main;
+
+ dcm->vlib_main = vm;
+ dcm->vnet_main = vnet_get_main();
+ dcm->seed = 0xdeaddabe;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dhcp_client_init);
diff --git a/vnet/vnet/dhcp/client.h b/vnet/vnet/dhcp/client.h
new file mode 100644
index 00000000000..d15e686b636
--- /dev/null
+++ b/vnet/vnet/dhcp/client.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * client.h: dhcp client
+ */
+
+#ifndef included_dhcp_client_h
+#define included_dhcp_client_h
+
+#define foreach_dhcp_client_state \
+_(DHCP_DISCOVER) \
+_(DHCP_REQUEST) \
+_(DHCP_BOUND)
+
+typedef enum {
+#define _(a) a,
+ foreach_dhcp_client_state
+#undef _
+} dhcp_client_state_t;
+
+typedef struct {
+ dhcp_client_state_t state;
+
+ /* the interface in question */
+ u32 sw_if_index;
+
+ /* State machine retry counter */
+ u32 retry_count;
+
+ /* Send next pkt at this time */
+ f64 next_transmit;
+ f64 lease_expires;
+
+ /* DHCP transaction ID, a random number */
+ u32 transaction_id;
+
+ /* leased address, other learned info DHCP */
+ ip4_address_t leased_address; /* from your_ip_address field */
+ ip4_address_t dhcp_server;
+ u32 subnet_mask_width; /* option 1 */
+ ip4_address_t router_address; /* option 3 */
+ u32 lease_renewal_interval; /* option 58 */
+ u32 lease_lifetime; /* option 51 */
+
+ /* Requested data (option 55) */
+ u8 * option_55_data;
+
+ u8 * l2_rewrite;
+
+ /* hostname and software client identifiers */
+ u8 * hostname;
+ u8 * client_identifier; /* software version, e.g. vpe 1.0 */
+
+ /* Information used for event callback */
+ u32 client_index;
+ u32 pid;
+ void * event_callback;
+} dhcp_client_t;
+
+typedef struct {
+ /* DHCP client pool */
+ dhcp_client_t * clients;
+ uword * client_by_sw_if_index;
+ u32 seed;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} dhcp_client_main_t;
+
+typedef struct {
+ int is_add;
+ u32 sw_if_index;
+
+ /* vectors, consumed by dhcp client code */
+ u8 * hostname;
+ u8 * client_identifier;
+
+ /* Bytes containing requested option numbers */
+ u8 * option_55_data;
+
+ /* Information used for event callback */
+ u32 client_index;
+ u32 pid;
+ void * event_callback;
+} dhcp_client_add_del_args_t;
+
+dhcp_client_main_t dhcp_client_main;
+
+#define EVENT_DHCP_CLIENT_WAKEUP 1
+
+int dhcp_client_for_us (u32 bi0,
+ vlib_buffer_t * b0,
+ ip4_header_t * ip0,
+ udp_header_t * u0,
+ dhcp_header_t * dh0);
+
+int dhcp_client_config (vlib_main_t * vm,
+ u32 sw_if_index,
+ u8 * hostname,
+ u32 is_add,
+ u32 client_index,
+ void *event_callback,
+ u32 pid);
+
+#endif /* included_dhcp_client_h */
diff --git a/vnet/vnet/dhcp/packet.h b/vnet/vnet/dhcp/packet.h
new file mode 100644
index 00000000000..267a8eafc93
--- /dev/null
+++ b/vnet/vnet/dhcp/packet.h
@@ -0,0 +1,61 @@
+#ifndef included_vnet_dhcp_packet_h
+#define included_vnet_dhcp_packet_h
+
+/*
+ * DHCP packet format
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/ip/ip4_packet.h>
+
+typedef struct {
+ u8 opcode; /* 1 = request, 2 = reply */
+ u8 hardware_type; /* 1 = ethernet */
+ u8 hardware_address_length;
+ u8 hops;
+ u32 transaction_identifier;
+ u16 seconds;
+ u16 flags;
+#define DHCP_FLAG_BROADCAST (1<<15)
+ ip4_address_t client_ip_address;
+ ip4_address_t your_ip_address; /* use this one */
+ ip4_address_t server_ip_address;
+ ip4_address_t gateway_ip_address; /* use option 3, not this one */
+ u8 client_hardware_address[16];
+ u8 server_name[64];
+ u8 boot_filename[128];
+ ip4_address_t magic_cookie;
+ u8 options[0];
+} dhcp_header_t;
+
+typedef struct {
+ u8 option;
+ u8 length;
+ union {
+ u8 data[0];
+ u32 data_as_u32[0];
+ };
+} __attribute__((packed)) dhcp_option_t;
+
+typedef enum {
+ DHCP_PACKET_DISCOVER=1,
+ DHCP_PACKET_OFFER,
+ DHCP_PACKET_REQUEST,
+ DHCP_PACKET_ACK=5,
+} dhcp_packet_type_t;
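+
+/*
+ * These values follow the DHCP message type codes of RFC 2132,
+ * option 53: 1 DISCOVER, 2 OFFER, 3 REQUEST, 4 DECLINE, 5 ACK,
+ * 6 NAK, 7 RELEASE, 8 INFORM. Only the types handled here are
+ * enumerated, hence the explicit ACK=5.
+ */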
+
+/* charming antique: 99.130.83.99 is the dhcp magic cookie */
+#define DHCP_MAGIC (clib_host_to_net_u32(0x63825363))
+
+#endif /* included_vnet_dhcp_packet_h */
diff --git a/vnet/vnet/dhcp/proxy.h b/vnet/vnet/dhcp/proxy.h
new file mode 100644
index 00000000000..e12c0d001b5
--- /dev/null
+++ b/vnet/vnet/dhcp/proxy.h
@@ -0,0 +1,92 @@
+/*
+ * proxy.h: dhcp proxy
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_dhcp_proxy_h
+#define included_dhcp_proxy_h
+
+#include <vnet/vnet.h>
+#include <vnet/dhcp/packet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/udp.h>
+#include <vnet/dhcp/client.h>
+
+typedef enum {
+#define dhcp_proxy_error(n,s) DHCP_PROXY_ERROR_##n,
+#include <vnet/dhcp/proxy_error.def>
+#undef dhcp_proxy_error
+ DHCP_PROXY_N_ERROR,
+} dhcp_proxy_error_t;
+
+typedef struct {
+ u32 oui;
+ u32 fib_id;
+} vss_id;
+
+typedef union {
+ u8 as_u8[8];
+ vss_id vpn_id;
+} vss_info;
+
+typedef struct {
+ ip4_address_t dhcp_server;
+ ip4_address_t dhcp_src_address;
+ u32 insert_option_82;
+ u32 server_fib_index;
+ u32 valid;
+} dhcp_server_t;
+
+typedef struct {
+ /* Pool of DHCP servers */
+ dhcp_server_t * dhcp_servers;
+
+ /* Vector of server pool indices, by rx fib index. Zero is the default server */
+ u32 * dhcp_server_index_by_rx_fib_index;
+
+ /* to drop pkts in server-to-client direction */
+ u32 error_drop_node_index;
+
+ vss_info *opt82vss;
+
+ /* hash lookup specific vrf_id -> option 82 vss suboption */
+ uword * opt82vss_index_by_vrf_id;
+
+ /* convenience */
+ dhcp_client_main_t * dhcp_client_main;
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} dhcp_proxy_main_t;
+
+dhcp_proxy_main_t dhcp_proxy_main;
+
+int dhcp_proxy_set_server (ip4_address_t *addr, ip4_address_t *src_address,
+ u32 fib_id, int insert_option_82, int is_del);
+
+int dhcp_proxy_set_server_2 (ip4_address_t *addr, ip4_address_t *src_address,
+ u32 rx_fib_id,
+ u32 server_fib_id,
+ int insert_option_82, int is_del);
+
+int dhcp_proxy_set_option82_vss(u32 vrf_id,
+ u32 oui,
+ u32 fib_id,
+ int is_del);
+#endif /* included_dhcp_proxy_h */
diff --git a/vnet/vnet/dhcp/proxy_error.def b/vnet/vnet/dhcp/proxy_error.def
new file mode 100644
index 00000000000..6aa06eb5120
--- /dev/null
+++ b/vnet/vnet/dhcp/proxy_error.def
@@ -0,0 +1,30 @@
+/*
+ * dhcp_proxy_error.def: dhcp proxy errors
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+dhcp_proxy_error (NONE, "no error")
+dhcp_proxy_error (NO_SERVER, "no dhcp server configured")
+dhcp_proxy_error (RELAY_TO_SERVER, "DHCP packets relayed to the server")
+dhcp_proxy_error (RELAY_TO_CLIENT, "DHCP packets relayed to clients")
+dhcp_proxy_error (OPTION_82_ERROR, "DHCP failed to insert option 82")
+dhcp_proxy_error (NO_OPTION_82, "DHCP option 82 missing")
+dhcp_proxy_error (BAD_OPTION_82, "Bad DHCP option 82 value")
+dhcp_proxy_error (BAD_FIB_ID, "DHCP option 82 fib-id to fib-index map failure")
+dhcp_proxy_error (NO_INTERFACE_ADDRESS, "DHCP no interface address")
+dhcp_proxy_error (OPTION_82_VSS_NOT_PROCESSED, "DHCP VSS not processed by DHCP server")
+dhcp_proxy_error (BAD_YIADDR, "DHCP packets with bad your_ip_address fields")
+dhcp_proxy_error (BAD_SVR_FIB_OR_ADDRESS, "DHCP packets not from DHCP server or server FIB.")
+dhcp_proxy_error (PKT_TOO_BIG, "DHCP packets which are too big.")
diff --git a/vnet/vnet/dhcp/proxy_node.c b/vnet/vnet/dhcp/proxy_node.c
new file mode 100644
index 00000000000..2f860226065
--- /dev/null
+++ b/vnet/vnet/dhcp/proxy_node.c
@@ -0,0 +1,1144 @@
+/*
+ * proxy_node.c: dhcp proxy node processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/dhcp/proxy.h>
+
+static char * dhcp_proxy_error_strings[] = {
+#define dhcp_proxy_error(n,s) s,
+#include "proxy_error.def"
+#undef dhcp_proxy_error
+};
+
+#define foreach_dhcp_proxy_to_server_input_next \
+ _ (DROP, "error-drop") \
+ _ (LOOKUP, "ip4-lookup") \
+ _ (SEND_TO_CLIENT, "dhcp-proxy-to-client")
+
+typedef enum {
+#define _(s,n) DHCP_PROXY_TO_SERVER_INPUT_NEXT_##s,
+ foreach_dhcp_proxy_to_server_input_next
+#undef _
+ DHCP_PROXY_TO_SERVER_INPUT_N_NEXT,
+} dhcp_proxy_to_server_input_next_t;
+
+typedef struct {
+ /* 0 => to server, 1 => to client */
+ int which;
+ ip4_address_t trace_ip4_address;
+ u32 error;
+ u32 sw_if_index;
+ u32 original_sw_if_index;
+} dhcp_proxy_trace_t;
+
+#define VPP_DHCP_OPTION82_SUB1_SIZE 6
+#define VPP_DHCP_OPTION82_SUB5_SIZE 6
+#define VPP_DHCP_OPTION82_VSS_SIZE 12
+#define VPP_DHCP_OPTION82_SIZE (VPP_DHCP_OPTION82_SUB1_SIZE + \
+ VPP_DHCP_OPTION82_SUB5_SIZE + \
+ VPP_DHCP_OPTION82_VSS_SIZE + 3)
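+
+/*
+ * Worst-case layout of the option 82 TLV inserted below:
+ *
+ *   [82][len]                        2 octets, option header
+ *   [1][4][rx sw_if_index]           6 octets, circuit ID suboption
+ *   [5][4][rx interface address]     6 octets, suboption 5
+ *   [151][8][type,oui,fib] [152][0] 12 octets, VSS suboptions
+ *   [0xFF]                           1 octet, end of options
+ *
+ * The option header and the end-of-options octet account for the + 3.
+ */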
+
+vlib_node_registration_t dhcp_proxy_to_server_node;
+vlib_node_registration_t dhcp_proxy_to_client_node;
+
+u8 * format_dhcp_proxy_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ dhcp_proxy_trace_t * t = va_arg (*args, dhcp_proxy_trace_t *);
+
+ if (t->which == 0)
+ s = format (s, "DHCP proxy: sent to server %U\n",
+ format_ip4_address, &t->trace_ip4_address, t->error);
+ else
+ s = format (s, "DHCP proxy: broadcast to client from %U\n",
+ format_ip4_address, &t->trace_ip4_address);
+
+ if (t->error != (u32)~0)
+ s = format (s, " error: %s\n", dhcp_proxy_error_strings[t->error]);
+
+ s = format (s, " original_sw_if_index: %d, sw_if_index: %d\n",
+ t->original_sw_if_index, t->sw_if_index);
+
+ return s;
+}
+
+u8 * format_dhcp_proxy_header_with_length (u8 * s, va_list * args)
+{
+ dhcp_header_t * h = va_arg (*args, dhcp_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "dhcp header truncated");
+
+ s = format (s, "DHCP Proxy");
+
+ return s;
+}
+
+/* get first interface address */
+static ip4_address_t *
+ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = 0;
+ ip4_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+ result = a;
+ break;
+ }));
+ return result;
+}
+
+static uword
+dhcp_proxy_to_server_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ dhcp_proxy_main_t * dpm = &dhcp_proxy_main;
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ u32 pkts_to_server=0, pkts_to_client=0, pkts_no_server=0;
+ u32 pkts_no_interface_address=0;
+ u32 pkts_too_big=0;
+ ip4_main_t * im = &ip4_main;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ udp_header_t * u0;
+ dhcp_header_t * h0;
+ ip4_header_t * ip0;
+ u32 next0;
+ u32 old0, new0;
+ ip_csum_t sum0;
+ u32 error0 = (u32) ~0;
+ u32 sw_if_index = 0;
+ u32 original_sw_if_index = 0;
+ u8 *end = NULL;
+ u32 fib_index, server_index;
+ dhcp_server_t * server;
+ u32 rx_sw_if_index;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = vlib_buffer_get_current (b0);
+
+ /*
+ * udp_local hands us the DHCP header, need udp hdr,
+ * ip hdr to relay to server
+ */
+ vlib_buffer_advance (b0, -(sizeof(*u0)));
+ u0 = vlib_buffer_get_current (b0);
+
+ /* This blows. Return traffic has src_port = 67, dst_port = 67 */
+ if (u0->src_port == clib_net_to_host_u16(UDP_DST_PORT_dhcp_to_server))
+ {
+ vlib_buffer_advance (b0, sizeof(*u0));
+ next0 = DHCP_PROXY_TO_SERVER_INPUT_NEXT_SEND_TO_CLIENT;
+ error0 = 0;
+ pkts_to_client++;
+ goto do_enqueue;
+ }
+
+ rx_sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ fib_index = im->fib_index_by_sw_if_index [rx_sw_if_index];
+
+ if (fib_index < vec_len(dpm->dhcp_server_index_by_rx_fib_index))
+ server_index = dpm->dhcp_server_index_by_rx_fib_index[fib_index];
+ else
+ server_index = 0;
+
+ if (PREDICT_FALSE (pool_is_free_index (dpm->dhcp_servers,
+ server_index)))
+ {
+ no_server:
+ error0 = DHCP_PROXY_ERROR_NO_SERVER;
+ next0 = DHCP_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ pkts_no_server++;
+ goto do_trace;
+ }
+
+ server = pool_elt_at_index (dpm->dhcp_servers, server_index);
+ if (server->valid == 0)
+ goto no_server;
+
+ vlib_buffer_advance (b0, -(sizeof(*ip0)));
+ ip0 = vlib_buffer_get_current (b0);
+
+ /* disable UDP checksum */
+ u0->checksum = 0;
+ sum0 = ip0->checksum;
+ old0 = ip0->dst_address.as_u32;
+ new0 = server->dhcp_server.as_u32;
+ ip0->dst_address.as_u32 = server->dhcp_server.as_u32;
+ sum0 = ip_csum_update (sum0, old0, new0,
+ ip4_header_t /* structure */,
+ dst_address /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+
+ sum0 = ip0->checksum;
+ old0 = ip0->src_address.as_u32;
+ new0 = server->dhcp_src_address.as_u32;
+ ip0->src_address.as_u32 = new0;
+ sum0 = ip_csum_update (sum0, old0, new0,
+ ip4_header_t /* structure */,
+ src_address /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+
+ /* Send to DHCP server via the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] =
+ server->server_fib_index;
+
+ h0->gateway_ip_address.as_u32 = server->dhcp_src_address.as_u32;
+ pkts_to_server++;
+
+ if (server->insert_option_82)
+ {
+ u32 fib_index, fib_id, opt82_fib_id=0, opt82_oui=0;
+ ip4_fib_t * fib;
+ dhcp_option_t *o = (dhcp_option_t *) h0->options;
+ u32 len = 0;
+ vlib_buffer_free_list_t *fl;
+
+ fib_index = im->fib_index_by_sw_if_index
+ [vnet_buffer(b0)->sw_if_index[VLIB_RX]];
+ fib = vec_elt_at_index (im->fibs, fib_index);
+ fib_id = fib->table_id;
+
+ end = b0->data + b0->current_data + b0->current_length;
+ /* TLVs are not performance-friendly... */
+ while (o->option != 0xFF /* end of options */ && (u8 *)o < end)
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+
+ fl = vlib_buffer_get_free_list (vm, b0->free_list_index);
+ // start writing at (dhcp_option_t *) o; some packets have padding
+ if (((u8 *)o - (u8 *)b0->data + VPP_DHCP_OPTION82_SIZE) > fl->n_data_bytes)
+ {
+ next0 = DHCP_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ pkts_too_big++;
+ goto do_trace;
+ }
+
+ if ((o->option == 0xFF) && ((u8 *)o <= end))
+ {
+ vnet_main_t *vnm = vnet_get_main();
+ u16 old_l0, new_l0;
+ ip4_address_t _ia0, * ia0 = &_ia0;
+ uword *p_vss;
+ vss_info *vss;
+ vnet_sw_interface_t *swif;
+ sw_if_index = 0;
+ original_sw_if_index = 0;
+
+ original_sw_if_index = sw_if_index =
+ vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ swif = vnet_get_sw_interface (vnm, sw_if_index);
+ if (swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED)
+ sw_if_index = swif->unnumbered_sw_if_index;
+
+ p_vss = hash_get (dpm->opt82vss_index_by_vrf_id,
+ fib_id);
+ if (p_vss)
+ {
+ vss = pool_elt_at_index (dpm->opt82vss, p_vss[0]);
+ opt82_oui = vss->vpn_id.oui;
+ opt82_fib_id = vss->vpn_id.fib_id;
+ }
+ /*
+ * Get the first ip4 address on the [client-side]
+ * RX interface, if not unnumbered. otherwise use
+ * the loopback interface's ip address.
+ */
+ ia0 = ip4_interface_first_address(&ip4_main, sw_if_index);
+
+ if (ia0 == 0)
+ {
+ error0 = DHCP_PROXY_ERROR_NO_INTERFACE_ADDRESS;
+ next0 = DHCP_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ pkts_no_interface_address++;
+ goto do_trace;
+ }
+
+ /* Add option 82 */
+ o->option = 82; /* option 82 */
+ o->length = 12; /* 12 octets to follow */
+ o->data[0] = 1; /* suboption 1, circuit ID (= RX sw_if_index) */
+ o->data[1] = 4; /* length of suboption */
+ o->data[2] = (original_sw_if_index >> 24) & 0xFF;
+ o->data[3] = (original_sw_if_index >> 16) & 0xFF;
+ o->data[4] = (original_sw_if_index >> 8) & 0xFF;
+ o->data[5] = (original_sw_if_index >> 0) & 0xFF;
+ o->data[6] = 5; /* suboption 5 (client RX intfc address) */
+ o->data[7] = 4; /* length 4 */
+ o->data[8] = ia0->as_u8[0];
+ o->data[9] = ia0->as_u8[1];
+ o->data[10] = ia0->as_u8[2];
+ o->data[11] = ia0->as_u8[3];
+ o->data[12] = 0xFF;
+ if (opt82_oui !=0 || opt82_fib_id != 0)
+ {
+ o->data[12] = 151; /* vss suboption */
+ if (255 == opt82_fib_id) {
+ o->data[13] = 1; /* length */
+ o->data[14] = 255; /* vss option type */
+ o->data[15] = 152; /* vss control suboption */
+ o->data[16] = 0; /* length */
+ /* and a new "end-of-options" option (0xff) */
+ o->data[17] = 0xFF;
+ o->length += 5;
+ } else {
+ o->data[13] = 8; /* length */
+ o->data[14] = 1; /* vss option type */
+ o->data[15] = (opt82_oui >> 16) & 0xff;
+ o->data[16] = (opt82_oui >> 8) & 0xff;
+ o->data[17] = (opt82_oui ) & 0xff;
+ o->data[18] = (opt82_fib_id >> 24) & 0xff;
+ o->data[19] = (opt82_fib_id >> 16) & 0xff;
+ o->data[20] = (opt82_fib_id >> 8) & 0xff;
+ o->data[21] = (opt82_fib_id) & 0xff;
+ o->data[22] = 152; /* vss control suboption */
+ o->data[23] = 0; /* length */
+
+ /* and a new "end-of-options" option (0xff) */
+ o->data[24] = 0xFF;
+ o->length += 12;
+ }
+ }
+
+ len = o->length + 3;
+ b0->current_length += len;
+ /* Fix IP header length and checksum */
+ old_l0 = ip0->length;
+ new_l0 = clib_net_to_host_u16 (old_l0);
+ new_l0 += len;
+ new_l0 = clib_host_to_net_u16 (new_l0);
+ ip0->length = new_l0;
+ sum0 = ip0->checksum;
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+
+ /* Fix UDP length */
+ new_l0 = clib_net_to_host_u16 (u0->length);
+ new_l0 += len;
+ u0->length = clib_host_to_net_u16 (new_l0);
+ } else {
+ vlib_node_increment_counter
+ (vm, dhcp_proxy_to_server_node.index,
+ DHCP_PROXY_ERROR_OPTION_82_ERROR, 1);
+ }
+ }
+
+ next0 = DHCP_PROXY_TO_SERVER_INPUT_NEXT_LOOKUP;
+
+ do_trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ dhcp_proxy_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->which = 0; /* to server */
+ tr->error = error0;
+ tr->original_sw_if_index = original_sw_if_index;
+ tr->sw_if_index = sw_if_index;
+ if (next0 == DHCP_PROXY_TO_SERVER_INPUT_NEXT_LOOKUP)
+ tr->trace_ip4_address.as_u32 = server->dhcp_server.as_u32;
+ }
+
+ do_enqueue:
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, dhcp_proxy_to_server_node.index,
+ DHCP_PROXY_ERROR_RELAY_TO_CLIENT,
+ pkts_to_client);
+ vlib_node_increment_counter (vm, dhcp_proxy_to_server_node.index,
+ DHCP_PROXY_ERROR_RELAY_TO_SERVER,
+ pkts_to_server);
+ vlib_node_increment_counter (vm, dhcp_proxy_to_server_node.index,
+ DHCP_PROXY_ERROR_NO_SERVER,
+ pkts_no_server);
+ vlib_node_increment_counter (vm, dhcp_proxy_to_server_node.index,
+ DHCP_PROXY_ERROR_NO_INTERFACE_ADDRESS,
+ pkts_no_interface_address);
+ vlib_node_increment_counter (vm, dhcp_proxy_to_server_node.index,
+ DHCP_PROXY_ERROR_PKT_TOO_BIG,
+ pkts_too_big);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (dhcp_proxy_to_server_node) = {
+ .function = dhcp_proxy_to_server_input,
+ .name = "dhcp-proxy-to-server",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = DHCP_PROXY_N_ERROR,
+ .error_strings = dhcp_proxy_error_strings,
+
+ .n_next_nodes = DHCP_PROXY_TO_SERVER_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [DHCP_PROXY_TO_SERVER_INPUT_NEXT_##s] = n,
+ foreach_dhcp_proxy_to_server_input_next
+#undef _
+ },
+
+ .format_buffer = format_dhcp_proxy_header_with_length,
+ .format_trace = format_dhcp_proxy_trace,
+#if 0
+ .unformat_buffer = unformat_dhcp_proxy_header,
+#endif
+};
+
+static uword
+dhcp_proxy_to_client_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, * from;
+ ethernet_main_t *em = ethernet_get_main (vm);
+ dhcp_proxy_main_t * dpm = &dhcp_proxy_main;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im = &ip4_main;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ udp_header_t * u0;
+ dhcp_header_t * h0;
+ ip4_header_t * ip0 = 0;
+ ip4_address_t * ia0 = 0;
+ u32 old0, new0;
+ ip_csum_t sum0;
+ ethernet_interface_t *ei0;
+ ethernet_header_t *mac0;
+ vnet_hw_interface_t *hi0;
+ vlib_frame_t *f0;
+ u32 * to_next0;
+ u32 sw_if_index = ~0;
+ vnet_sw_interface_t *si0;
+ u32 error0 = (u32)~0;
+ vnet_sw_interface_t *swif;
+ u32 server_index;
+ u32 fib_index;
+ dhcp_server_t * server;
+ u32 original_sw_if_index = (u32) ~0;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ /*
+ * udp_local hands us the DHCP header, need udp hdr,
+ * ip hdr to relay to client
+ */
+ vlib_buffer_advance (b0, -(sizeof(*u0)));
+ u0 = vlib_buffer_get_current (b0);
+
+ vlib_buffer_advance (b0, -(sizeof(*ip0)));
+ ip0 = vlib_buffer_get_current (b0);
+
+ /* Consumed by dhcp client code? */
+ if (dhcp_client_for_us (bi0, b0, ip0, u0, h0))
+ continue;
+
+ if (1 /* dpm->insert_option_82 */)
+ {
+ dhcp_option_t *o = (dhcp_option_t *) h0->options;
+ dhcp_option_t *sub;
+
+ /* Parse through TLVs looking for option 82.
+ The circuit ID suboption carries the RX sw_if_index
+ we need to track down the client-facing interface */
+
+ while (o->option != 0xFF /* end of options */ &&
+ (u8 *) o < (b0->data + b0->current_data + b0->current_length))
+ {
+ if (o->option == 82)
+ {
+ u32 vss_exist = 0;
+ u32 vss_ctrl = 0;
+ sub = (dhcp_option_t *) &o->data[0];
+ while (sub->option != 0xFF /* end of options */ &&
+ (u8 *) sub < ((u8 *) o->data) + o->length) {
+ /* If this is one of ours, it will have
+ total length 12, circuit-id suboption type,
+ and the sw_if_index */
+ if (sub->option == 1 && sub->length == 4)
+ {
+ sw_if_index = (o->data[2] << 24)
+ | (o->data[3] << 16)
+ | (o->data[4] << 8)
+ | (o->data[5]);
+ } else if (sub->option == 151 &&
+ sub->length == 7 &&
+ sub->data[0] == 1)
+ vss_exist = 1;
+ else if (sub->option == 152 && sub->length == 0)
+ vss_ctrl = 1;
+ sub = (dhcp_option_t *)
+ (((uword) sub) + (sub->length + 2));
+ }
+ if (vss_ctrl && vss_exist)
+ vlib_node_increment_counter
+ (vm, dhcp_proxy_to_client_node.index,
+ DHCP_PROXY_ERROR_OPTION_82_VSS_NOT_PROCESSED, 1);
+
+ }
+ o = (dhcp_option_t *) (((uword) o) + (o->length + 2));
+ }
+ }
+
+ if (sw_if_index == (u32)~0)
+ {
+ error0 = DHCP_PROXY_ERROR_NO_OPTION_82;
+
+ drop_packet:
+ vlib_node_increment_counter (vm, dhcp_proxy_to_client_node.index,
+ error0, 1);
+ f0 = vlib_get_frame_to_node (vm, dpm->error_drop_node_index);
+ to_next0 = vlib_frame_vector_args (f0);
+ to_next0[0] = bi0;
+ f0->n_vectors = 1;
+ vlib_put_frame_to_node (vm, dpm->error_drop_node_index, f0);
+ goto do_trace;
+ }
+
+
+ if (sw_if_index >= vec_len (im->fib_index_by_sw_if_index))
+ {
+ error0 = DHCP_PROXY_ERROR_BAD_OPTION_82;
+ goto drop_packet;
+ }
+
+ fib_index = im->fib_index_by_sw_if_index [sw_if_index];
+
+ if (fib_index < vec_len(dpm->dhcp_server_index_by_rx_fib_index))
+ server_index = dpm->dhcp_server_index_by_rx_fib_index[fib_index];
+ else
+ server_index = 0;
+
+ if (PREDICT_FALSE (pool_is_free_index (dpm->dhcp_servers,
+ server_index)))
+ {
+ error0 = DHCP_PROXY_ERROR_BAD_OPTION_82;
+ goto drop_packet;
+ }
+
+ server = pool_elt_at_index (dpm->dhcp_servers, server_index);
+ if (server->valid == 0)
+ {
+ error0 = DHCP_PROXY_ERROR_NO_SERVER;
+ goto drop_packet;
+ }
+
+ if (ip0->src_address.as_u32 != server->dhcp_server.as_u32)
+ {
+ error0 = DHCP_PROXY_ERROR_BAD_SVR_FIB_OR_ADDRESS;
+ goto drop_packet;
+ }
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index;
+
+ swif = vnet_get_sw_interface (vnm, sw_if_index);
+ original_sw_if_index = sw_if_index;
+ if (swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED)
+ sw_if_index = swif->unnumbered_sw_if_index;
+
+ ia0 = ip4_interface_first_address (&ip4_main, sw_if_index);
+ if (ia0 == 0)
+ {
+ error0 = DHCP_PROXY_ERROR_NO_INTERFACE_ADDRESS;
+ goto drop_packet;
+ }
+
+ u0->checksum = 0;
+ u0->dst_port = clib_net_to_host_u16 (UDP_DST_PORT_dhcp_to_client);
+ sum0 = ip0->checksum;
+ old0 = ip0->dst_address.as_u32;
+ new0 = 0xFFFFFFFF;
+ ip0->dst_address.as_u32 = new0;
+ sum0 = ip_csum_update (sum0, old0, new0,
+ ip4_header_t /* structure */,
+ dst_address /* offset of changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+
+ sum0 = ip0->checksum;
+ old0 = ip0->src_address.as_u32;
+ new0 = ia0->as_u32;
+ ip0->src_address.as_u32 = new0;
+ sum0 = ip_csum_update (sum0, old0, new0,
+ ip4_header_t /* structure */,
+ src_address /* offset of changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+
+ vlib_buffer_advance (b0, -(sizeof(ethernet_header_t)));
+ si0 = vnet_get_sw_interface (vnm, original_sw_if_index);
+ if (si0->type == VNET_SW_INTERFACE_TYPE_SUB)
+ vlib_buffer_advance (b0, -4 /* space for VLAN tag */);
+
+ mac0 = vlib_buffer_get_current (b0);
+
+ hi0 = vnet_get_sup_hw_interface (vnm, original_sw_if_index);
+ ei0 = pool_elt_at_index (em->interfaces, hi0->hw_instance);
+ memcpy (mac0->src_address, ei0->address, sizeof (ei0->address));
+ memset (mac0->dst_address, 0xff, sizeof (mac0->dst_address));
+ mac0->type = (si0->type == VNET_SW_INTERFACE_TYPE_SUB) ?
+ clib_host_to_net_u16 (0x8100) : clib_host_to_net_u16 (0x0800);
+
+ if (si0->type == VNET_SW_INTERFACE_TYPE_SUB)
+ {
+ u32 * vlan_tag = (u32 *)(mac0+1);
+ u32 tmp;
+ tmp = (si0->sub.id << 16) | 0x0800;
+ *vlan_tag = clib_host_to_net_u32 (tmp);
+ }
+
+ /* $$$ This needs to be rewritten, for sure */
+ f0 = vlib_get_frame_to_node (vm, hi0->output_node_index);
+ to_next0 = vlib_frame_vector_args (f0);
+ to_next0[0] = bi0;
+ f0->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi0->output_node_index, f0);
+
+ do_trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ dhcp_proxy_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->which = 1; /* to client */
+ tr->trace_ip4_address.as_u32 = ia0 ? ia0->as_u32 : 0;
+ tr->error = error0;
+ tr->original_sw_if_index = original_sw_if_index;
+ tr->sw_if_index = sw_if_index;
+ }
+ }
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (dhcp_proxy_to_client_node) = {
+ .function = dhcp_proxy_to_client_input,
+ .name = "dhcp-proxy-to-client",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = DHCP_PROXY_N_ERROR,
+ .error_strings = dhcp_proxy_error_strings,
+ .format_buffer = format_dhcp_proxy_header_with_length,
+ .format_trace = format_dhcp_proxy_trace,
+#if 0
+ .unformat_buffer = unformat_dhcp_proxy_header,
+#endif
+};
+
+clib_error_t * dhcp_proxy_init (vlib_main_t * vm)
+{
+ dhcp_proxy_main_t * dm = &dhcp_proxy_main;
+ vlib_node_t * error_drop_node;
+ dhcp_server_t * server;
+
+ dm->vlib_main = vm;
+ dm->vnet_main = vnet_get_main();
+ error_drop_node = vlib_get_node_by_name (vm, (u8 *) "error-drop");
+ dm->error_drop_node_index = error_drop_node->index;
+
+ dm->opt82vss_index_by_vrf_id = hash_create (0, sizeof (uword));
+
+ udp_register_dst_port (vm, UDP_DST_PORT_dhcp_to_client,
+ dhcp_proxy_to_client_node.index, 1 /* is_ip4 */);
+
+ udp_register_dst_port (vm, UDP_DST_PORT_dhcp_to_server,
+ dhcp_proxy_to_server_node.index, 1 /* is_ip4 */);
+
+ /* Create the default server, don't mark it valid */
+ pool_get (dm->dhcp_servers, server);
+ memset (server, 0, sizeof (*server));
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dhcp_proxy_init);
+
+int dhcp_proxy_set_server_2 (ip4_address_t *addr, ip4_address_t *src_address,
+ u32 rx_fib_id,
+ u32 server_fib_id,
+ int insert_option_82, int is_del)
+{
+ dhcp_proxy_main_t * dpm = &dhcp_proxy_main;
+ ip4_main_t * im = &ip4_main;
+ dhcp_server_t * server = 0;
+ ip4_fib_t *rx_fib, *server_fib;
+ u32 server_index = 0;
+ u32 rx_fib_index = 0;
+
+ if (addr->as_u32 == 0)
+ return VNET_API_ERROR_INVALID_DST_ADDRESS;
+
+ if (src_address->as_u32 == 0)
+ return VNET_API_ERROR_INVALID_SRC_ADDRESS;
+
+ rx_fib = find_ip4_fib_by_table_index_or_id
+ (&ip4_main, rx_fib_id, IP4_ROUTE_FLAG_TABLE_ID);
+
+ if (rx_fib == 0)
+ return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+
+ server_fib = find_ip4_fib_by_table_index_or_id
+ (&ip4_main, server_fib_id, IP4_ROUTE_FLAG_TABLE_ID);
+
+ if (server_fib == 0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ if (rx_fib_id == 0)
+ {
+ server = pool_elt_at_index (dpm->dhcp_servers, 0);
+
+ if (is_del)
+ {
+ memset (server, 0, sizeof (*server));
+ return 0;
+ }
+ goto initialize_it;
+ }
+
+ rx_fib_index = rx_fib - im->fibs;
+
+ if (is_del)
+ {
+ if (rx_fib_index >= vec_len(dpm->dhcp_server_index_by_rx_fib_index))
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ server_index = dpm->dhcp_server_index_by_rx_fib_index[rx_fib_index];
+ ASSERT(server_index > 0);
+
+ /* Use the default server again. */
+ dpm->dhcp_server_index_by_rx_fib_index[rx_fib_index] = 0;
+ server = pool_elt_at_index (dpm->dhcp_servers, server_index);
+ memset (server, 0, sizeof (*server));
+ pool_put (dpm->dhcp_servers, server);
+ return 0;
+ }
+
+ if (rx_fib_index < vec_len(dpm->dhcp_server_index_by_rx_fib_index))
+ {
+ server_index = dpm->dhcp_server_index_by_rx_fib_index[rx_fib_index];
+ if (server_index != 0)
+ {
+ server = pool_elt_at_index (dpm->dhcp_servers, server_index);
+ goto initialize_it;
+ }
+ }
+
+ pool_get (dpm->dhcp_servers, server);
+
+ initialize_it:
+
+ server->dhcp_server.as_u32 = addr->as_u32;
+ server->server_fib_index = server_fib - im->fibs;
+ server->dhcp_src_address.as_u32 = src_address->as_u32;
+ server->insert_option_82 = insert_option_82;
+ server->valid = 1;
+ if (rx_fib_index)
+ {
+ vec_validate (dpm->dhcp_server_index_by_rx_fib_index, rx_fib_index);
+ dpm->dhcp_server_index_by_rx_fib_index[rx_fib_index] =
+ server - dpm->dhcp_servers;
+ }
+
+ return 0;
+}
+
+/* Old API, manipulates the default server (only) */
+int dhcp_proxy_set_server (ip4_address_t *addr, ip4_address_t *src_address,
+ u32 fib_id, int insert_option_82, int is_del)
+{
+ return dhcp_proxy_set_server_2 (addr, src_address, 0 /* rx_fib_id */,
+ fib_id /* server_fib_id */,
+ insert_option_82, is_del);
+}
+
+
+static clib_error_t *
+dhcp_proxy_set_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip4_address_t server_addr, src_addr;
+ u32 server_fib_id = 0, rx_fib_id = 0;
+ int is_del = 0;
+ int add_option_82 = 0;
+ int set_src = 0, set_server = 0;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "server %U",
+ unformat_ip4_address, &server_addr))
+ set_server = 1;
+ else if (unformat (input, "server-fib-id %d", &server_fib_id))
+ ;
+ else if (unformat (input, "rx-fib-id %d", &rx_fib_id))
+ ;
+ else if (unformat(input, "src-address %U",
+ unformat_ip4_address, &src_addr))
+ set_src = 1;
+ else if (unformat (input, "add-option-82")
+ || unformat (input, "insert-option-82"))
+ add_option_82 = 1;
+ else if (unformat (input, "delete") ||
+ unformat (input, "del"))
+ is_del = 1;
+ else
+ break;
+ }
+
+ if (is_del || (set_server && set_src))
+ {
+ int rv;
+
+ rv = dhcp_proxy_set_server_2 (&server_addr, &src_addr, rx_fib_id,
+ server_fib_id, add_option_82, is_del);
+ switch (rv)
+ {
+ case 0:
+ return 0;
+
+ case VNET_API_ERROR_INVALID_DST_ADDRESS:
+ return clib_error_return (0, "Invalid server address");
+
+ case VNET_API_ERROR_INVALID_SRC_ADDRESS:
+ return clib_error_return (0, "Invalid src address");
+
+ case VNET_API_ERROR_NO_SUCH_INNER_FIB:
+ return clib_error_return (0, "No such rx fib id %d", rx_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "No such server fib id %d",
+ server_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return
+ (0, "Fib id %d: no per-fib DHCP server configured", rx_fib_id);
+
+ default:
+ return clib_error_return (0, "BUG: rv %d", rv);
+ }
+ }
+ else
+ return clib_error_return (0, "parse error`%U'",
+ format_unformat_error, input);
+}
+
+VLIB_CLI_COMMAND (dhcp_proxy_set_command, static) = {
+ .path = "set dhcp proxy",
+ .short_help = "set dhcp proxy [del] server <ip-addr> src-address <ip-addr> [add-option-82] [server-fib-id <n>] [rx-fib-id <n>]",
+ .function = dhcp_proxy_set_command_fn,
+};
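
With the command registered, a per-VRF configuration and its matching delete might look like the following (all addresses and FIB ids illustrative):

    set dhcp proxy server 10.0.0.1 src-address 10.0.0.2 server-fib-id 0 rx-fib-id 10 add-option-82
    set dhcp proxy del server 10.0.0.1 src-address 10.0.0.2 rx-fib-id 10
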
+
+u8 * format_dhcp_proxy_server (u8 * s, va_list * args)
+{
+ dhcp_proxy_main_t * dm = va_arg (*args, dhcp_proxy_main_t *);
+ dhcp_server_t * server = va_arg (*args, dhcp_server_t *);
+ u32 rx_fib_index = va_arg (*args, u32);
+ ip4_fib_t * rx_fib, * server_fib;
+ u32 server_fib_id = ~0, rx_fib_id = ~0;
+
+ if (dm == 0)
+ {
+ s = format (s, "%=16s%=16s%=14s%=14s%=20s", "Server", "Src Address",
+ "Server FIB", "RX FIB", "Insert Option 82");
+ return s;
+ }
+
+ server_fib = find_ip4_fib_by_table_index_or_id
+ (&ip4_main, server->server_fib_index, IP4_ROUTE_FLAG_FIB_INDEX);
+
+ if (server_fib)
+ server_fib_id = server_fib->table_id;
+
+ rx_fib = find_ip4_fib_by_table_index_or_id
+ (&ip4_main, rx_fib_index, IP4_ROUTE_FLAG_FIB_INDEX);
+
+ if (rx_fib)
+ rx_fib_id = rx_fib->table_id;
+
+ s = format (s, "%=16U%=16U%=14u%=14u%=20s",
+ format_ip4_address, &server->dhcp_server,
+ format_ip4_address, &server->dhcp_src_address,
+ server_fib_id, rx_fib_id,
+ server->insert_option_82 ? "yes" : "no");
+ return s;
+}
+
+static clib_error_t *
+dhcp_proxy_show_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dhcp_proxy_main_t * dpm = &dhcp_proxy_main;
+ ip4_main_t * im = &ip4_main;
+ dhcp_server_t * server;
+ u32 server_index;
+ int i;
+
+ vlib_cli_output (vm, "%U", format_dhcp_proxy_server, 0 /* header line */,
+ 0, 0);
+
+ for (i = 0; i < vec_len (im->fibs); i++)
+ {
+ if (i < vec_len(dpm->dhcp_server_index_by_rx_fib_index))
+ server_index = dpm->dhcp_server_index_by_rx_fib_index[i];
+ else
+ server_index = 0;
+ server = pool_elt_at_index (dpm->dhcp_servers, server_index);
+ if (server->valid)
+ vlib_cli_output (vm, "%U", format_dhcp_proxy_server, dpm,
+ server, i);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcp_proxy_show_command, static) = {
+ .path = "show dhcp proxy",
+ .short_help = "Display dhcp proxy server info",
+ .function = dhcp_proxy_show_command_fn,
+};
+
+
+int dhcp_proxy_set_option82_vss( u32 vrf_id,
+ u32 oui,
+ u32 fib_id,
+ int is_del)
+{
+ dhcp_proxy_main_t *dm = &dhcp_proxy_main;
+ uword *p;
+ vss_info *a;
+ u32 old_oui=0, old_fib_id=0;
+
+ p = hash_get (dm->opt82vss_index_by_vrf_id, vrf_id);
+
+ if (p)
+ {
+ a = pool_elt_at_index (dm->opt82vss, p[0]);
+ if (!a)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+ old_oui = a->vpn_id.oui;
+ old_fib_id = a->vpn_id.fib_id;
+
+ if (is_del)
+ {
+ if (old_oui == oui &&
+ old_fib_id == fib_id)
+ {
+ pool_put(dm->opt82vss, a);
+ hash_unset (dm->opt82vss_index_by_vrf_id, vrf_id);
+ return 0;
+ }
+ else
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ }
+ pool_put(dm->opt82vss, a);
+ hash_unset (dm->opt82vss_index_by_vrf_id, vrf_id);
+ } else if (is_del)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ pool_get (dm->opt82vss, a);
+ memset (a, ~0, sizeof (a[0]));
+ a->vpn_id.oui = oui;
+ a->vpn_id.fib_id = fib_id;
+ hash_set (dm->opt82vss_index_by_vrf_id, vrf_id, a - dm->opt82vss);
+
+ return 0;
+}
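
The pool-plus-hash pattern above recurs throughout this patch: the pool owns the vss_info storage while the hash maps a VRF id to a pool index, so the two must always be updated together. A condensed sketch of the idiom using only vppinfra (entry_t and both statics are illustrative; the hash is assumed to have been created at init with hash_create (0, sizeof (uword))):

    #include <vppinfra/pool.h>
    #include <vppinfra/hash.h>

    typedef struct { u32 oui, fib_id; } entry_t;

    static entry_t *entries;      /* pool owning the storage */
    static uword *index_by_vrf;   /* vrf_id -> pool index */

    static void
    entry_set (u32 vrf_id, u32 oui, u32 fib_id)
    {
      uword *p = hash_get (index_by_vrf, vrf_id);
      entry_t *e;

      if (p)   /* replacing: both the pool element and the mapping go */
        {
          pool_put_index (entries, p[0]);
          hash_unset (index_by_vrf, vrf_id);
        }
      pool_get (entries, e);
      e->oui = oui;
      e->fib_id = fib_id;
      hash_set (index_by_vrf, vrf_id, e - entries);
    }
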
+
+static clib_error_t *
+dhcp_option_82_vss_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int is_del = 0, got_new_vpn_id=0;
+ u32 oui=0, fib_id=0, tbl_id=~0;
+
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+
+ if (unformat(input, "delete") || unformat(input, "del"))
+ is_del = 1;
+ else if (unformat (input, "oui %d", &oui))
+ got_new_vpn_id = 1;
+ else if (unformat (input, "vpn-id %d", &fib_id))
+ got_new_vpn_id = 1;
+ else if (unformat (input, "table %d", &tbl_id))
+ got_new_vpn_id = 1;
+ else
+ break;
+ }
+ if (tbl_id == ~0)
+ return clib_error_return (0, "no table ID specified.");
+
+ if (is_del || got_new_vpn_id)
+ {
+ int rv;
+ rv = dhcp_proxy_set_option82_vss(tbl_id, oui, fib_id, is_del);
+ switch (rv)
+ {
+ case 0:
+ return 0;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "option 82 vss(oui:%d, vpn-id:%d) not found in table %d",
+ oui, fib_id, tbl_id);
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "option 82 vss for table %d not found in in pool.",
+ tbl_id);
+ default:
+ return clib_error_return (0, "BUG: rv %d", rv);
+ }
+ }
+ else
+ return clib_error_return (0, "parse error`%U'",
+ format_unformat_error, input);
+}
+
+VLIB_CLI_COMMAND (dhcp_proxy_vss_command,static) = {
+ .path = "set dhcp option-82 vss",
+ .short_help = "set dhcp option-82 vss [del] table <table id> oui <oui> vpn-id <vpn-id>",
+ .function = dhcp_option_82_vss_fn,
+};
+
+
+static clib_error_t *
+dhcp_vss_show_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+
+{
+ dhcp_proxy_main_t * dm = &dhcp_proxy_main;
+ vss_info *v;
+ u32 oui;
+ u32 fib_id;
+ u32 tbl_id;
+ uword index;
+
+ vlib_cli_output (vm, "%=9s%=11s%=12s","Table", "OUI", "VPN-ID");
+ hash_foreach (tbl_id, index, dm->opt82vss_index_by_vrf_id,
+ ({
+ v = pool_elt_at_index (dm->opt82vss, index);
+ oui = v->vpn_id.oui;
+ fib_id = v->vpn_id.fib_id;
+ vlib_cli_output (vm, "%=9d 0x%08x%=12d",
+ tbl_id, oui, fib_id);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcp_proxy_vss_show_command, static) = {
+ .path = "show dhcp vss",
+ .short_help = "show dhcp VSS",
+ .function = dhcp_vss_show_command_fn,
+};
+
+static clib_error_t *
+dhcp_option_82_address_show_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+
+{
+ dhcp_proxy_main_t *dm = &dhcp_proxy_main;
+ vnet_main_t *vnm = vnet_get_main();
+ u32 sw_if_index0=0, sw_if_index;
+ ip4_address_t *ia0;
+ vnet_sw_interface_t *swif;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+
+ if (unformat(input, "%U",
+ unformat_vnet_sw_interface, dm->vnet_main, &sw_if_index0))
+ {
+ swif = vnet_get_sw_interface (vnm, sw_if_index0);
+ sw_if_index = (swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED) ?
+ swif->unnumbered_sw_if_index : sw_if_index0;
+ ia0 = ip4_interface_first_address(&ip4_main, sw_if_index);
+ if (ia0)
+ {
+ vlib_cli_output (vm, "%=20s%=20s", "interface",
+ "source IP address");
+
+ vlib_cli_output (vm, "%=20U%=20U",
+ format_vnet_sw_if_index_name,
+ dm->vnet_main, sw_if_index0,
+ format_ip4_address, ia0);
+ }
+ else
+ vlib_cli_output (vm, "%=34s %=20U",
+ "No IPv4 address configured on",
+ format_vnet_sw_if_index_name,
+ dm->vnet_main, sw_if_index);
+ }
+ else
+ break;
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcp_proxy_address_show_command,static) = {
+ .path = "show dhcp option-82-address interface",
+ .short_help = "show dhcp option-82-address interface <interface>",
+ .function = dhcp_option_82_address_show_command_fn,
+};
diff --git a/vnet/vnet/dhcpv6/packet.h b/vnet/vnet/dhcpv6/packet.h
new file mode 100644
index 00000000000..8634b5d8e9b
--- /dev/null
+++ b/vnet/vnet/dhcpv6/packet.h
@@ -0,0 +1,183 @@
+#ifndef included_vnet_dhcp_packet_h
+#define included_vnet_dhcp_packet_h
+
+/*
+ * DHCP packet format
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/ip/ip6_packet.h>
+
+// #define DHCP_VRF_NAME_MAX_LEN L3VM_MAX_NAME_STR_LEN
+// #define DHCPV6_MAX_VRF_NAME_LEN L3VM_MAX_NAME_STR_LEN
+#define DHCP_MAX_RELAY_ADDR 16
+#define PROTO_UDP 17
+#define DHCPV6_CLIENT_PORT 546
+#define DHCPV6_SERVER_PORT 547
+#define HOP_COUNT_LIMIT 32
+#define DHCPV6_CISCO_ENT_NUM 9
+
+/*
+ * DHCPv6 message types
+ */
+typedef enum dhcpv6_msg_type_{
+ DHCPV6_MSG_SOLICIT = 1,
+ DHCPV6_MSG_ADVERTISE = 2,
+ DHCPV6_MSG_REQUEST = 3,
+ DHCPV6_MSG_CONFIRM = 4,
+ DHCPV6_MSG_RENEW = 5,
+ DHCPV6_MSG_REBIND = 6,
+ DHCPV6_MSG_REPLY = 7,
+ DHCPV6_MSG_RELEASE = 8,
+ DHCPV6_MSG_DECLINE = 9,
+ DHCPV6_MSG_RECONFIGURE = 10,
+ DHCPV6_MSG_INFORMATION_REQUEST = 11,
+ DHCPV6_MSG_RELAY_FORW = 12,
+ DHCPV6_MSG_RELAY_REPL = 13,
+} dhcpv6_msg_type_t;
+
+/*
+ * DHCPv6 options types
+ */
+enum {
+ DHCPV6_OPTION_CLIENTID = 1,
+ DHCPV6_OPTION_SERVERID = 2,
+ DHCPV6_OPTION_IA_NA = 3,
+ DHCPV6_OPTION_IA_TA = 4,
+ DHCPV6_OPTION_IAADDR = 5,
+ DHCPV6_OPTION_ORO = 6,
+ DHCPV6_OPTION_PREFERENCE = 7,
+ DHCPV6_OPTION_ELAPSED_TIME = 8,
+ DHCPV6_OPTION_RELAY_MSG = 9,
+ DHCPV6_OPTION_AUTH = 11,
+ DHCPV6_OPTION_UNICAST = 12,
+ DHCPV6_OPTION_STATUS_CODE = 13,
+ DHCPV6_OPTION_RAPID_COMMIT = 14,
+ DHCPV6_OPTION_USER_CLASS = 15,
+ DHCPV6_OPTION_VENDOR_CLASS = 16,
+ DHCPV6_OPTION_VENDOR_OPTS = 17,
+ DHCPV6_OPTION_INTERFACE_ID = 18, // relay agent fills this
+ DHCPV6_OPTION_RECONF_MSG = 19,
+ DHCPV6_OPTION_RECONF_ACCEPT = 20,
+ DHCPV6_OPTION_REMOTEID = 37, // relay agent fills this
+ DHCPV6_OPTION_VSS = 68, // relay agent fills this
+ DHCPV6_OPTION_CLIENT_LINK_LAYER_ADDRESS = 79,
+ DHCPV6_OPTION_MAX
+};
+
+/*
+* DHCPv6 status codes
+ */
+enum {
+ DHCPV6_STATUS_SUCCESS = 0,
+ DHCPV6_STATUS_UNSPEC_FAIL = 1,
+ DHCPV6_STATUS_NOADDRS_AVAIL = 2,
+ DHCPV6_STATUS_NO_BINDING = 3,
+ DHCPV6_STATUS_NOT_ONLINK = 4,
+ DHCPV6_STATUS_USE_MULTICAST = 5,
+};
+
+/*
+ * DHCPv6 DUID types
+ */
+enum {
+ DHCPV6_DUID_LLT = 1, /* DUID Based on Link-layer Address Plus Time */
+ DHCPV6_DUID_EN = 2, /* DUID Based on Enterprise Number */
+ DHCPV6_DUID_LL = 3, /* DUID Based on Link-layer Address */
+};
+
+//Structure for DHCPv6 payload from client
+typedef struct dhcpv6_hdr_ {
+ union {
+    u8 msg_type; //DHCP msg type; first byte on the wire
+    u32 xid; // transaction id; its top wire byte overlays msg_type
+  } u;
+ u8 data[0];
+} dhcpv6_header_t;
+
+
+
+typedef CLIB_PACKED (struct dhcpv6_relay_ctx_ {
+ dhcpv6_header_t *pkt;
+ u32 pkt_len;
+    u32 dhcpv6_len; //DHCPv6 payload length
+// if_ordinal iod;
+ u32 if_index;
+ u32 ctx_id;
+ char ctx_name[32+1];
+ u8 dhcp_msg_type;
+}) dhcpv6_relay_ctx_t;
+
+//Structure for DHCPv6 RELAY-FORWARD and DHCPv6 RELAY-REPLY pkts
+typedef CLIB_PACKED (struct dhcpv6_relay_hdr_ {
+ u8 msg_type;
+ u8 hop_count;
+ ip6_address_t link_addr;
+ ip6_address_t peer_addr;
+ u8 data[0];
+}) dhcpv6_relay_hdr_t;
+
+typedef enum dhcp_stats_action_type_ {
+ DHCP_STATS_ACTION_FORWARDED=1,
+ DHCP_STATS_ACTION_RECEIVED,
+ DHCP_STATS_ACTION_DROPPED
+} dhcp_stats_action_type_t;
+//Generic counters for a packet
+typedef struct dhcp_stats_counters_ {
+ u64 rx_pkts; //counter for received pkts
+ u64 tx_pkts; //counter for forwarded pkts
+ u64 drops; //counter for dropped pkts
+} dhcp_stats_counters_t;
+
+
+typedef enum dhcpv6_stats_drop_reason_ {
+ DHCPV6_RELAY_PKT_DROP_RELAYDISABLE = 1,
+ DHCPV6_RELAY_PKT_DROP_MAX_HOPS,
+ DHCPV6_RELAY_PKT_DROP_VALIDATION_FAIL,
+ DHCPV6_RELAY_PKT_DROP_UNKNOWN_OP_INTF,
+ DHCPV6_RELAY_PKT_DROP_BAD_CONTEXT,
+ DHCPV6_RELAY_PKT_DROP_OPT_INSERT_FAIL,
+ DHCPV6_RELAY_PKT_DROP_REPLY_FROM_CLIENT,
+} dhcpv6_stats_drop_reason_t;
+
+typedef CLIB_PACKED (struct {
+ u16 option;
+ u16 length;
+ u8 data[0];
+}) dhcpv6_option_t;
+
+typedef CLIB_PACKED (struct {
+ dhcpv6_option_t opt;
+ u32 int_idx;
+}) dhcpv6_int_id_t;
+
+typedef CLIB_PACKED (struct {
+ dhcpv6_option_t opt;
+ u8 data[8]; // data[0]:type, data[1..7]: VPN ID
+}) dhcpv6_vss_t;
+
+typedef CLIB_PACKED (struct {
+ dhcpv6_option_t opt;
+ u32 ent_num;
+ u32 rmt_id;
+}) dhcpv6_rmt_id_t;
+
+typedef CLIB_PACKED (struct {
+ dhcpv6_option_t opt;
+ u16 link_type;
+  u8 data[6]; // data[0..5]: MAC address
+}) dhcpv6_client_mac_t;
+
+
+#endif /* included_vnet_dhcp_packet_h */
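
Every option type in this header shares the dhcpv6_option_t prefix: a 16-bit option code and a 16-bit payload length, both in network byte order, followed by `length` bytes of data. Walking a buffer of options therefore means stepping by sizeof (dhcpv6_option_t) + length each time, which is exactly what the relay's reply path does later in this patch. A standalone sketch of the walk (plain C; the Interface-ID option built in main is illustrative):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint16_t option, length; uint8_t data[]; } tlv_t;

    static void
    walk_options (const uint8_t *buf, size_t len)
    {
      const uint8_t *p = buf, *end = buf + len;

      while (p + sizeof (tlv_t) <= end)
        {
          const tlv_t *o = (const tlv_t *) p;
          printf ("option %u, %u byte(s)\n",
                  ntohs (o->option), ntohs (o->length));
          p += sizeof (tlv_t) + ntohs (o->length);
        }
    }

    int
    main (void)
    {
      /* one Interface-ID option (code 18) carrying a 4-byte ifindex */
      struct { uint16_t option, length; uint8_t data[4]; } opt =
        { htons (18), htons (4), { 0, 0, 0, 42 } };

      walk_options ((const uint8_t *) &opt, sizeof (opt));
      return 0;
    }
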
diff --git a/vnet/vnet/dhcpv6/proxy.h b/vnet/vnet/dhcpv6/proxy.h
new file mode 100644
index 00000000000..bc10d7ce17e
--- /dev/null
+++ b/vnet/vnet/dhcpv6/proxy.h
@@ -0,0 +1,88 @@
+/*
+ * proxy.h: dhcp proxy
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_dhcpv6_proxy_h
+#define included_dhcpv6_proxy_h
+
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/udp.h>
+#include <vnet/dhcpv6/packet.h>
+
+typedef enum {
+#define dhcpv6_proxy_error(n,s) DHCPV6_PROXY_ERROR_##n,
+#include <vnet/dhcpv6/proxy_error.def>
+#undef dhcpv6_proxy_error
+ DHCPV6_PROXY_N_ERROR,
+} dhcpv6_proxy_error_t;
+
+typedef struct {
+ u32 oui;
+ u32 fib_id;
+} dhcpv6_vss_id;
+
+typedef union {
+ u8 as_u8[8];
+ dhcpv6_vss_id vpn_id;
+} dhcpv6_vss_info;
+
+typedef struct {
+  /* server to which we relay. $$$ vector / pool someday */
+ ip6_address_t dhcpv6_server;
+
+ /* FIB index */
+ u32 server_fib_index;
+
+ /* source address to paste into relayed pkts */
+ ip6_address_t dhcpv6_src_address;
+
+ /* all DHCP servers address */
+ ip6_address_t all_dhcpv6_server_address;
+ ip6_address_t all_dhcpv6_server_relay_agent_address;
+
+ /* true if the relay should insert option 82 */
+ int insert_option;
+
+ /* to drop pkts in server-to-client direction */
+ u32 error_drop_node_index;
+
+ dhcpv6_vss_info *vss;
+
+ /* hash lookup specific vrf_id -> VSS vector index*/
+ uword *vss_index_by_vrf_id;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} dhcpv6_proxy_main_t;
+
+dhcpv6_proxy_main_t dhcpv6_proxy_main;
+
+int dhcpv6_proxy_set_server (ip6_address_t *addr, ip6_address_t *src_address,
+ u32 fib_id, int insert_vss, int is_del);
+
+int dhcpv6_proxy_set_vss(u32 tbl_id,
+ u32 oui,
+ u32 fib_id,
+ int is_del);
+
+#endif /* included_dhcpv6_proxy_h */
diff --git a/vnet/vnet/dhcpv6/proxy_error.def b/vnet/vnet/dhcpv6/proxy_error.def
new file mode 100644
index 00000000000..ffa1d68dda4
--- /dev/null
+++ b/vnet/vnet/dhcpv6/proxy_error.def
@@ -0,0 +1,27 @@
+/*
+ * dhcp_proxy_error.def: dhcp proxy errors
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+dhcpv6_proxy_error (NONE, "no error")
+dhcpv6_proxy_error (NO_SERVER, "no dhcpv6 server configured")
+dhcpv6_proxy_error (RELAY_TO_SERVER, "DHCPV6 packets relayed to the server")
+dhcpv6_proxy_error (RELAY_TO_CLIENT, "DHCPV6 packets relayed to clients")
+dhcpv6_proxy_error (NO_INTERFACE_ADDRESS, "DHCPV6 no interface address")
+dhcpv6_proxy_error (WRONG_MESSAGE_TYPE, "DHCPV6 wrong message type.")
+dhcpv6_proxy_error (NO_SRC_ADDRESS, "DHCPV6 no source IPv6 address configured.")
+dhcpv6_proxy_error (NO_CIRCUIT_ID_OPTION, "DHCPv6 reply packets without circuit ID option")
+dhcpv6_proxy_error (BAD_SVR_FIB_OR_ADDRESS, "DHCPv6 packets not from DHCPv6 server or server FIB.")
+dhcpv6_proxy_error (PKT_TOO_BIG, "DHCPv6 packets which are too big.")
diff --git a/vnet/vnet/dhcpv6/proxy_node.c b/vnet/vnet/dhcpv6/proxy_node.c
new file mode 100644
index 00000000000..e41fe9a1b3e
--- /dev/null
+++ b/vnet/vnet/dhcpv6/proxy_node.c
@@ -0,0 +1,1046 @@
+/*
+ * proxy_node.c: dhcpv6 proxy node processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/dhcpv6/proxy.h>
+
+static char * dhcpv6_proxy_error_strings[] = {
+#define dhcpv6_proxy_error(n,s) s,
+#include "proxy_error.def"
+#undef dhcpv6_proxy_error
+};
+
+#define foreach_dhcpv6_proxy_to_server_input_next \
+ _ (DROP, "error-drop") \
+ _ (LOOKUP, "ip6-lookup") \
+ _ (SEND_TO_CLIENT, "dhcpv6-proxy-to-client")
+
+
+typedef enum {
+#define _(s,n) DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_##s,
+ foreach_dhcpv6_proxy_to_server_input_next
+#undef _
+ DHCPV6_PROXY_TO_SERVER_INPUT_N_NEXT,
+} dhcpv6_proxy_to_server_input_next_t;
+
+typedef struct {
+ /* 0 => to server, 1 => to client */
+ int which;
+ u8 packet_data[64];
+ u32 error;
+ u32 sw_if_index;
+ u32 original_sw_if_index;
+} dhcpv6_proxy_trace_t;
+
+vlib_node_registration_t dhcpv6_proxy_to_server_node;
+vlib_node_registration_t dhcpv6_proxy_to_client_node;
+
+
+u8 * format_dhcpv6_proxy_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ dhcpv6_proxy_trace_t * t = va_arg (*args, dhcpv6_proxy_trace_t *);
+
+ if (t->which == 0)
+ s = format (s, "DHCPV6 proxy: sent to server %U",
+ format_ip6_address, &t->packet_data, sizeof (ip6_address_t));
+ else
+ s = format (s, "DHCPV6 proxy: sent to client from %U",
+ format_ip6_address, &t->packet_data, sizeof (ip6_address_t));
+ if (t->error != (u32)~0)
+ s = format (s, " error: %s\n", dhcpv6_proxy_error_strings[t->error]);
+
+ s = format (s, " original_sw_if_index: %d, sw_if_index: %d\n",
+ t->original_sw_if_index, t->sw_if_index);
+
+ return s;
+}
+
+u8 * format_dhcpv6_proxy_header_with_length (u8 * s, va_list * args)
+{
+ dhcpv6_header_t * h = va_arg (*args, dhcpv6_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "dhcpv6 header truncated");
+
+ s = format (s, "DHCPV6 Proxy");
+
+ return s;
+}
+/* get first interface address */
+static ip6_address_t *
+ip6_interface_first_global_or_site_address (ip6_main_t * im, u32 sw_if_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = 0;
+ ip6_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip6_address_t * a = ip_interface_address_get_address (lm, ia);
+ if ((a->as_u8[0] & 0xe0) == 0x20 ||
+ (a->as_u8[0] & 0xfe) == 0xfc) {
+ result = a;
+ break;
+ }
+ }));
+ return result;
+}
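
The two mask tests above encode the address classes a relay may use: (as_u8[0] & 0xe0) == 0x20 matches global unicast 2000::/3, and (as_u8[0] & 0xfe) == 0xfc matches unique-local fc00::/7; link-local fe80::/10 fails both, which is the point. A standalone check of the predicates on the first address byte:

    #include <stdio.h>

    static int
    is_global_or_site (unsigned char first_byte)
    {
      return (first_byte & 0xe0) == 0x20    /* 2000::/3 global unicast */
          || (first_byte & 0xfe) == 0xfc;   /* fc00::/7 unique local */
    }

    int
    main (void)
    {
      printf ("2001:db8::1 -> %d\n", is_global_or_site (0x20));  /* 1 */
      printf ("fd00::1     -> %d\n", is_global_or_site (0xfd));  /* 1 */
      printf ("fe80::1     -> %d\n", is_global_or_site (0xfe));  /* 0 */
      return 0;
    }
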
+
+/* get first interface address */
+static ip6_address_t *
+ip6_interface_first_address (ip6_main_t * im, u32 sw_if_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = 0;
+ ip6_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip6_address_t * a = ip_interface_address_get_address (lm, ia);
+ result = a;
+ break;
+ }));
+ return result;
+}
+
+static inline void copy_ip6_address (ip6_address_t *dst, ip6_address_t *src)
+{
+
+ dst->as_u64[0] = src->as_u64[0];
+ dst->as_u64[1] = src->as_u64[1];
+}
+
+static uword
+dhcpv6_proxy_to_server_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ dhcpv6_proxy_main_t * dpm = &dhcpv6_proxy_main;
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ u32 pkts_to_server=0, pkts_to_client=0;
+  u32 pkts_no_interface_address=0, pkts_exceeding_max_hop=0;
+ u32 pkts_no_src_address=0;
+ u32 pkts_wrong_msg_type=0;
+ u32 pkts_too_big=0;
+ ip6_main_t * im = &ip6_main;
+ u32 fib_index=0, fib_id=0;
+ ip6_fib_t * fib;
+ ip6_address_t * src;
+ int bogus_length;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vnet_main_t *vnm = vnet_get_main();
+ u32 sw_if_index = 0;
+ u32 original_sw_if_index = 0;
+ vnet_sw_interface_t *swif;
+ u32 bi0;
+ vlib_buffer_t * b0;
+ udp_header_t * u0, *u1;
+ dhcpv6_header_t * h0; // client msg hdr
+ ip6_header_t * ip0, *ip1;
+ ip6_address_t _ia0, *ia0=&_ia0;
+ u32 next0;
+ u32 error0 = (u32) ~0;
+ dhcpv6_option_t *fwd_opt;
+ dhcpv6_relay_hdr_t *r1;
+ u16 len;
+ dhcpv6_int_id_t *id1;
+ dhcpv6_vss_t *vss1;
+ dhcpv6_client_mac_t *cmac; // client mac
+ ethernet_header_t * e_h0;
+ u8 client_src_mac[6];
+ vlib_buffer_free_list_t *fl;
+
+ uword *p_vss;
+ u32 oui1=0;
+ u32 fib_id1;
+ dhcpv6_vss_info *vss;
+
+ fib_id1 = 0;
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = vlib_buffer_get_current (b0);
+ e_h0 = (ethernet_header_t *)b0->data;
+ memcpy(client_src_mac, e_h0->src_address, 6);
+ /* Send to DHCPV6 server via the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] =
+ dpm->server_fib_index;
+
+ /*
+ * udp_local hands us the DHCPV6 header.
+ */
+ u0 = (void *)h0 -(sizeof(*u0));
+ ip0 = (void *)u0 -(sizeof(*ip0));
+
+ switch (h0->u.msg_type) {
+ case DHCPV6_MSG_SOLICIT:
+ case DHCPV6_MSG_REQUEST:
+ case DHCPV6_MSG_CONFIRM:
+ case DHCPV6_MSG_RENEW:
+ case DHCPV6_MSG_REBIND:
+ case DHCPV6_MSG_RELEASE:
+ case DHCPV6_MSG_DECLINE:
+ case DHCPV6_MSG_INFORMATION_REQUEST:
+ case DHCPV6_MSG_RELAY_FORW:
+ /* send to server */
+ break;
+ case DHCPV6_MSG_RELAY_REPL:
+ /* send to client */
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_SEND_TO_CLIENT;
+ error0 = 0;
+ pkts_to_client++;
+ goto do_enqueue;
+ default:
+ /* drop the packet */
+ pkts_wrong_msg_type++;
+ error0 = DHCPV6_PROXY_ERROR_WRONG_MESSAGE_TYPE;
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ goto do_trace;
+
+ }
+ /* relay-option header pointer */
+ vlib_buffer_advance(b0, -(sizeof(*fwd_opt)));
+ fwd_opt = vlib_buffer_get_current(b0);
+ /* relay message header pointer */
+ vlib_buffer_advance(b0, -(sizeof(*r1)));
+ r1 = vlib_buffer_get_current(b0);
+
+ vlib_buffer_advance(b0, -(sizeof(*u1)));
+ u1 = vlib_buffer_get_current(b0);
+
+ vlib_buffer_advance(b0, -(sizeof(*ip1)));
+ ip1 = vlib_buffer_get_current(b0);
+
+ /* fill in all that rubbish... */
+ len = clib_net_to_host_u16(u0->length) - sizeof(udp_header_t);
+ copy_ip6_address(&r1->peer_addr, &ip0->src_address);
+
+ r1->msg_type = DHCPV6_MSG_RELAY_FORW;
+ fwd_opt->length = clib_host_to_net_u16(len);
+ fwd_opt->option = clib_host_to_net_u16(DHCPV6_OPTION_RELAY_MSG);
+
+          /* a fresh relay header starts at hop 0; a nested RELAY-FORW
+             continues from the incoming relay header's hop count */
+          r1->hop_count = (h0->u.msg_type == DHCPV6_MSG_RELAY_FORW) ?
+            (u8)(((dhcpv6_relay_hdr_t *) h0)->hop_count + 1) : 0;
+
+ if (PREDICT_FALSE(r1->hop_count >= HOP_COUNT_LIMIT))
+ {
+ error0 = DHCPV6_RELAY_PKT_DROP_MAX_HOPS;
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+              pkts_exceeding_max_hop++;
+ goto do_trace;
+ }
+
+
+ /* If relay-fwd and src address is site or global unicast address */
+ if (h0->u.msg_type == DHCPV6_MSG_RELAY_FORW &&
+ ((ip0->src_address.as_u8[0] & 0xe0) == 0x20 ||
+ (ip0->src_address.as_u8[0] & 0xfe) == 0xfc))
+ {
+ /* Set link address to zero */
+ r1->link_addr.as_u64[0] = 0;
+ r1->link_addr.as_u64[1] = 0;
+ goto link_address_set;
+ }
+
+ /* if receiving interface is unnumbered, use receiving interface
+ * IP address as link address, otherwise use the loopback interface
+ * IP address as link address.
+ */
+ original_sw_if_index = sw_if_index =
+ vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ swif = vnet_get_sw_interface (vnm, sw_if_index);
+ if (swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED)
+ sw_if_index = swif->unnumbered_sw_if_index;
+
+ ia0 = ip6_interface_first_global_or_site_address(&ip6_main, sw_if_index);
+ if (ia0 == 0)
+ {
+ error0 = DHCPV6_PROXY_ERROR_NO_INTERFACE_ADDRESS;
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ pkts_no_interface_address++;
+ goto do_trace;
+ }
+
+ copy_ip6_address(&r1->link_addr, ia0);
+
+ link_address_set:
+ fl = vlib_buffer_get_free_list (vm, b0->free_list_index);
+
+ if ((b0->current_length+sizeof(*id1)+sizeof(*vss1)+sizeof(*cmac))
+ > fl->n_data_bytes)
+ {
+ error0 = DHCPV6_PROXY_ERROR_PKT_TOO_BIG;
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ pkts_too_big++;
+ goto do_trace;
+ }
+
+ id1 = (dhcpv6_int_id_t *) (((uword) ip1) + b0->current_length);
+ b0->current_length += (sizeof (*id1));
+
+ fib_index = im->fib_index_by_sw_if_index
+ [vnet_buffer(b0)->sw_if_index[VLIB_RX]];
+ fib = vec_elt_at_index (im->fibs, fib_index);
+ fib_id = fib->table_id;
+
+ p_vss = hash_get (dpm->vss_index_by_vrf_id,
+ fib_id);
+ if (p_vss)
+ {
+ vss = pool_elt_at_index (dpm->vss, p_vss[0]);
+ oui1 = vss->vpn_id.oui;
+ fib_id1 = vss->vpn_id.fib_id;
+ }
+
+ id1->opt.option = clib_host_to_net_u16(DHCPV6_OPTION_INTERFACE_ID);
+ id1->opt.length = clib_host_to_net_u16(sizeof(original_sw_if_index));
+ id1->int_idx = clib_host_to_net_u32(original_sw_if_index);
+
+ u1->length =0;
+ if (h0->u.msg_type != DHCPV6_MSG_RELAY_FORW)
+ {
+ cmac = (dhcpv6_client_mac_t *) (((uword) ip1) + b0->current_length);
+ b0->current_length += (sizeof (*cmac));
+ cmac->opt.length =clib_host_to_net_u16(sizeof(*cmac) -
+ sizeof(cmac->opt));
+ cmac->opt.option = clib_host_to_net_u16(DHCPV6_OPTION_CLIENT_LINK_LAYER_ADDRESS);
+ cmac->link_type = clib_host_to_net_u16(1); // ethernet
+ memcpy(cmac->data, client_src_mac, 6);
+ u1->length += sizeof(*cmac);
+ }
+ if (oui1 || fib_id1) {
+ vss1 = (dhcpv6_vss_t *) (((uword) ip1) + b0->current_length);
+ b0->current_length += (sizeof (*vss1));
+ vss1->opt.length =clib_host_to_net_u16(sizeof(*vss1) -
+ sizeof(vss1->opt));
+ vss1->opt.option = clib_host_to_net_u16(DHCPV6_OPTION_VSS);
+ vss1->data[0] = 1; // type
+ vss1->data[1] = oui1>>16 & 0xff;
+ vss1->data[2] = oui1>>8 & 0xff;
+ vss1->data[3] = oui1 & 0xff;
+ vss1->data[4] = fib_id1>>24 & 0xff;
+ vss1->data[5] = fib_id1>>16 & 0xff;
+ vss1->data[6] = fib_id1>>8 & 0xff;
+ vss1->data[7] = fib_id1 & 0xff;
+ u1->length += sizeof(*vss1);
+ }
+
+ pkts_to_server++;
+ u1->checksum = 0;
+ u1->src_port = clib_host_to_net_u16(UDP_DST_PORT_dhcpv6_to_client);
+ u1->dst_port = clib_host_to_net_u16(UDP_DST_PORT_dhcpv6_to_server);
+
+ u1->length =
+ clib_host_to_net_u16( clib_net_to_host_u16(fwd_opt->length) +
+ sizeof(*r1) + sizeof(*fwd_opt) +
+ sizeof(*u1) + sizeof(*id1) + u1->length);
+
+ memset(ip1, 0, sizeof(*ip1));
+ ip1->ip_version_traffic_class_and_flow_label = 0x60;
+ ip1->payload_length = u1->length;
+ ip1->protocol = PROTO_UDP;
+ ip1->hop_limit = HOP_COUNT_LIMIT;
+ src = (dpm->dhcpv6_server.as_u64[0] || dpm->dhcpv6_server.as_u64[1]) ?
+ &dpm->dhcpv6_server : &dpm->all_dhcpv6_server_address;
+ copy_ip6_address(&ip1->dst_address, src);
+
+
+ ia0 = ip6_interface_first_global_or_site_address
+ (&ip6_main, vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+
+ src = (dpm->dhcpv6_src_address.as_u64[0] || dpm->dhcpv6_src_address.as_u64[1]) ?
+ &dpm->dhcpv6_src_address : ia0;
+ if (ia0 == 0)
+ {
+ error0 = DHCPV6_PROXY_ERROR_NO_SRC_ADDRESS;
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_DROP;
+ pkts_no_src_address++;
+ goto do_trace;
+ }
+
+ copy_ip6_address (&ip1->src_address, src);
+
+
+ u1->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b0, ip1,
+ &bogus_length);
+ ASSERT(bogus_length == 0);
+
+ next0 = DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_LOOKUP;
+
+ do_trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ dhcpv6_proxy_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->which = 0; /* to server */
+ tr->error = error0;
+ tr->original_sw_if_index = original_sw_if_index;
+ tr->sw_if_index = sw_if_index;
+ if (DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_LOOKUP == next0)
+ copy_ip6_address((ip6_address_t *)&tr->packet_data[0], &dpm->dhcpv6_server);
+ }
+
+ do_enqueue:
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_server_node.index,
+ DHCPV6_PROXY_ERROR_RELAY_TO_CLIENT,
+ pkts_to_client);
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_server_node.index,
+ DHCPV6_PROXY_ERROR_RELAY_TO_SERVER,
+ pkts_to_server);
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_server_node.index,
+ DHCPV6_PROXY_ERROR_NO_INTERFACE_ADDRESS,
+ pkts_no_interface_address);
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_server_node.index,
+ DHCPV6_PROXY_ERROR_WRONG_MESSAGE_TYPE,
+ pkts_wrong_msg_type);
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_server_node.index,
+ DHCPV6_PROXY_ERROR_NO_SRC_ADDRESS,
+ pkts_no_src_address);
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_server_node.index,
+ DHCPV6_PROXY_ERROR_PKT_TOO_BIG,
+ pkts_too_big);
+ return from_frame->n_vectors;
+}
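
The length bookkeeping in the relay-forward path above is dense: the rebuilt UDP datagram carries the relay header, a relay-message option whose payload is the original DHCPv6 message, the Interface-ID option, and whatever client-MAC or VSS options were accumulated in u1->length before the final assignment. A standalone sketch of that arithmetic, in host byte order for clarity (sizes taken from the packed structs in dhcpv6/packet.h; the payload value is illustrative):

    #include <stdio.h>

    #define UDP_HDR    8   /* sizeof (udp_header_t) */
    #define RELAY_HDR 34   /* sizeof (dhcpv6_relay_hdr_t): 1+1+16+16 */
    #define OPT_HDR    4   /* sizeof (dhcpv6_option_t) */
    #define INT_ID     8   /* sizeof (dhcpv6_int_id_t) */

    int
    main (void)
    {
      unsigned dhcpv6_payload = 96;  /* original message = relay-msg body */
      unsigned extra_opts = 0;       /* client-MAC / VSS options, if any */

      unsigned udp_len = dhcpv6_payload + OPT_HDR     /* relay-msg option */
                       + RELAY_HDR + UDP_HDR
                       + INT_ID + extra_opts;

      printf ("new udp length: %u\n", udp_len);       /* 150 */
      return 0;
    }
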
+
+VLIB_REGISTER_NODE (dhcpv6_proxy_to_server_node) = {
+ .function = dhcpv6_proxy_to_server_input,
+ .name = "dhcpv6-proxy-to-server",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = DHCPV6_PROXY_N_ERROR,
+ .error_strings = dhcpv6_proxy_error_strings,
+
+ .n_next_nodes = DHCPV6_PROXY_TO_SERVER_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [DHCPV6_PROXY_TO_SERVER_INPUT_NEXT_##s] = n,
+ foreach_dhcpv6_proxy_to_server_input_next
+#undef _
+ },
+
+ .format_buffer = format_dhcpv6_proxy_header_with_length,
+ .format_trace = format_dhcpv6_proxy_trace,
+#if 0
+ .unformat_buffer = unformat_dhcpv6_proxy_header,
+#endif
+};
+
+static uword
+dhcpv6_proxy_to_client_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+
+ u32 n_left_from, * from;
+ ethernet_main_t *em = ethernet_get_main (vm);
+ dhcpv6_proxy_main_t * dpm = &dhcpv6_proxy_main;
+ vnet_main_t * vnm = vnet_get_main();
+ int bogus_length;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ udp_header_t * u0, *u1=0;
+ dhcpv6_relay_hdr_t * h0;
+ ip6_header_t * ip1 = 0, *ip0;
+ ip6_address_t _ia0, * ia0 = &_ia0;
+ ip6_address_t client_address;
+ ethernet_interface_t *ei0;
+ ethernet_header_t *mac0;
+ vnet_hw_interface_t *hi0;
+ vlib_frame_t *f0;
+ u32 * to_next0;
+ u32 sw_if_index = ~0;
+ u32 original_sw_if_index = ~0;
+ vnet_sw_interface_t *si0;
+ u32 error0 = (u32)~0;
+ vnet_sw_interface_t *swif;
+ dhcpv6_option_t *r0, *o;
+ u16 len = 0;
+      u32 svr_fib_index;
+ ip6_main_t * im = &ip6_main;
+
+ bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ if (DHCPV6_MSG_RELAY_REPL != h0->msg_type)
+ {
+ error0 = DHCPV6_PROXY_ERROR_WRONG_MESSAGE_TYPE;
+
+ drop_packet:
+ vlib_node_increment_counter (vm, dhcpv6_proxy_to_client_node.index,
+ error0, 1);
+
+ f0 = vlib_get_frame_to_node (vm, dpm->error_drop_node_index);
+ to_next0 = vlib_frame_vector_args (f0);
+ to_next0[0] = bi0;
+ f0->n_vectors = 1;
+ vlib_put_frame_to_node (vm, dpm->error_drop_node_index, f0);
+ goto do_trace;
+ }
+      /* drop relay replies whose hop count exceeds the limit */
+ if (HOP_COUNT_LIMIT < h0->hop_count)
+ {
+ error0 = DHCPV6_RELAY_PKT_DROP_MAX_HOPS;
+ goto drop_packet;
+ }
+ u0 = (void *)h0 -(sizeof(*u0));
+ ip0 = (void *)u0 -(sizeof(*ip0));
+
+ vlib_buffer_advance (b0, sizeof(*h0));
+ o = r0 = vlib_buffer_get_current (b0);
+
+      /* Parse through TLVs looking for option 18 (DHCPV6_OPTION_INTERFACE_ID).
+         The interface-ID option carries the sw_if_index of the
+         client-facing interface */
+
+ while ((u8 *) o < (b0->data + b0->current_data + b0->current_length))
+ {
+ if (DHCPV6_OPTION_INTERFACE_ID == clib_net_to_host_u16(o->option))
+ {
+ if (clib_net_to_host_u16(o->length) == sizeof(sw_if_index))
+ sw_if_index = clib_net_to_host_u32(((dhcpv6_int_id_t*)o)->int_idx);
+ break;
+ }
+ o = (dhcpv6_option_t *) (((uword) o) + clib_net_to_host_u16(o->length) + sizeof(*o));
+ }
+
+ if ((u32)~0 == sw_if_index)
+ {
+ error0 = DHCPV6_PROXY_ERROR_NO_CIRCUIT_ID_OPTION;
+ goto drop_packet;
+ }
+
+      svr_fib_index = im->fib_index_by_sw_if_index
+        [vnet_buffer(b0)->sw_if_index[VLIB_RX]];
+
+      /* dpm->server_fib_index stores a FIB index (see
+         dhcpv6_proxy_set_server), so compare indices, not table ids */
+      if (svr_fib_index != dpm->server_fib_index ||
+ ip0->src_address.as_u64[0] != dpm->dhcpv6_server.as_u64[0] ||
+ ip0->src_address.as_u64[1] != dpm->dhcpv6_server.as_u64[1])
+ {
+ //drop packet if not from server with configured address or FIB
+ error0 = DHCPV6_PROXY_ERROR_BAD_SVR_FIB_OR_ADDRESS;
+ goto drop_packet;
+ }
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = original_sw_if_index
+ = sw_if_index;
+
+ swif = vnet_get_sw_interface (vnm, original_sw_if_index);
+ if (swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED)
+ sw_if_index = swif->unnumbered_sw_if_index;
+
+ vlib_buffer_advance (b0, sizeof(*r0));
+ /*
+ * udp_local hands us the DHCPV6 header, need udp hdr,
+ * ip hdr to relay to client
+ */
+ vlib_buffer_advance (b0, -(sizeof(*u1)));
+ u1 = vlib_buffer_get_current (b0);
+
+ vlib_buffer_advance (b0, -(sizeof(*ip1)));
+ ip1 = vlib_buffer_get_current (b0);
+
+ copy_ip6_address(&client_address, &h0->peer_addr);
+
+ ia0 = ip6_interface_first_address (&ip6_main, sw_if_index);
+ if (ia0 == 0)
+ {
+ error0 = DHCPV6_PROXY_ERROR_NO_INTERFACE_ADDRESS;
+ goto drop_packet;
+ }
+
+ len = clib_net_to_host_u16(r0->length);
+ memset(ip1, 0, sizeof(*ip1));
+ copy_ip6_address(&ip1->dst_address, &client_address);
+ u1->checksum = 0;
+ u1->src_port = clib_net_to_host_u16 (UDP_DST_PORT_dhcpv6_to_server);
+ u1->dst_port = clib_net_to_host_u16 (UDP_DST_PORT_dhcpv6_to_client);
+ u1->length = clib_host_to_net_u16 (len + sizeof(udp_header_t));
+
+ ip1->ip_version_traffic_class_and_flow_label =
+ ip0->ip_version_traffic_class_and_flow_label &
+ 0x00000fff;
+ ip1->payload_length = u1->length;
+ ip1->protocol = PROTO_UDP;
+ ip1->hop_limit = HOP_COUNT_LIMIT;
+ copy_ip6_address(&ip1->src_address, ia0);
+
+ u1->checksum = ip6_tcp_udp_icmp_compute_checksum(vm, b0, ip1,
+ &bogus_length);
+ ASSERT(bogus_length == 0);
+
+ vlib_buffer_advance (b0, -(sizeof(ethernet_header_t)));
+ si0 = vnet_get_sw_interface (vnm, original_sw_if_index);
+ if (si0->type == VNET_SW_INTERFACE_TYPE_SUB)
+ vlib_buffer_advance (b0, -4 /* space for VLAN tag */);
+
+ mac0 = vlib_buffer_get_current (b0);
+
+ hi0 = vnet_get_sup_hw_interface (vnm, original_sw_if_index);
+ ei0 = pool_elt_at_index (em->interfaces, hi0->hw_instance);
+ memcpy (mac0->src_address, ei0->address, sizeof (ei0->address));
+ memset (&mac0->dst_address, 0xff, sizeof (mac0->dst_address));
+ mac0->type = (si0->type == VNET_SW_INTERFACE_TYPE_SUB) ?
+ clib_net_to_host_u16(0x8100) : clib_net_to_host_u16 (0x86dd);
+
+ if (si0->type == VNET_SW_INTERFACE_TYPE_SUB)
+ {
+ u32 * vlan_tag = (u32 *)(mac0+1);
+ u32 tmp;
+          /* inner ethertype is IPv6 (0x86dd), not IPv4 */
+          tmp = (si0->sub.id << 16) | 0x86dd;
+ *vlan_tag = clib_host_to_net_u32 (tmp);
+ }
+
+ /* $$$ consider adding a dynamic next to the graph node, for performance */
+ f0 = vlib_get_frame_to_node (vm, hi0->output_node_index);
+ to_next0 = vlib_frame_vector_args (f0);
+ to_next0[0] = bi0;
+ f0->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi0->output_node_index, f0);
+
+ do_trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ dhcpv6_proxy_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->which = 1; /* to client */
+ if (ia0)
+ copy_ip6_address((ip6_address_t*)tr->packet_data, ia0);
+ tr->error = error0;
+ tr->original_sw_if_index = original_sw_if_index;
+ tr->sw_if_index = sw_if_index;
+ }
+ }
+ return from_frame->n_vectors;
+
+}
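
When the reply leaves through a sub-interface, the node writes the 802.1Q encapsulation by hand: the outer ethertype is 0x8100, and the following 32-bit word packs (vlan-id << 16) | inner-ethertype before the byte swap. Since the payload is IPv6, the inner type is 0x86dd. A standalone sketch of the tag layout:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint16_t vlan_id = 100;
      uint32_t tag = htonl (((uint32_t) vlan_id << 16) | 0x86dd);
      const uint8_t *b = (const uint8_t *) &tag;

      /* on the wire: 00 64 86 dd -> VLAN 100, inner ethertype IPv6 */
      printf ("%02x %02x %02x %02x\n", b[0], b[1], b[2], b[3]);
      return 0;
    }
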
+
+VLIB_REGISTER_NODE (dhcpv6_proxy_to_client_node) = {
+ .function = dhcpv6_proxy_to_client_input,
+ .name = "dhcpv6-proxy-to-client",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = DHCPV6_PROXY_N_ERROR,
+ .error_strings = dhcpv6_proxy_error_strings,
+ .format_buffer = format_dhcpv6_proxy_header_with_length,
+ .format_trace = format_dhcpv6_proxy_trace,
+#if 0
+ .unformat_buffer = unformat_dhcpv6_proxy_header,
+#endif
+};
+
+clib_error_t * dhcpv6_proxy_init (vlib_main_t * vm)
+{
+ dhcpv6_proxy_main_t * dm = &dhcpv6_proxy_main;
+ vlib_node_t * error_drop_node;
+
+ dm->vlib_main = vm;
+ dm->vnet_main = vnet_get_main();
+ error_drop_node = vlib_get_node_by_name (vm, (u8 *) "error-drop");
+ dm->error_drop_node_index = error_drop_node->index;
+
+  /* All_DHCP_Servers (ff05::1:3), RFC 3315 site-scoped multicast */
+ dm->all_dhcpv6_server_address.as_u64[0] = clib_host_to_net_u64 (0xFF05000000000000);
+ dm->all_dhcpv6_server_address.as_u64[1] = clib_host_to_net_u64 (0x00010003);
+
+  /* All_DHCP_Relay_Agents_and_Servers (ff02::1:2), RFC 3315 link-scoped multicast */
+ dm->all_dhcpv6_server_relay_agent_address.as_u64[0] = clib_host_to_net_u64 (0xFF02000000000000);
+ dm->all_dhcpv6_server_relay_agent_address.as_u64[1] = clib_host_to_net_u64 (0x00010002);
+
+ udp_register_dst_port (vm, UDP_DST_PORT_dhcpv6_to_client,
+ dhcpv6_proxy_to_client_node.index, 0 /* is_ip4 */);
+
+ udp_register_dst_port (vm, UDP_DST_PORT_dhcpv6_to_server,
+                         dhcpv6_proxy_to_server_node.index, 0 /* is_ip4 */);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (dhcpv6_proxy_init);
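
The two well-known RFC 3315 groups are assembled as big-endian 64-bit halves: ff05::1:3 is All_DHCP_Servers (site scope) and ff02::1:2 is All_DHCP_Relay_Agents_and_Servers (link scope). A standalone sketch confirming the byte layout of the first pair of constants (using Linux's htobe64 in place of clib_host_to_net_u64):

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      uint8_t a[16];
      uint64_t hi = htobe64 (0xFF05000000000000ULL);
      uint64_t lo = htobe64 (0x0000000000010003ULL);

      memcpy (a, &hi, 8);
      memcpy (a + 8, &lo, 8);

      /* prints ff05:0000:0000:0000:0000:0000:0001:0003 */
      for (int i = 0; i < 16; i += 2)
        printf ("%02x%02x%s", a[i], a[i + 1], i < 14 ? ":" : "\n");
      return 0;
    }
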
+
+int dhcpv6_proxy_set_server (ip6_address_t *addr, ip6_address_t *src_address,
+ u32 fib_id, int insert_vss, int is_del)
+{
+ dhcpv6_proxy_main_t * dm = &dhcpv6_proxy_main;
+ ip6_main_t * im = &ip6_main;
+ uword * p;
+
+
+ if (is_del)
+ {
+ dm->dhcpv6_server.as_u64[0] = 0;
+ dm->dhcpv6_server.as_u64[1] = 0;
+ dm->server_fib_index = 0;
+ dm->dhcpv6_src_address.as_u64[0] = 0;
+ dm->dhcpv6_src_address.as_u64[1] = 0;
+ dm->insert_option = 0;
+ return 0;
+ }
+
+ if (addr->as_u64[0] == 0 &&
+ addr->as_u64[1] == 0 )
+ return VNET_API_ERROR_INVALID_DST_ADDRESS;
+
+ if (src_address->as_u64[0] == 0 &&
+ src_address->as_u64[1] == 0)
+ return VNET_API_ERROR_INVALID_SRC_ADDRESS;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (p == 0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ copy_ip6_address(&dm->dhcpv6_server, addr);
+ dm->server_fib_index = p[0];
+ copy_ip6_address(&dm->dhcpv6_src_address, src_address);
+ dm->insert_option = insert_vss;
+ return 0;
+}
+
+static clib_error_t *
+dhcpv6_proxy_set_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_address_t addr, src_addr;
+ int set_server = 0, set_src_address = 0, add_opt = 0;
+ u32 fib_id = 0;
+ int is_del = 0;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "server %U",
+ unformat_ip6_address, &addr))
+ set_server = 1;
+ else if (unformat(input, "src-address %U",
+ unformat_ip6_address, &src_addr))
+ set_src_address =1;
+ else if (unformat (input, "fib-id %d", &fib_id))
+ ;
+ else if (unformat (input, "add-option")
+ || unformat (input, "insert-option"))
+ add_opt = 1;
+ else if (unformat (input, "delete") ||
+ unformat (input, "del"))
+ is_del = 1;
+ else
+ break;
+ }
+
+ if (is_del || (set_server && set_src_address))
+ {
+ int rv;
+
+ rv = dhcpv6_proxy_set_server (&addr, &src_addr, fib_id,
+ add_opt, is_del);
+ switch (rv)
+ {
+ case 0:
+ return 0;
+
+        case VNET_API_ERROR_INVALID_DST_ADDRESS:
+          return clib_error_return (0, "Invalid server address");
+
+        case VNET_API_ERROR_INVALID_SRC_ADDRESS:
+          return clib_error_return (0, "Invalid src address");
+
+        case VNET_API_ERROR_NO_SUCH_FIB:
+          return clib_error_return (0, "FIB id %d does not exist", fib_id);
+
+ default:
+ return clib_error_return (0, "BUG: rv %d", rv);
+ }
+ }
+ else
+ return clib_error_return (0, "parse error`%U'",
+ format_unformat_error, input);
+}
+
+VLIB_CLI_COMMAND (dhcpv6_proxy_set_command, static) = {
+ .path = "set dhcpv6 proxy",
+ .short_help = "set dhcpv6 proxy server <ipv6-addr> fib-id <fib-id> src-address <ipv6-addr>",
+ .function = dhcpv6_proxy_set_command_fn,
+};
+
+u8 * format_dhcpv6_proxy_server (u8 * s, va_list * args)
+{
+ dhcpv6_proxy_main_t * dm = va_arg (*args, dhcpv6_proxy_main_t *);
+ ip6_fib_t * f;
+ u32 fib_id = (u32)~0;
+
+ if (dm == 0)
+ {
+ s = format (s, "%=40s%=40s%=14s", "Server Address", "Source Address",
+ "Server FIB");
+ return s;
+ }
+
+ f = find_ip6_fib_by_table_index_or_id (&ip6_main, dm->server_fib_index,
+ IP6_ROUTE_FLAG_FIB_INDEX);
+ if (f)
+ fib_id = f->table_id;
+
+
+ s = format (s, "%=40U%=40U%=14u",
+ format_ip6_address, &dm->dhcpv6_server,
+ format_ip6_address, &dm->dhcpv6_src_address,
+ fib_id);
+ return s;
+}
+
+static clib_error_t *
+dhcpv6_proxy_show_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ dhcpv6_proxy_main_t * dm = &dhcpv6_proxy_main;
+
+ vlib_cli_output (vm, "%U", format_dhcpv6_proxy_server, 0 /* header line */);
+ vlib_cli_output (vm, "%U", format_dhcpv6_proxy_server, dm);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcpv6_proxy_show_command, static) = {
+ .path = "show dhcpv6 proxy",
+ .short_help = "Display dhcpv6 proxy info",
+ .function = dhcpv6_proxy_show_command_fn,
+};
+
+int dhcpv6_proxy_set_vss(u32 tbl_id,
+ u32 oui,
+ u32 fib_id,
+ int is_del)
+{
+ dhcpv6_proxy_main_t *dm = &dhcpv6_proxy_main;
+ u32 old_oui, old_fib_id;
+ uword *p;
+ dhcpv6_vss_info *v;
+
+ p = hash_get (dm->vss_index_by_vrf_id, tbl_id);
+
+ if (p) {
+ v = pool_elt_at_index (dm->vss, p[0]);
+ if (!v)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ old_oui = v->vpn_id.oui;
+ old_fib_id = v->vpn_id.fib_id;
+
+ if (is_del)
+ {
+ if (old_oui == oui &&
+ old_fib_id == fib_id )
+ {
+ pool_put(dm->vss, v);
+ hash_unset (dm->vss_index_by_vrf_id, tbl_id);
+ return 0;
+ }
+ else
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ }
+
+ pool_put(dm->vss, v);
+ hash_unset (dm->vss_index_by_vrf_id, tbl_id);
+ } else if (is_del)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ pool_get (dm->vss, v);
+ memset (v, ~0, sizeof (*v));
+ v->vpn_id.fib_id = fib_id;
+ v->vpn_id.oui = oui;
+ hash_set (dm->vss_index_by_vrf_id, tbl_id, v - dm->vss);
+
+ return 0;
+}
+
+
+static clib_error_t *
+dhcpv6_vss_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int is_del = 0, got_new_vss=0;
+ u32 oui=0;
+ u32 fib_id=0, tbl_id=~0;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "oui %d", &oui))
+ got_new_vss = 1;
+ else if (unformat (input, "vpn-id %d", &fib_id))
+ got_new_vss = 1;
+ else if (unformat (input, "table %d", &tbl_id))
+ got_new_vss = 1;
+ else if (unformat(input, "delete") || unformat(input, "del"))
+ is_del = 1;
+ else
+ break;
+ }
+
+  if (tbl_id == ~0)
+ return clib_error_return (0, "no table ID specified.");
+
+ if (is_del || got_new_vss)
+ {
+ int rv;
+
+ rv = dhcpv6_proxy_set_vss(tbl_id, oui, fib_id, is_del);
+ switch (rv)
+ {
+ case 0:
+ return 0;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "vss info (oui:%d, vpn-id:%d) not found in table %d.",
+ oui, fib_id, tbl_id);
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "vss for table %d not found in pool.",
+ tbl_id);
+
+ default:
+ return clib_error_return (0, "BUG: rv %d", rv);
+ }
+ }
+ else
+ return clib_error_return (0, "parse error`%U'",
+ format_unformat_error, input);
+
+}
+
+VLIB_CLI_COMMAND (dhcpv6_proxy_vss_command, static) = {
+ .path = "set dhcpv6 vss",
+ .short_help = "set dhcpv6 vss table <table-id> oui <oui> vpn-idx <vpn-idx>",
+ .function = dhcpv6_vss_command_fn,
+};
+
+static clib_error_t *
+dhcpv6_vss_show_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+
+{
+ dhcpv6_proxy_main_t * dm = &dhcpv6_proxy_main;
+ dhcpv6_vss_info *v;
+ u32 oui;
+ u32 fib_id;
+ u32 tbl_id;
+ uword index;
+
+ vlib_cli_output (vm, "%=6s%=6s%=12s","Table", "OUI", "VPN ID");
+ hash_foreach (tbl_id, index, dm->vss_index_by_vrf_id,
+ ({
+ v = pool_elt_at_index (dm->vss, index);
+ oui = v->vpn_id.oui;
+ fib_id = v->vpn_id.fib_id;
+ vlib_cli_output (vm, "%=6d%=6d%=12d",
+ tbl_id, oui, fib_id);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcpv6_proxy_vss_show_command, static) = {
+ .path = "show dhcpv6 vss",
+ .short_help = "show dhcpv6 VSS",
+ .function = dhcpv6_vss_show_command_fn,
+};
+
+static clib_error_t *
+dhcpv6_link_address_show_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+
+{
+ dhcpv6_proxy_main_t *dm = &dhcpv6_proxy_main;
+ vnet_main_t *vnm = vnet_get_main();
+ u32 sw_if_index0=0, sw_if_index;
+ ip6_address_t *ia0;
+ vnet_sw_interface_t *swif;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+
+ if (unformat(input, "%U",
+ unformat_vnet_sw_interface, dm->vnet_main, &sw_if_index0))
+ {
+ swif = vnet_get_sw_interface (vnm, sw_if_index0);
+ sw_if_index = (swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED) ?
+ swif->unnumbered_sw_if_index : sw_if_index0;
+ ia0 = ip6_interface_first_address(&ip6_main, sw_if_index);
+ if (ia0)
+ {
+ vlib_cli_output (vm, "%=20s%=48s", "interface", "link-address");
+
+ vlib_cli_output (vm, "%=20U%=48U",
+ format_vnet_sw_if_index_name, dm->vnet_main, sw_if_index0,
+ format_ip6_address, ia0);
+ } else
+ vlib_cli_output (vm, "%=34s%=20U", "No IPv6 address configured on",
+ format_vnet_sw_if_index_name, dm->vnet_main, sw_if_index);
+ } else
+ break;
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (dhcpv6_proxy_address_show_command, static) = {
+ .path = "show dhcpv6 link-address interface",
+ .short_help = "show dhcpv6 link-address interface <interface>",
+ .function = dhcpv6_link_address_show_command_fn,
+};
diff --git a/vnet/vnet/dpdk_replication.h b/vnet/vnet/dpdk_replication.h
new file mode 100644
index 00000000000..b25558f9b89
--- /dev/null
+++ b/vnet/vnet/dpdk_replication.h
@@ -0,0 +1,107 @@
+#ifndef __included_dpdk_replication_h__
+#define __included_dpdk_replication_h__
+#include <vnet/devices/dpdk/dpdk.h>
+
+/*
+ * vlib_dpdk_clone_buffer - clone a buffer
+ * for port mirroring, lawful intercept, etc.
+ * rte_pktmbuf_clone (...) requires that the forwarding path
+ * not touch any of the cloned data. The hope is that we'll
+ * figure out how to relax that restriction.
+ *
+ * For the moment, copy packet data.
+ */
+
+static inline vlib_buffer_t *
+vlib_dpdk_clone_buffer (vlib_main_t * vm, vlib_buffer_t * b)
+{
+ u32 new_buffers_needed = 1;
+ unsigned socket_id = rte_socket_id();
+ struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id];
+ struct rte_mbuf *rte_mbufs[5];
+ vlib_buffer_free_list_t * fl;
+ vlib_buffer_t * rv;
+ u8 * copy_src, * copy_dst;
+ vlib_buffer_t *src_buf, *dst_buf;
+
+ fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ if (PREDICT_FALSE(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ vlib_buffer_t *tmp = b;
+ int i;
+
+ while (tmp->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ new_buffers_needed ++;
+ tmp = vlib_get_buffer (vm, tmp->next_buffer);
+ }
+
+ /* Should never happen... */
+ if (PREDICT_FALSE(new_buffers_needed > ARRAY_LEN(rte_mbufs)))
+ {
+ clib_warning ("need %d buffers", new_buffers_needed);
+ return 0;
+ }
+
+ if (rte_mempool_get_bulk (rmp, (void **)rte_mbufs,
+ new_buffers_needed) < 0)
+ return 0;
+
+ src_buf = b;
+ rv = dst_buf = (vlib_buffer_t *)(rte_mbufs[0] + 1);
+ vlib_buffer_init_for_free_list (dst_buf, fl);
+ copy_src = b->data + src_buf->current_data;
+ copy_dst = dst_buf->data + src_buf->current_data;
+
+ for (i = 0; i < new_buffers_needed; i++)
+ {
+      memcpy (copy_dst, copy_src, src_buf->current_length);
+ dst_buf->current_data = src_buf->current_data;
+ dst_buf->current_length = src_buf->current_length;
+ dst_buf->flags = src_buf->flags;
+
+ if (i == 0)
+ {
+ dst_buf->total_length_not_including_first_buffer =
+ src_buf->total_length_not_including_first_buffer;
+ vnet_buffer(dst_buf)->sw_if_index[VLIB_RX] =
+ vnet_buffer(src_buf)->sw_if_index[VLIB_RX];
+ vnet_buffer(dst_buf)->sw_if_index[VLIB_TX] =
+ vnet_buffer(src_buf)->sw_if_index[VLIB_TX];
+ vnet_buffer(dst_buf)->l2 = vnet_buffer(b)->l2;
+ }
+
+      if (i < new_buffers_needed - 1)
+        {
+          vlib_buffer_t *prev_dst = dst_buf;
+
+          src_buf = vlib_get_buffer (vm, src_buf->next_buffer);
+          dst_buf = (vlib_buffer_t *)(rte_mbufs[i+1] + 1);
+          vlib_buffer_init_for_free_list (dst_buf, fl);
+          /* the flags copied above include VLIB_BUFFER_NEXT_PRESENT,
+             so link the clone chain explicitly */
+          prev_dst->next_buffer = vlib_get_buffer_index (vm, dst_buf);
+          copy_src = src_buf->data;
+          copy_dst = dst_buf->data;
+        }
+ }
+ return rv;
+ }
+
+ if (rte_mempool_get_bulk (rmp, (void **)rte_mbufs, 1) < 0)
+ return 0;
+
+ rv = (vlib_buffer_t *)(rte_mbufs[0] + 1);
+ vlib_buffer_init_for_free_list (rv, fl);
+
+ memcpy(rv->data + b->current_data, b->data + b->current_data,
+ b->current_length);
+ rv->current_data = b->current_data;
+ rv->current_length = b->current_length;
+ vnet_buffer(rv)->sw_if_index[VLIB_RX] =
+ vnet_buffer(b)->sw_if_index[VLIB_RX];
+ vnet_buffer(rv)->sw_if_index[VLIB_TX] =
+ vnet_buffer(b)->sw_if_index[VLIB_TX];
+ vnet_buffer(rv)->l2 = vnet_buffer(b)->l2;
+
+ return (rv);
+}
+
+
+#endif /* __included_dpdk_replication_h__ */
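
A typical caller clones a buffer and hands the copy to some other node while the original continues down the forwarding path. A minimal usage sketch under that assumption (mirror_node_index and the function name are illustrative, not part of this header):

    #include <vlib/vlib.h>
    #include <vnet/dpdk_replication.h>

    static void
    mirror_buffer (vlib_main_t * vm, vlib_buffer_t * b0, u32 mirror_node_index)
    {
      vlib_buffer_t * c0 = vlib_dpdk_clone_buffer (vm, b0);
      vlib_frame_t * f;
      u32 * to_next;

      if (c0 == 0)
        return;  /* allocation failed; the original is unaffected */

      /* hand the copy to the mirror node in a single-buffer frame */
      f = vlib_get_frame_to_node (vm, mirror_node_index);
      to_next = vlib_frame_vector_args (f);
      to_next[0] = vlib_get_buffer_index (vm, c0);
      f->n_vectors = 1;
      vlib_put_frame_to_node (vm, mirror_node_index, f);
    }
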
diff --git a/vnet/vnet/ethernet/arp.c b/vnet/vnet/ethernet/arp.c
new file mode 100644
index 00000000000..446c76a1e41
--- /dev/null
+++ b/vnet/vnet/ethernet/arp.c
@@ -0,0 +1,1853 @@
+/*
+ * ethernet/arp.c: IP v4 ARP node
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/arp_packet.h>
+#include <vnet/l2/l2_input.h>
+#include <vppinfra/mhash.h>
+
+typedef struct {
+ u32 sw_if_index;
+ u32 fib_index;
+ ip4_address_t ip4_address;
+} ethernet_arp_ip4_key_t;
+
+typedef struct {
+ ethernet_arp_ip4_key_t key;
+ u8 ethernet_address[6];
+
+ u16 flags;
+#define ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC (1 << 0)
+
+ u64 cpu_time_last_updated;
+} ethernet_arp_ip4_entry_t;
+
+typedef struct {
+ u32 lo_addr;
+ u32 hi_addr;
+ u32 fib_index;
+} ethernet_proxy_arp_t;
+
+typedef struct {
+ u32 next_index;
+ uword node_index;
+ uword type_opaque;
+ uword data;
+ /* Used for arp event notification only */
+ void * data_callback;
+ u32 pid;
+} pending_resolution_t;
+
+typedef struct {
+ /* Hash tables mapping name to opcode. */
+ uword * opcode_by_name;
+
+ /* lite beer "glean" adjacency handling */
+ uword * pending_resolutions_by_address;
+ pending_resolution_t * pending_resolutions;
+
+ /* Mac address change notification */
+ uword * mac_changes_by_address;
+ pending_resolution_t * mac_changes;
+
+ u32 * arp_input_next_index_by_hw_if_index;
+
+ ethernet_arp_ip4_entry_t * ip4_entry_pool;
+
+ mhash_t ip4_entry_by_key;
+
+ /* ARP attack mitigation */
+ u32 arp_delete_rotor;
+ u32 limit_arp_cache_size;
+
+ /* Proxy arp vector */
+ ethernet_proxy_arp_t * proxy_arps;
+} ethernet_arp_main_t;
+
+static ethernet_arp_main_t ethernet_arp_main;
+
+static u8 * format_ethernet_arp_hardware_type (u8 * s, va_list * va)
+{
+ ethernet_arp_hardware_type_t h = va_arg (*va, ethernet_arp_hardware_type_t);
+ char * t = 0;
+ switch (h)
+ {
+#define _(n,f) case n: t = #f; break;
+ foreach_ethernet_arp_hardware_type;
+#undef _
+
+ default:
+ return format (s, "unknown 0x%x", h);
+ }
+
+ return format (s, "%s", t);
+}
+
+static u8 * format_ethernet_arp_opcode (u8 * s, va_list * va)
+{
+ ethernet_arp_opcode_t o = va_arg (*va, ethernet_arp_opcode_t);
+ char * t = 0;
+ switch (o)
+ {
+#define _(f) case ETHERNET_ARP_OPCODE_##f: t = #f; break;
+ foreach_ethernet_arp_opcode;
+#undef _
+
+ default:
+ return format (s, "unknown 0x%x", o);
+ }
+
+ return format (s, "%s", t);
+}
+
+static uword
+unformat_ethernet_arp_opcode_host_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ int * result = va_arg (*args, int *);
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ int x, i;
+
+ /* Numeric opcode. */
+ if (unformat (input, "0x%x", &x)
+ || unformat (input, "%d", &x))
+ {
+ if (x >= (1 << 16))
+ return 0;
+ *result = x;
+ return 1;
+ }
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ am->opcode_by_name, &i))
+ {
+ *result = i;
+ return 1;
+ }
+
+ return 0;
+}
+
+static uword
+unformat_ethernet_arp_opcode_net_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ int * result = va_arg (*args, int *);
+ if (! unformat_user (input, unformat_ethernet_arp_opcode_host_byte_order, result))
+ return 0;
+
+ *result = clib_host_to_net_u16 ((u16) *result);
+ return 1;
+}
+
+static u8 * format_ethernet_arp_header (u8 * s, va_list * va)
+{
+ ethernet_arp_header_t * a = va_arg (*va, ethernet_arp_header_t *);
+ u32 max_header_bytes = va_arg (*va, u32);
+ uword indent;
+ u16 l2_type, l3_type;
+
+ if (max_header_bytes != 0 && sizeof (a[0]) > max_header_bytes)
+ return format (s, "ARP header truncated");
+
+ l2_type = clib_net_to_host_u16 (a->l2_type);
+ l3_type = clib_net_to_host_u16 (a->l3_type);
+
+ indent = format_get_indent (s);
+
+ s = format (s, "%U, type %U/%U, address size %d/%d",
+ format_ethernet_arp_opcode, clib_net_to_host_u16 (a->opcode),
+ format_ethernet_arp_hardware_type, l2_type,
+ format_ethernet_type, l3_type,
+ a->n_l2_address_bytes, a->n_l3_address_bytes);
+
+ if (l2_type == ETHERNET_ARP_HARDWARE_TYPE_ethernet
+ && l3_type == ETHERNET_TYPE_IP4)
+ {
+ s = format (s, "\n%U%U/%U -> %U/%U",
+ format_white_space, indent,
+ format_ethernet_address, a->ip4_over_ethernet[0].ethernet,
+ format_ip4_address, &a->ip4_over_ethernet[0].ip4,
+ format_ethernet_address, a->ip4_over_ethernet[1].ethernet,
+ format_ip4_address, &a->ip4_over_ethernet[1].ip4);
+ }
+ else
+ {
+ uword n2 = a->n_l2_address_bytes;
+ uword n3 = a->n_l3_address_bytes;
+ s = format (s, "\n%U%U/%U -> %U/%U",
+ format_white_space, indent,
+ format_hex_bytes, a->data + 0*n2 + 0*n3, n2,
+ format_hex_bytes, a->data + 1*n2 + 0*n3, n3,
+ format_hex_bytes, a->data + 1*n2 + 1*n3, n2,
+ format_hex_bytes, a->data + 2*n2 + 1*n3, n3);
+ }
+
+ return s;
+}
+
+static u8 * format_ethernet_arp_ip4_entry (u8 * s, va_list * va)
+{
+ vnet_main_t * vnm = va_arg (*va, vnet_main_t *);
+ ethernet_arp_ip4_entry_t * e = va_arg (*va, ethernet_arp_ip4_entry_t *);
+ vnet_sw_interface_t * si;
+ ip4_fib_t * fib;
+
+ if (! e)
+ return format (s, "%=12s%=6s%=16s%=4s%=20s%=24s", "Time", "FIB", "IP4",
+ "Static", "Ethernet", "Interface");
+
+ fib = find_ip4_fib_by_table_index_or_id (&ip4_main, e->key.fib_index,
+ IP4_ROUTE_FLAG_FIB_INDEX);
+ si = vnet_get_sw_interface (vnm, e->key.sw_if_index);
+ s = format (s, "%=12U%=6u%=16U%=4s%=20U%=24U",
+ format_vlib_cpu_time, vnm->vlib_main, e->cpu_time_last_updated,
+ fib->table_id,
+ format_ip4_address, &e->key.ip4_address,
+ (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC) ? "S" : "",
+ format_ethernet_address, e->ethernet_address,
+ format_vnet_sw_interface_name, vnm, si);
+
+ return s;
+}
+
+typedef struct {
+ u8 packet_data[64];
+} ethernet_arp_input_trace_t;
+
+static u8 * format_ethernet_arp_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ethernet_arp_input_trace_t * t = va_arg (*va, ethernet_arp_input_trace_t *);
+
+ s = format (s, "%U",
+ format_ethernet_arp_header,
+ t->packet_data, sizeof (t->packet_data));
+
+ return s;
+}
+
+clib_error_t *
+ethernet_arp_sw_interface_up_down (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 flags)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ethernet_arp_ip4_entry_t * e;
+
+ if (! (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+ {
+ u32 i, * to_delete = 0;
+
+ pool_foreach (e, am->ip4_entry_pool, ({
+ if (e->key.sw_if_index == sw_if_index)
+ vec_add1 (to_delete, e - am->ip4_entry_pool);
+ }));
+
+ for (i = 0; i < vec_len (to_delete); i++)
+ {
+ ethernet_arp_ip4_over_ethernet_address_t delme;
+ e = pool_elt_at_index (am->ip4_entry_pool, to_delete[i]);
+
+ memcpy (&delme.ethernet, e->ethernet_address, 6);
+ delme.ip4.as_u32 = e->key.ip4_address.as_u32;
+
+ vnet_arp_unset_ip4_over_ethernet (vnm, e->key.sw_if_index,
+ e->key.fib_index, &delme);
+ }
+
+ vec_free (to_delete);
+ }
+
+ return 0;
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ethernet_arp_sw_interface_up_down);
+
+static int
+vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 fib_index,
+ void * a_arg,
+ int is_static);
+
+static int
+vnet_arp_unset_ip4_over_ethernet_internal (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 fib_index,
+ void * a_arg);
+
+typedef struct {
+ u32 sw_if_index;
+ u32 fib_index;
+ ethernet_arp_ip4_over_ethernet_address_t a;
+ int is_static;
+ int is_remove; /* set is_remove=1 to clear arp entry */
+} vnet_arp_set_ip4_over_ethernet_rpc_args_t;
+
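+/* ARP cache adds/deletes mutate shared state, so they are RPC'd to the
+   main thread; the ASSERT below catches direct calls from worker threads. */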
+static void set_ip4_over_ethernet_rpc_callback
+( vnet_arp_set_ip4_over_ethernet_rpc_args_t * a)
+{
+ vnet_main_t * vm = vnet_get_main();
+ ASSERT(os_get_cpu_number() == 0);
+
+ if (a->is_remove)
+ vnet_arp_unset_ip4_over_ethernet_internal(vm,
+ a->sw_if_index,
+ a->fib_index,
+ &(a->a));
+ else
+ vnet_arp_set_ip4_over_ethernet_internal (vm,
+ a->sw_if_index,
+ a->fib_index,
+ &(a->a),
+ a->is_static);
+}
+
+int
+vnet_arp_set_ip4_over_ethernet (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 fib_index,
+ void * a_arg,
+ int is_static)
+{
+ ethernet_arp_ip4_over_ethernet_address_t * a = a_arg;
+ vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+
+ args.sw_if_index = sw_if_index;
+ args.fib_index = fib_index;
+ args.is_static = is_static;
+ args.is_remove = 0;
+ memcpy (&args.a, a, sizeof (*a));
+
+ vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
+ (u8 *) &args, sizeof (args));
+ return 0;
+}
+
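+/* Main-thread worker for adds: find or create the cache entry, (re)install
+   the /32 neighbor route, then signal any pending-resolution or MAC-change
+   clients waiting on this address. */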
+int
+vnet_arp_set_ip4_over_ethernet_internal (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 fib_index,
+ void * a_arg,
+ int is_static)
+{
+ ethernet_arp_ip4_key_t k;
+ ethernet_arp_ip4_entry_t * e = 0;
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ethernet_arp_ip4_over_ethernet_address_t * a = a_arg;
+ vlib_main_t * vm = vlib_get_main();
+ ip4_main_t * im = &ip4_main;
+ int make_new_arp_cache_entry=1;
+ uword * p;
+ ip4_add_del_route_args_t args;
+ ip_adjacency_t adj;
+ pending_resolution_t * pr, * mc;
+
+ u32 next_index;
+
+ fib_index = (fib_index != (u32)~0)
+ ? fib_index : im->fib_index_by_sw_if_index[sw_if_index];
+
+ k.sw_if_index = sw_if_index;
+ k.ip4_address = a->ip4;
+ k.fib_index = fib_index;
+
+ p = mhash_get (&am->ip4_entry_by_key, &k);
+ if (p)
+ {
+ e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
+
+ /* Refuse to over-write static arp. */
+ if (e->flags & ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC)
+ return -2;
+ make_new_arp_cache_entry = 0;
+ }
+
+ /* Note: always install the route. It might have been deleted */
+ memset(&adj, 0, sizeof(adj));
+ adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_IP4,
+ sw_if_index,
+ ip4_rewrite_node.index,
+ a->ethernet, /* destination address */
+ &adj.rewrite_header,
+ sizeof (adj.rewrite_data));
+
+ args.table_index_or_table_id = fib_index;
+ args.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_ADD | IP4_ROUTE_FLAG_NEIGHBOR;
+ args.dst_address = a->ip4;
+ args.dst_address_length = 32;
+ args.adj_index = ~0;
+ args.add_adj = &adj;
+ args.n_add_adj = 1;
+
+ ip4_add_del_route (im, &args);
+ if (make_new_arp_cache_entry)
+ {
+ pool_get (am->ip4_entry_pool, e);
+ mhash_set (&am->ip4_entry_by_key, &k,
+ e - am->ip4_entry_pool,
+ /* old value */ 0);
+ e->key = k;
+ }
+
+ /* Update time stamp and ethernet address. */
+ memcpy (e->ethernet_address, a->ethernet, sizeof (e->ethernet_address));
+ e->cpu_time_last_updated = clib_cpu_time_now ();
+ if (is_static)
+ e->flags |= ETHERNET_ARP_IP4_ENTRY_FLAG_STATIC;
+
+ /* Customer(s) waiting for this address to be resolved? */
+ p = hash_get (am->pending_resolutions_by_address, a->ip4.as_u32);
+ if (p)
+ {
+ next_index = p[0];
+
+ while (next_index != (u32)~0)
+ {
+ pr = pool_elt_at_index (am->pending_resolutions, next_index);
+ vlib_process_signal_event (vm, pr->node_index,
+ pr->type_opaque,
+ pr->data);
+ next_index = pr->next_index;
+ pool_put (am->pending_resolutions, pr);
+ }
+
+ hash_unset (am->pending_resolutions_by_address, a->ip4.as_u32);
+ }
+
+ /* Customer(s) requesting ARP event for this address? */
+ p = hash_get (am->mac_changes_by_address, a->ip4.as_u32);
+ if (p)
+ {
+ next_index = p[0];
+
+ while (next_index != (u32)~0)
+ {
+ int (*fp)(u32, u8 *, u32, u32);
+ int rv = 1;
+ mc = pool_elt_at_index (am->mac_changes, next_index);
+ fp = mc->data_callback;
+
+ /* Call the user's data callback, return 1 to suppress dup events */
+ if (fp)
+ rv = (*fp)(mc->data, a->ethernet, sw_if_index, 0);
+
+ /*
+ * Signal the resolver process, as long as the user
+ * says they want to be notified
+ */
+ if (rv == 0)
+ vlib_process_signal_event (vm, mc->node_index,
+ mc->type_opaque,
+ mc->data);
+ next_index = mc->next_index;
+ }
+ }
+
+ return 0;
+}
+
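+/* Register a one-shot resolution event: when the given address is resolved,
+   (type_opaque, data) is signalled to the given process node. */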
+void vnet_register_ip4_arp_resolution_event (vnet_main_t * vnm,
+ void * address_arg,
+ uword node_index,
+ uword type_opaque,
+ uword data)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ip4_address_t * address = address_arg;
+ uword * p;
+ pending_resolution_t * pr;
+
+ pool_get (am->pending_resolutions, pr);
+
+ pr->next_index = ~0;
+ pr->node_index = node_index;
+ pr->type_opaque = type_opaque;
+ pr->data = data;
+ pr->data_callback = 0;
+
+ p = hash_get (am->pending_resolutions_by_address, address->as_u32);
+ if (p)
+ {
+ /* Insert new resolution at the head of the list */
+ pr->next_index = p[0];
+ hash_unset (am->pending_resolutions_by_address, address->as_u32);
+ }
+
+ hash_set (am->pending_resolutions_by_address, address->as_u32,
+ pr - am->pending_resolutions);
+}
+
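+/* Add or remove a persistent MAC-change subscription for an address.
+   Entries form a singly-linked list headed in mac_changes_by_address. */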
+int vnet_add_del_ip4_arp_change_event (vnet_main_t * vnm,
+ void * data_callback,
+ u32 pid,
+ void * address_arg,
+ uword node_index,
+ uword type_opaque,
+ uword data, int is_add)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ip4_address_t * address = address_arg;
+ uword * p;
+ pending_resolution_t * mc;
+ void (*fp)(u32, u8 *) = data_callback;
+
+ if (is_add)
+ {
+ pool_get (am->mac_changes, mc);
+
+ mc->next_index = ~0;
+ mc->node_index = node_index;
+ mc->type_opaque = type_opaque;
+ mc->data = data;
+ mc->data_callback = data_callback;
+ mc->pid = pid;
+
+ p = hash_get (am->mac_changes_by_address, address->as_u32);
+ if (p)
+ {
+ /* Insert new resolution at the head of the list */
+ mc->next_index = p[0];
+ hash_unset (am->mac_changes_by_address, address->as_u32);
+ }
+
+ hash_set (am->mac_changes_by_address, address->as_u32,
+ mc - am->mac_changes);
+ return 0;
+ }
+ else
+ {
+ u32 index;
+ pending_resolution_t * mc_last = 0;
+
+ p = hash_get (am->mac_changes_by_address, address->as_u32);
+ if (p == 0)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ index = p[0];
+
+ while (index != (u32)~0)
+ {
+ mc = pool_elt_at_index (am->mac_changes, index);
+ if (mc->node_index == node_index &&
+ mc->type_opaque == type_opaque &&
+ mc->pid == pid)
+ {
+ /* Clients may need to clean up pool entries, too */
+ if (fp)
+ (*fp)(mc->data, 0 /* no new mac addrs */);
+ if (index == p[0])
+ {
+ hash_unset (am->mac_changes_by_address, address->as_u32);
+ if (mc->next_index != ~0)
+ hash_set (am->mac_changes_by_address, address->as_u32,
+ mc->next_index);
+ pool_put (am->mac_changes, mc);
+ return 0;
+ }
+ else
+ {
+ ASSERT(mc_last);
+ mc_last->next_index = mc->next_index;
+ pool_put (am->mac_changes, mc);
+ return 0;
+ }
+ }
+ mc_last = mc;
+ index = mc->next_index;
+ }
+
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ }
+}
+
+/* Either we drop the packet or we send a reply to the sender. */
+typedef enum {
+ ARP_INPUT_NEXT_DROP,
+ ARP_INPUT_N_NEXT,
+} arp_input_next_t;
+
+#define foreach_ethernet_arp_error \
+ _ (replies_sent, "ARP replies sent") \
+ _ (l2_type_not_ethernet, "L2 type not ethernet") \
+ _ (l3_type_not_ip4, "L3 type not IP4") \
+ _ (l3_src_address_not_local, "IP4 source address not local to subnet") \
+ _ (l3_dst_address_not_local, "IP4 destination address not local to subnet") \
+ _ (l3_src_address_is_local, "IP4 source address matches local interface") \
+ _ (l3_src_address_learned, "ARP request IP4 source address learned") \
+ _ (replies_received, "ARP replies received") \
+ _ (opcode_not_request, "ARP opcode not request") \
+ _ (proxy_arp_replies_sent, "Proxy ARP replies sent") \
+ _ (l2_address_mismatch, "ARP hw addr does not match L2 frame src addr") \
+ _ (missing_interface_address, "ARP missing interface address") \
+ _ (gratuitous_arp, "ARP probe or announcement dropped") \
+
+typedef enum {
+#define _(sym,string) ETHERNET_ARP_ERROR_##sym,
+ foreach_ethernet_arp_error
+#undef _
+ ETHERNET_ARP_N_ERROR,
+} ethernet_arp_input_error_t;
+
+/* get first interface address */
+ip4_address_t *
+ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index,
+ ip_interface_address_t ** result_ia)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = 0;
+ ip4_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+ result = a;
+ break;
+ }));
+ if (result_ia)
+ *result_ia = result ? ia : 0;
+ return result;
+}
+
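+/* Evict one cache entry chosen by a round-robin rotor; used when the
+   cache has grown to limit_arp_cache_size. */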
+static void unset_random_arp_entry (void)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ethernet_arp_ip4_entry_t * e;
+ vnet_main_t * vnm = vnet_get_main();
+ ethernet_arp_ip4_over_ethernet_address_t delme;
+ u32 index;
+
+ index = pool_next_index (am->ip4_entry_pool, am->arp_delete_rotor);
+ am->arp_delete_rotor = index;
+
+ /* Try again from elt 0, could happen if an intfc goes down */
+ if (index == ~0)
+ {
+ index = pool_next_index (am->ip4_entry_pool, am->arp_delete_rotor);
+ am->arp_delete_rotor = index;
+ }
+
+ /* Nothing left in the pool */
+ if (index == ~0)
+ return;
+
+ e = pool_elt_at_index (am->ip4_entry_pool, index);
+
+ memcpy (&delme.ethernet, e->ethernet_address, 6);
+ delme.ip4.as_u32 = e->key.ip4_address.as_u32;
+
+ vnet_arp_unset_ip4_over_ethernet (vnm, e->key.sw_if_index,
+ e->key.fib_index, &delme);
+}
+
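+/* Replicate an ARP reply onto every interface unnumbered to the interface
+   that owns the matching address. Replicas are transmitted directly; the
+   original buffer is returned to the regular output path. */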
+static u32 arp_unnumbered (vlib_buffer_t * p0,
+ u32 pi0,
+ ethernet_header_t * eth0,
+ ip_interface_address_t * ifa0)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ vlib_main_t * vm = vlib_get_main();
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * vim = &vnm->interface_main;
+ vnet_sw_interface_t * si;
+ vnet_hw_interface_t * hi;
+ u32 unnum_src_sw_if_index;
+ u32 * broadcast_swifs = 0;
+ u32 * buffers = 0;
+ u32 n_alloc = 0;
+ vlib_buffer_t * b0;
+ int i;
+ u8 dst_mac_address[6];
+ i16 header_size;
+ ethernet_arp_header_t * arp0;
+
+ /* Save the dst mac address */
+ memcpy(dst_mac_address, eth0->dst_address, sizeof (dst_mac_address));
+
+ /* Figure out which sw_if_index supplied the address */
+ unnum_src_sw_if_index = ifa0->sw_if_index;
+
+ /* Track down all users of the unnumbered source */
+ pool_foreach (si, vim->sw_interfaces,
+ ({
+ if (si->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED &&
+ (si->unnumbered_sw_if_index == unnum_src_sw_if_index))
+ {
+ vec_add1 (broadcast_swifs, si->sw_if_index);
+ }
+ }));
+
+
+ ASSERT (vec_len(broadcast_swifs));
+
+ /* Allocate buffering if we need it */
+ if (vec_len(broadcast_swifs) > 1)
+ {
+ vec_validate (buffers, vec_len(broadcast_swifs)-2);
+ n_alloc = vlib_buffer_alloc (vm, buffers, vec_len(buffers));
+ _vec_len (buffers) = n_alloc;
+ for (i = 0; i < n_alloc; i++)
+ {
+ b0 = vlib_get_buffer (vm, buffers[i]);
+
+ /* xerox (partially built) ARP pkt */
+ memcpy (b0->data, p0->data, p0->current_length + p0->current_data);
+ b0->current_data = p0->current_data;
+ b0->current_length = p0->current_length;
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] =
+ vnet_buffer(p0)->sw_if_index[VLIB_RX];
+ }
+ }
+
+ vec_insert (buffers, 1, 0);
+ buffers[0] = pi0;
+
+ for (i = 0; i < vec_len(buffers); i++)
+ {
+ b0 = vlib_get_buffer(vm, buffers[i]);
+ arp0 = vlib_buffer_get_current (b0);
+
+ hi = vnet_get_sup_hw_interface (vnm, broadcast_swifs[i]);
+ si = vnet_get_sw_interface (vnm, broadcast_swifs[i]);
+
+ /* For decoration, most likely */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = hi->sw_if_index;
+
+ /* Fix ARP pkt src address */
+ memcpy (arp0->ip4_over_ethernet[0].ethernet, hi->hw_address, 6);
+
+ /* Build L2 encaps for this swif */
+ header_size = sizeof (ethernet_header_t);
+ if (si->sub.eth.flags.one_tag)
+ header_size += 4;
+ else if (si->sub.eth.flags.two_tags)
+ header_size += 8;
+
+ vlib_buffer_advance (b0, -header_size);
+ eth0 = vlib_buffer_get_current (b0);
+
+ if (si->sub.eth.flags.one_tag) {
+ ethernet_vlan_header_t * outer = (void *) (eth0 + 1);
+
+ eth0->type = si->sub.eth.flags.dot1ad ?
+ clib_host_to_net_u16 (ETHERNET_TYPE_DOT1AD) :
+ clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ outer->priority_cfi_and_id =
+ clib_host_to_net_u16 (si->sub.eth.outer_vlan_id);
+ outer->type = clib_host_to_net_u16 (ETHERNET_TYPE_ARP);
+
+ } else if (si->sub.eth.flags.two_tags) {
+ ethernet_vlan_header_t * outer = (void *) (eth0 + 1);
+ ethernet_vlan_header_t * inner = (void *) (outer + 1);
+
+ eth0->type = si->sub.eth.flags.dot1ad ?
+ clib_host_to_net_u16 (ETHERNET_TYPE_DOT1AD) :
+ clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ outer->priority_cfi_and_id =
+ clib_host_to_net_u16 (si->sub.eth.outer_vlan_id);
+ outer->type = clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ inner->priority_cfi_and_id =
+ clib_host_to_net_u16 (si->sub.eth.inner_vlan_id);
+ inner->type = clib_host_to_net_u16 (ETHERNET_TYPE_ARP);
+
+ } else {
+ eth0->type = clib_host_to_net_u16 (ETHERNET_TYPE_ARP);
+ }
+
+ /* Restore the original dst address, set src address */
+ memcpy (eth0->dst_address, dst_mac_address, sizeof (eth0->dst_address));
+ memcpy (eth0->src_address, hi->hw_address, sizeof (eth0->src_address));
+
+ /* Transmit replicas */
+ if (i > 0)
+ {
+ vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
+ u32 * to_next = vlib_frame_vector_args (f);
+ to_next[0] = buffers[i];
+ f->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi->output_node_index, f);
+ }
+ }
+
+ hi = vnet_get_sup_hw_interface (vnm, broadcast_swifs[0]);
+
+ vec_free (broadcast_swifs);
+ vec_free (buffers);
+
+ /* The regular path outputs the original pkt.. */
+ return vec_elt (am->arp_input_next_index_by_hw_if_index, hi->hw_if_index);
+}
+
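+/* ARP input node: validate each packet, learn the sender's mapping, and
+   either send a reply, attempt proxy ARP, or drop. */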
+static uword
+arp_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im4 = &ip4_main;
+ u32 n_left_from, next_index, * from, * to_next;
+ u32 n_replies_sent = 0, n_proxy_arp_replies_sent = 0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (ethernet_arp_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ vnet_hw_interface_t * hw_if0;
+ ethernet_arp_header_t * arp0;
+ ethernet_header_t * eth0;
+ ip_interface_address_t * ifa0;
+ ip_adjacency_t * adj0;
+ ip4_address_t * if_addr0;
+ ip4_address_t proxy_src;
+ u32 pi0, error0, next0, sw_if_index0;
+ u8 is_request0, src_is_local0, dst_is_local0, is_unnum0;
+ ethernet_proxy_arp_t * pa;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ arp0 = vlib_buffer_get_current (p0);
+
+ is_request0 = arp0->opcode
+ == clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_request);
+
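+ /* replies_sent is enum value 0, so it doubles as the "no error" default. */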
+ error0 = ETHERNET_ARP_ERROR_replies_sent;
+
+ error0 = (arp0->l2_type != clib_net_to_host_u16 (ETHERNET_ARP_HARDWARE_TYPE_ethernet)
+ ? ETHERNET_ARP_ERROR_l2_type_not_ethernet
+ : error0);
+ error0 = (arp0->l3_type != clib_net_to_host_u16 (ETHERNET_TYPE_IP4)
+ ? ETHERNET_ARP_ERROR_l3_type_not_ip4
+ : error0);
+
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+
+ if (error0)
+ goto drop1;
+
+ /* Check that IP address is local and matches incoming interface. */
+ if_addr0 = ip4_interface_address_matching_destination (im4,
+ &arp0->ip4_over_ethernet[1].ip4,
+ sw_if_index0,
+ &ifa0);
+ if (! if_addr0)
+ {
+ error0 = ETHERNET_ARP_ERROR_l3_dst_address_not_local;
+ goto drop1;
+ }
+
+ /* Honor unnumbered interface, if any */
+ is_unnum0 = sw_if_index0 != ifa0->sw_if_index;
+
+ /* Source must also be local to subnet of matching interface address. */
+ if (! ip4_destination_matches_interface (im4, &arp0->ip4_over_ethernet[0].ip4, ifa0))
+ {
+ error0 = ETHERNET_ARP_ERROR_l3_src_address_not_local;
+ goto drop1;
+ }
+
+ /* Reject requests/replies with our local interface address. */
+ src_is_local0 = if_addr0->as_u32 == arp0->ip4_over_ethernet[0].ip4.as_u32;
+ if (src_is_local0)
+ {
+ error0 = ETHERNET_ARP_ERROR_l3_src_address_is_local;
+ goto drop1;
+ }
+
+ dst_is_local0 = if_addr0->as_u32 == arp0->ip4_over_ethernet[1].ip4.as_u32;
+
+ /* Fill in ethernet header. */
+ eth0 = ethernet_buffer_get_header (p0);
+
+ /* Trash ARP packets whose ARP-level source addresses do not
+ match their L2-frame-level source addresses */
+ if (memcmp (eth0->src_address, arp0->ip4_over_ethernet[0].ethernet,
+ sizeof (eth0->src_address)))
+ {
+ error0 = ETHERNET_ARP_ERROR_l2_address_mismatch;
+ goto drop2;
+ }
+
+ /* Learn or update sender's mapping only for requests or unicasts
+ that don't match local interface address. */
+ if (ethernet_address_cast (eth0->dst_address) == ETHERNET_ADDRESS_UNICAST
+ || is_request0)
+ {
+ if (am->limit_arp_cache_size &&
+ pool_elts (am->ip4_entry_pool) >= am->limit_arp_cache_size)
+ unset_random_arp_entry();
+
+ vnet_arp_set_ip4_over_ethernet (vnm, sw_if_index0,
+ (u32)~0 /* default fib */,
+ &arp0->ip4_over_ethernet[0],
+ 0 /* is_static */);
+ error0 = ETHERNET_ARP_ERROR_l3_src_address_learned;
+ }
+
+ /* Only send a reply for requests sent which match a local interface. */
+ if (! (is_request0 && dst_is_local0))
+ {
+ error0 = (arp0->opcode == clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply)
+ ? ETHERNET_ARP_ERROR_replies_received : error0);
+ goto drop1;
+ }
+
+ /* Send a reply. */
+ send_reply:
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ /* Can happen in a multi-core env. */
+ if (PREDICT_FALSE(hw_if0->hw_if_index >= vec_len (am->arp_input_next_index_by_hw_if_index)))
+ {
+ error0 = ETHERNET_ARP_ERROR_missing_interface_address;
+ goto drop2;
+ }
+
+ next0 = vec_elt (am->arp_input_next_index_by_hw_if_index, hw_if0->hw_if_index);
+
+ arp0->opcode = clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply);
+
+ arp0->ip4_over_ethernet[1] = arp0->ip4_over_ethernet[0];
+
+ memcpy (arp0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address, 6);
+ clib_mem_unaligned (&arp0->ip4_over_ethernet[0].ip4.data_u32, u32) = if_addr0->data_u32;
+
+ /* Hardware must be ethernet-like. */
+ ASSERT (vec_len (hw_if0->hw_address) == 6);
+
+ memcpy (eth0->dst_address, eth0->src_address, 6);
+ memcpy (eth0->src_address, hw_if0->hw_address, 6);
+
+ /* Figure out how much to rewind current data from adjacency. */
+ if (ifa0)
+ {
+ adj0 = ip_get_adjacency (&ip4_main.lookup_main,
+ ifa0->neighbor_probe_adj_index);
+ if (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP)
+ {
+ error0 = ETHERNET_ARP_ERROR_missing_interface_address;
+ goto drop2;
+ }
+ if (is_unnum0)
+ next0 = arp_unnumbered (p0, pi0, eth0, ifa0);
+ else
+ vlib_buffer_advance (p0, -adj0->rewrite_header.data_bytes);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,to_next,
+ n_left_to_next,pi0,next0);
+
+ n_replies_sent += 1;
+ continue;
+
+ drop1:
+ if (0 == arp0->ip4_over_ethernet[0].ip4.as_u32 ||
+ (arp0->ip4_over_ethernet[0].ip4.as_u32 ==
+ arp0->ip4_over_ethernet[1].ip4.as_u32))
+ {
+ error0 = ETHERNET_ARP_ERROR_gratuitous_arp;
+ goto drop2;
+ }
+ /* See if proxy arp is configured for the address */
+ if (is_request0)
+ {
+ vnet_sw_interface_t * si;
+ u32 this_addr = clib_net_to_host_u32
+ (arp0->ip4_over_ethernet[1].ip4.as_u32);
+ u32 fib_index0;
+
+ si = vnet_get_sw_interface (vnm, sw_if_index0);
+
+ if (!(si->flags & VNET_SW_INTERFACE_FLAG_PROXY_ARP))
+ goto drop2;
+
+ fib_index0 = vec_elt (im4->fib_index_by_sw_if_index,
+ sw_if_index0);
+
+ vec_foreach (pa, am->proxy_arps)
+ {
+ u32 lo_addr = clib_net_to_host_u32 (pa->lo_addr);
+ u32 hi_addr = clib_net_to_host_u32 (pa->hi_addr);
+
+ /* an ARP request hit in the proxy-arp table? */
+ if ((this_addr >= lo_addr && this_addr <= hi_addr) &&
+ (fib_index0 == pa->fib_index))
+ {
+ eth0 = ethernet_buffer_get_header (p0);
+ proxy_src.as_u32 =
+ arp0->ip4_over_ethernet[1].ip4.data_u32;
+
+ /*
+ * Rewind buffer, direct code above not to
+ * think too hard about it.
+ * $$$ is the answer ever anything other than
+ * vlib_buffer_reset(..)?
+ */
+ ifa0 = 0;
+ if_addr0 = &proxy_src;
+ vlib_buffer_reset (p0);
+ n_proxy_arp_replies_sent++;
+ goto send_reply;
+ }
+ }
+ }
+
+ drop2:
+
+ next0 = ARP_INPUT_NEXT_DROP;
+ p0->error = node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,to_next,
+ n_left_to_next,pi0,next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_error_count (vm, node->node_index,
+ ETHERNET_ARP_ERROR_replies_sent,
+ n_replies_sent - n_proxy_arp_replies_sent);
+
+ vlib_error_count (vm, node->node_index,
+ ETHERNET_ARP_ERROR_proxy_arp_replies_sent,
+ n_proxy_arp_replies_sent);
+ return frame->n_vectors;
+}
+
+static char * ethernet_arp_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ethernet_arp_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (arp_input_node,static) = {
+ .function = arp_input,
+ .name = "arp-input",
+ .vector_size = sizeof (u32),
+
+ .n_errors = ETHERNET_ARP_N_ERROR,
+ .error_strings = ethernet_arp_error_strings,
+
+ .n_next_nodes = ARP_INPUT_N_NEXT,
+ .next_nodes = {
+ [ARP_INPUT_NEXT_DROP] = "error-drop",
+ },
+
+ .format_buffer = format_ethernet_arp_header,
+ .format_trace = format_ethernet_arp_input_trace,
+};
+
+clib_error_t *
+ethernet_arp_hw_interface_link_up_down (vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 flags)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ vnet_hw_interface_t * hw_if;
+
+ hw_if = vnet_get_hw_interface (vnm, hw_if_index);
+
+ /* Record the arp-input next index that reaches this interface's
+ output node; ARP replies are enqueued through it. */
+ vec_validate_init_empty (am->arp_input_next_index_by_hw_if_index, hw_if_index, ~0);
+ am->arp_input_next_index_by_hw_if_index[hw_if_index]
+ = vlib_node_add_next (vnm->vlib_main, arp_input_node.index, hw_if->output_node_index);
+
+ return 0;
+}
+
+VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION (ethernet_arp_hw_interface_link_up_down);
+
+static int
+ip4_arp_entry_sort (void *a1, void *a2)
+{
+ ethernet_arp_ip4_entry_t * e1 = a1;
+ ethernet_arp_ip4_entry_t * e2 = a2;
+
+ int cmp;
+ vnet_main_t * vnm = vnet_get_main();
+
+ cmp = vnet_sw_interface_compare
+ (vnm, e1->key.sw_if_index, e2->key.sw_if_index);
+ if (! cmp)
+ cmp = ip4_address_compare (&e1->key.ip4_address, &e2->key.ip4_address);
+ return cmp;
+}
+
+static clib_error_t *
+show_ip4_arp (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ethernet_arp_ip4_entry_t * e, * es;
+ ethernet_proxy_arp_t * pa;
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+
+ /* Filter entries by interface if given. */
+ sw_if_index = ~0;
+ (void) unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index);
+
+ es = 0;
+ pool_foreach (e, am->ip4_entry_pool, ({ vec_add1 (es, e[0]); }));
+ vec_sort_with_function (es, ip4_arp_entry_sort);
+ vlib_cli_output (vm, "%U", format_ethernet_arp_ip4_entry, vnm, 0);
+ vec_foreach (e, es) {
+ if (sw_if_index != ~0 && e->key.sw_if_index != sw_if_index)
+ continue;
+ vlib_cli_output (vm, "%U", format_ethernet_arp_ip4_entry, vnm, e);
+ }
+ vec_free (es);
+
+ if (vec_len (am->proxy_arps))
+ {
+ vlib_cli_output (vm, "Proxy arps enabled for:");
+ vec_foreach(pa, am->proxy_arps)
+ {
+ vlib_cli_output (vm, "Fib_index %d %U - %U ",
+ pa->fib_index,
+ format_ip4_address, &pa->lo_addr,
+ format_ip4_address, &pa->hi_addr);
+ }
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_ip4_arp_command, static) = {
+ .path = "show ip arp",
+ .function = show_ip4_arp,
+ .short_help = "Show ARP table",
+};
+
+typedef struct {
+ pg_edit_t l2_type, l3_type;
+ pg_edit_t n_l2_address_bytes, n_l3_address_bytes;
+ pg_edit_t opcode;
+ struct {
+ pg_edit_t ethernet;
+ pg_edit_t ip4;
+ } ip4_over_ethernet[2];
+} pg_ethernet_arp_header_t;
+
+static inline void
+pg_ethernet_arp_header_init (pg_ethernet_arp_header_t * p)
+{
+ /* Initialize fields that are not bit fields in the ARP header. */
+#define _(f) pg_edit_init (&p->f, ethernet_arp_header_t, f);
+ _ (l2_type);
+ _ (l3_type);
+ _ (n_l2_address_bytes);
+ _ (n_l3_address_bytes);
+ _ (opcode);
+ _ (ip4_over_ethernet[0].ethernet);
+ _ (ip4_over_ethernet[0].ip4);
+ _ (ip4_over_ethernet[1].ethernet);
+ _ (ip4_over_ethernet[1].ip4);
+#undef _
+}
+
+uword
+unformat_pg_arp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_ethernet_arp_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (ethernet_arp_header_t),
+ &group_index);
+ pg_ethernet_arp_header_init (p);
+
+ /* Defaults. */
+ pg_edit_set_fixed (&p->l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
+ pg_edit_set_fixed (&p->l3_type, ETHERNET_TYPE_IP4);
+ pg_edit_set_fixed (&p->n_l2_address_bytes, 6);
+ pg_edit_set_fixed (&p->n_l3_address_bytes, 4);
+
+ if (! unformat (input, "%U: %U/%U -> %U/%U",
+ unformat_pg_edit,
+ unformat_ethernet_arp_opcode_net_byte_order, &p->opcode,
+ unformat_pg_edit,
+ unformat_ethernet_address, &p->ip4_over_ethernet[0].ethernet,
+ unformat_pg_edit,
+ unformat_ip4_address, &p->ip4_over_ethernet[0].ip4,
+ unformat_pg_edit,
+ unformat_ethernet_address, &p->ip4_over_ethernet[1].ethernet,
+ unformat_pg_edit,
+ unformat_ip4_address, &p->ip4_over_ethernet[1].ip4))
+ {
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+ }
+ return 1;
+}
+
+clib_error_t *ip4_set_arp_limit (u32 arp_limit)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+
+ am->limit_arp_cache_size = arp_limit;
+ return 0;
+}
+
+static clib_error_t * ethernet_arp_init (vlib_main_t * vm)
+{
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ pg_node_t * pn;
+
+ ethernet_register_input_type (vm, ETHERNET_TYPE_ARP, arp_input_node.index);
+
+ pn = pg_get_node (arp_input_node.index);
+ pn->unformat_edit = unformat_pg_arp_header;
+
+ am->opcode_by_name = hash_create_string (0, sizeof (uword));
+#define _(o) hash_set_mem (am->opcode_by_name, #o, ETHERNET_ARP_OPCODE_##o);
+ foreach_ethernet_arp_opcode;
+#undef _
+
+ mhash_init (&am->ip4_entry_by_key,
+ /* value size */ sizeof (uword),
+ /* key size */ sizeof (ethernet_arp_ip4_key_t));
+
+ /* $$$ configurable */
+ am->limit_arp_cache_size = 50000;
+
+ am->pending_resolutions_by_address = hash_create (0, sizeof (uword));
+ am->mac_changes_by_address = hash_create (0, sizeof (uword));
+
+ /* don't trace ARP error packets */
+ {
+ vlib_node_runtime_t *rt =
+ vlib_node_get_runtime (vm, arp_input_node.index);
+
+#define _(a,b) \
+ vnet_pcap_drop_trace_filter_add_del \
+ (rt->errors[ETHERNET_ARP_ERROR_##a], \
+ 1 /* is_add */);
+ foreach_ethernet_arp_error
+#undef _
+ }
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ethernet_arp_init);
+
+int
+vnet_arp_unset_ip4_over_ethernet (vnet_main_t * vnm,
+ u32 sw_if_index, u32 fib_index,
+ void * a_arg)
+{
+ ethernet_arp_ip4_over_ethernet_address_t * a = a_arg;
+ vnet_arp_set_ip4_over_ethernet_rpc_args_t args;
+
+ args.sw_if_index = sw_if_index;
+ args.fib_index = fib_index;
+ args.is_remove = 1;
+ memcpy (&args.a, a, sizeof (*a));
+
+ vl_api_rpc_call_main_thread (set_ip4_over_ethernet_rpc_callback,
+ (u8 *) &args, sizeof (args));
+ return 0;
+}
+
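+/* Main-thread worker for deletes: remove the /32 neighbor route if it is
+   still a rewrite adjacency, then free the cache entry. */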
+static inline int
+vnet_arp_unset_ip4_over_ethernet_internal (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 fib_index,
+ void * a_arg)
+{
+ ethernet_arp_ip4_entry_t * e;
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ ethernet_arp_ip4_over_ethernet_address_t * a = a_arg;
+ ethernet_arp_ip4_key_t k;
+ uword * p;
+ ip4_add_del_route_args_t args;
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 adj_index;
+ ip_adjacency_t * adj;
+
+ k.sw_if_index = sw_if_index;
+ k.ip4_address = a->ip4;
+ k.fib_index = fib_index;
+ p = mhash_get (&am->ip4_entry_by_key, &k);
+ if (! p)
+ return -1;
+
+ memset(&args, 0, sizeof(args));
+
+ /*
+ * Make sure that the route actually exists before we try to delete it,
+ * and make sure that it's a rewrite adjacency.
+ *
+ * If we point 1-N unnumbered interfaces at a loopback interface and
+ * shut down the loopback before shutting down 1-N unnumbered
+ * interfaces, the ARP cache will still have an entry,
+ * but the route will have disappeared.
+ *
+ * See also ip4_del_interface_routes (...)
+ * -> ip4_delete_matching_routes (...).
+ */
+
+ adj_index = ip4_fib_lookup_with_table
+ (im, fib_index, &a->ip4, 1 /* disable default route */);
+
+ /* Miss adj? Forget it... */
+ if (adj_index != lm->miss_adj_index) {
+ adj = ip_get_adjacency (lm, adj_index);
+ /*
+ * Stupid control-plane trick:
+ * admin down an interface (removes arp routes from fib),
+ * bring the interface back up (does not reinstall them)
+ * then remove the arp cache entry (yuck). When that happens,
+ * the adj we find here will be the interface subnet ARP adj.
+ */
+ if (adj->lookup_next_index == IP_LOOKUP_NEXT_REWRITE) {
+ args.table_index_or_table_id = fib_index;
+ args.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL
+ | IP4_ROUTE_FLAG_NEIGHBOR;
+ args.dst_address = a->ip4;
+ args.dst_address_length = 32;
+ ip4_add_del_route (im, &args);
+ ip4_maybe_remap_adjacencies (im, fib_index, args.flags);
+ }
+ }
+
+ e = pool_elt_at_index (am->ip4_entry_pool, p[0]);
+ mhash_unset (&am->ip4_entry_by_key, &e->key, 0);
+ pool_put (am->ip4_entry_pool, e);
+ return 0;
+}
+
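+/* Increment the IP4 and MAC addresses, with carry; used to synthesize
+   sequential entries for the "count" option of the "set ip arp" CLI. */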
+static void
+increment_ip4_and_mac_address (ethernet_arp_ip4_over_ethernet_address_t *a)
+{
+ u8 old;
+ int i;
+
+ for (i = 3; i >= 0; i--)
+ {
+ old = a->ip4.as_u8[i];
+ a->ip4.as_u8[i] += 1;
+ if (old < a->ip4.as_u8[i])
+ break;
+ }
+
+ for (i = 5; i >= 0; i--)
+ {
+ old = a->ethernet[i];
+ a->ethernet[i] += 1;
+ if (old < a->ethernet[i])
+ break;
+ }
+}
+
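+/* Add or delete a proxy-ARP address range for a FIB. Ranges live in a
+   flat vector and are matched linearly in arp_input. */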
+int vnet_proxy_arp_add_del (ip4_address_t *lo_addr,
+ ip4_address_t *hi_addr,
+ u32 fib_index, int is_del)
+{
+ ethernet_arp_main_t *am = &ethernet_arp_main;
+ ethernet_proxy_arp_t *pa;
+ u32 found_at_index = ~0;
+
+ vec_foreach (pa, am->proxy_arps)
+ {
+ if (pa->lo_addr == lo_addr->as_u32
+ && pa->hi_addr == hi_addr->as_u32
+ && pa->fib_index == fib_index)
+ {
+ found_at_index = pa - am->proxy_arps;
+ break;
+ }
+ }
+
+ if (found_at_index != ~0)
+ {
+ /* Delete, otherwise it's already in the table */
+ if (is_del)
+ vec_delete (am->proxy_arps, 1, found_at_index);
+ return 0;
+ }
+ /* delete, no such entry */
+ if (is_del)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ /* add, not in table */
+ vec_add2 (am->proxy_arps, pa, 1);
+ pa->lo_addr = lo_addr->as_u32;
+ pa->hi_addr = hi_addr->as_u32;
+ pa->fib_index = fib_index;
+ return 0;
+}
+
+/*
+ * Remove any proxy ARP entries associated with the
+ * specified fib.
+ */
+int vnet_proxy_arp_fib_reset (u32 fib_id)
+{
+ ip4_main_t * im = &ip4_main;
+ ethernet_arp_main_t *am = &ethernet_arp_main;
+ ethernet_proxy_arp_t *pa;
+ u32 * entries_to_delete = 0;
+ u32 fib_index;
+ uword * p;
+ int i;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ fib_index = p[0];
+
+ vec_foreach (pa, am->proxy_arps)
+ {
+ if (pa->fib_index == fib_index)
+ {
+ vec_add1 (entries_to_delete, pa - am->proxy_arps);
+ }
+ }
+
+ for (i = 0; i < vec_len(entries_to_delete); i++)
+ {
+ vec_delete (am->proxy_arps, 1, entries_to_delete[i]);
+ }
+
+ vec_free (entries_to_delete);
+
+ return 0;
+}
+
+static clib_error_t *
+ip_arp_add_del_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 sw_if_index;
+ ethernet_arp_ip4_over_ethernet_address_t lo_addr, hi_addr, addr;
+ int addr_valid = 0;
+ int is_del = 0;
+ int count = 1;
+ u32 fib_index = 0;
+ u32 fib_id;
+ int is_static = 0;
+ int is_proxy = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* set ip arp TenGigE1/1/0/1 1.2.3.4 aa:bb:... or aabb.ccdd... */
+ if (unformat (input, "%U %U %U",
+ unformat_vnet_sw_interface, vnm, &sw_if_index,
+ unformat_ip4_address, &addr.ip4,
+ unformat_ethernet_address, &addr.ethernet))
+ addr_valid = 1;
+
+ else if (unformat (input, "delete") || unformat (input, "del"))
+ is_del = 1;
+
+ else if (unformat (input, "static"))
+ is_static = 1;
+
+ else if (unformat (input, "count %d", &count))
+ ;
+
+ else if (unformat (input, "fib-id %d", &fib_id))
+ {
+ ip4_main_t * im = &ip4_main;
+ uword * p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (! p)
+ return clib_error_return (0, "fib ID %d doesn't exist\n",
+ fib_id);
+ fib_index = p[0];
+ }
+
+ else if (unformat (input, "proxy %U - %U",
+ unformat_ip4_address, &lo_addr.ip4,
+ unformat_ip4_address, &hi_addr.ip4))
+ is_proxy = 1;
+ else
+ break;
+ }
+
+ if (is_proxy)
+ {
+ (void) vnet_proxy_arp_add_del (&lo_addr.ip4, &hi_addr.ip4,
+ fib_index, is_del);
+ return 0;
+ }
+
+ if (addr_valid)
+ {
+ int i;
+
+ for (i = 0; i < count; i++)
+ {
+ if (is_del == 0)
+ {
+ uword event_type, * event_data = 0;
+
+ /* Park the debug CLI until the arp entry is installed */
+ vnet_register_ip4_arp_resolution_event
+ (vnm, &addr.ip4, vlib_current_process(vm),
+ 1 /* type */, 0 /* data */);
+
+ vnet_arp_set_ip4_over_ethernet
+ (vnm, sw_if_index, fib_index, &addr, is_static);
+
+ vlib_process_wait_for_event (vm);
+ event_type = vlib_process_get_events (vm, &event_data);
+ vec_reset_length(event_data);
+ if (event_type != 1)
+ clib_warning ("event type %d unexpected", event_type);
+ }
+ else
+ vnet_arp_unset_ip4_over_ethernet
+ (vnm, sw_if_index, fib_index, &addr);
+
+ increment_ip4_and_mac_address (&addr);
+ }
+ }
+ else
+ {
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ip_arp_add_del_command, static) = {
+ .path = "set ip arp",
+ .short_help = "set ip arp [del] <intfc> <ip-address> <mac-address>",
+ .function = ip_arp_add_del_command_fn,
+};
+
+static clib_error_t *
+set_int_proxy_arp_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 sw_if_index;
+ vnet_sw_interface_t * si;
+ int enable = 0;
+ int intfc_set = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U", unformat_vnet_sw_interface,
+ vnm, &sw_if_index))
+ intfc_set = 1;
+ else if (unformat (input, "enable") || unformat (input, "on"))
+ enable = 1;
+ else if (unformat (input, "disable") || unformat (input, "off"))
+ enable = 0;
+ else
+ break;
+ }
+
+ if (intfc_set == 0)
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, input);
+
+ si = vnet_get_sw_interface (vnm, sw_if_index);
+ ASSERT(si);
+ if (enable)
+ si->flags |= VNET_SW_INTERFACE_FLAG_PROXY_ARP;
+ else
+ si->flags &= ~VNET_SW_INTERFACE_FLAG_PROXY_ARP;
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_int_proxy_enable_command, static) = {
+ .path = "set interface proxy-arp",
+ .short_help = "set interface proxy-arp <intfc> [enable|disable]",
+ .function = set_int_proxy_arp_command_fn,
+};
+
+
+/*
+ * ARP Termination in an L2 Bridge Domain based on an
+ * IP4 to MAC hash table mac_by_ip4 for each BD.
+ */
+typedef enum {
+ ARP_TERM_NEXT_L2_OUTPUT,
+ ARP_TERM_NEXT_DROP,
+ ARP_TERM_N_NEXT,
+} arp_term_next_t;
+
+u32 arp_term_next_node_index[32];
+
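+/* L2 input feature: terminate ARP requests whose target IP appears in the
+   bridge domain's mac_by_ip4 table by rewriting them into replies in place. */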
+static uword
+arp_term_l2bd (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ l2input_main_t * l2im = &l2input_main;
+ u32 n_left_from, next_index, * from, * to_next;
+ u32 n_replies_sent = 0;
+ u16 last_bd_index = ~0;
+ l2_bridge_domain_t * last_bd_config = 0;
+ l2_input_config_t * cfg0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ethernet_header_t * eth0;
+ ethernet_arp_header_t * arp0;
+ u8 * l3h0;
+ u32 pi0, error0, next0, sw_if_index0;
+ u16 ethertype0;
+ u16 bd_index0;
+ u32 ip0;
+ u8 * macp0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
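+ // l2.l2_len covers the ethernet header plus any VLAN tags, so the
+ // ethertype sits in the two bytes immediately before the L3 payload.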
+ p0 = vlib_get_buffer (vm, pi0);
+ eth0 = vlib_buffer_get_current (p0);
+ l3h0 = (u8 *)eth0 + vnet_buffer(p0)->l2.l2_len;
+ ethertype0 = clib_net_to_host_u16(*(u16 *)(l3h0 - 2));
+ arp0 = (ethernet_arp_header_t *) l3h0;
+
+ if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) &&
+ (p0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ u8 *t0 = vlib_add_trace (
+ vm, node, p0, sizeof(ethernet_arp_input_trace_t));
+ memcpy (t0, l3h0, sizeof(ethernet_arp_input_trace_t));
+ }
+
+ if (PREDICT_FALSE (
+ (ethertype0 != ETHERNET_TYPE_ARP) ||
+ (arp0->opcode != clib_host_to_net_u16(ETHERNET_ARP_OPCODE_request))))
+ goto next_l2_feature;
+
+ error0 = ETHERNET_ARP_ERROR_replies_sent;
+ error0 = (arp0->l2_type != clib_net_to_host_u16 (ETHERNET_ARP_HARDWARE_TYPE_ethernet)
+ ? ETHERNET_ARP_ERROR_l2_type_not_ethernet
+ : error0);
+ error0 = (arp0->l3_type != clib_net_to_host_u16 (ETHERNET_TYPE_IP4)
+ ? ETHERNET_ARP_ERROR_l3_type_not_ip4
+ : error0);
+
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+
+ if (error0)
+ goto drop;
+
+ // Trash ARP packets whose ARP-level source addresses do not
+ // match their L2-frame-level source addresses
+ if (PREDICT_FALSE (
+ memcmp (eth0->src_address, arp0->ip4_over_ethernet[0].ethernet,
+ sizeof (eth0->src_address))))
+ {
+ error0 = ETHERNET_ARP_ERROR_l2_address_mismatch;
+ goto drop;
+ }
+
+ // Check if anyone wants ARP request events for L2 BDs
+ {
+ pending_resolution_t * mc;
+ ethernet_arp_main_t * am = &ethernet_arp_main;
+ uword *p = hash_get (am->mac_changes_by_address, 0);
+ if (p && (vnet_buffer(p0)->l2.shg == 0))
+ { // Only consider SHG 0 interfaces, which are more likely local
+ u32 next_index = p[0];
+ while (next_index != (u32)~0)
+ {
+ int (*fp)(u32, u8 *, u32, u32);
+ int rv = 1;
+ mc = pool_elt_at_index (am->mac_changes, next_index);
+ fp = mc->data_callback;
+ // Call the callback; return 1 to suppress dup events
+ if (fp) rv = (*fp)(mc->data,
+ arp0->ip4_over_ethernet[0].ethernet,
+ sw_if_index0,
+ arp0->ip4_over_ethernet[0].ip4.as_u32);
+ // Signal the resolver process
+ if (rv == 0)
+ vlib_process_signal_event (vm, mc->node_index,
+ mc->type_opaque,
+ mc->data);
+ next_index = mc->next_index;
+ }
+ }
+ }
+
+ // lookup BD mac_by_ip4 hash table for MAC entry
+ ip0 = arp0->ip4_over_ethernet[1].ip4.as_u32;
+ bd_index0 = vnet_buffer(p0)->l2.bd_index;
+ if (PREDICT_FALSE (
+ (bd_index0 != last_bd_index) || (last_bd_index == ~0)))
+ {
+ last_bd_index = bd_index0;
+ last_bd_config = vec_elt_at_index(l2im->bd_configs, bd_index0);
+ }
+ macp0 = (u8 *) hash_get (last_bd_config->mac_by_ip4, ip0);
+
+ if (PREDICT_FALSE(!macp0))
+ goto next_l2_feature; // MAC not found
+
+ // MAC found, send ARP reply -
+ // Convert ARP request packet to ARP reply
+ arp0->opcode = clib_host_to_net_u16 (ETHERNET_ARP_OPCODE_reply);
+ arp0->ip4_over_ethernet[1] = arp0->ip4_over_ethernet[0];
+ arp0->ip4_over_ethernet[0].ip4.as_u32 = ip0;
+ memcpy (arp0->ip4_over_ethernet[0].ethernet, macp0, 6);
+ memcpy (eth0->dst_address, eth0->src_address, 6);
+ memcpy (eth0->src_address, macp0, 6);
+ n_replies_sent += 1;
+
+ // For BVI, need to use l2-fwd node to send ARP reply as
+ // l2-output node cannot output packet to BVI properly
+ cfg0 = vec_elt_at_index(l2im->configs, sw_if_index0);
+ if (PREDICT_FALSE (cfg0->bvi))
+ {
+ vnet_buffer(p0)->l2.feature_bitmap |= L2INPUT_FEAT_FWD;
+ vnet_buffer (p0)->sw_if_index[VLIB_RX] = 0;
+ goto next_l2_feature;
+ }
+
+ // Send ARP reply back out input interface through l2-output
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ next0 = ARP_TERM_NEXT_L2_OUTPUT;
+ // Note that output to VXLAN tunnel will fail due to SHG which
+ // is probably desirable since ARP termination is not intended
+ // for ARP requests from other hosts. If output to VXLAN tunnel is
+ // required, however, can just clear the SHG in packet as follows:
+ // vnet_buffer(p0)->l2.shg = 0;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,to_next,
+ n_left_to_next,pi0,next0);
+ continue;
+
+ next_l2_feature:
+ {
+ u32 feature_bitmap0 =
+ vnet_buffer(p0)->l2.feature_bitmap & ~L2INPUT_FEAT_ARP_TERM;
+ vnet_buffer(p0)->l2.feature_bitmap = feature_bitmap0;
+ next0 = feat_bitmap_get_next_node_index(arp_term_next_node_index,
+ feature_bitmap0);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,to_next,
+ n_left_to_next,pi0,next0);
+ continue;
+ }
+
+ drop:
+ if (0 == arp0->ip4_over_ethernet[0].ip4.as_u32 ||
+ (arp0->ip4_over_ethernet[0].ip4.as_u32 ==
+ arp0->ip4_over_ethernet[1].ip4.as_u32))
+ {
+ error0 = ETHERNET_ARP_ERROR_gratuitous_arp;
+ }
+ next0 = ARP_TERM_NEXT_DROP;
+ p0->error = node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,to_next,
+ n_left_to_next,pi0,next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_error_count (vm, node->node_index,
+ ETHERNET_ARP_ERROR_replies_sent,
+ n_replies_sent);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (arp_term_l2bd_node,static) = {
+ .function = arp_term_l2bd,
+ .name = "arp-term-l2bd",
+ .vector_size = sizeof (u32),
+
+ .n_errors = ETHERNET_ARP_N_ERROR,
+ .error_strings = ethernet_arp_error_strings,
+
+ .n_next_nodes = ARP_TERM_N_NEXT,
+ .next_nodes = {
+ [ARP_TERM_NEXT_L2_OUTPUT] = "l2-output",
+ [ARP_TERM_NEXT_DROP] = "error-drop",
+ },
+
+ .format_buffer = format_ethernet_arp_header,
+ .format_trace = format_ethernet_arp_input_trace,
+};
+
+clib_error_t *arp_term_init (vlib_main_t *vm)
+{ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ arp_term_l2bd_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ arp_term_next_node_index);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (arp_term_init);
diff --git a/vnet/vnet/ethernet/arp_packet.h b/vnet/vnet/ethernet/arp_packet.h
new file mode 100644
index 00000000000..c2214949d89
--- /dev/null
+++ b/vnet/vnet/ethernet/arp_packet.h
@@ -0,0 +1,134 @@
+/*
+ * ethernet/arp_packet.h: IP v4 ARP packet format
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_ethernet_arp_packet_h
+#define included_ethernet_arp_packet_h
+
+#define foreach_ethernet_arp_hardware_type \
+ _ (0, reserved) \
+ _ (1, ethernet) \
+ _ (2, experimental_ethernet) \
+ _ (3, ax_25) \
+ _ (4, proteon_pronet_token_ring) \
+ _ (5, chaos) \
+ _ (6, ieee_802) \
+ _ (7, arcnet) \
+ _ (8, hyperchannel) \
+ _ (9, lanstar) \
+ _ (10, autonet) \
+ _ (11, localtalk) \
+ _ (12, localnet) \
+ _ (13, ultra_link) \
+ _ (14, smds) \
+ _ (15, frame_relay) \
+ _ (16, atm) \
+ _ (17, hdlc) \
+ _ (18, fibre_channel) \
+ _ (19, atm19) \
+ _ (20, serial_line) \
+ _ (21, atm21) \
+ _ (22, mil_std_188_220) \
+ _ (23, metricom) \
+ _ (24, ieee_1394) \
+ _ (25, mapos) \
+ _ (26, twinaxial) \
+ _ (27, eui_64) \
+ _ (28, hiparp) \
+ _ (29, iso_7816_3) \
+ _ (30, arpsec) \
+ _ (31, ipsec_tunnel) \
+ _ (32, infiniband) \
+ _ (33, cai) \
+ _ (34, wiegand) \
+ _ (35, pure_ip) \
+ _ (36, hw_exp1) \
+ _ (256, hw_exp2)
+
+#define foreach_ethernet_arp_opcode \
+ _ (reserved) \
+ _ (request) \
+ _ (reply) \
+ _ (reverse_request) \
+ _ (reverse_reply) \
+ _ (drarp_request) \
+ _ (drarp_reply) \
+ _ (drarp_error) \
+ _ (inarp_request) \
+ _ (inarp_reply) \
+ _ (arp_nak) \
+ _ (mars_request) \
+ _ (mars_multi) \
+ _ (mars_mserv) \
+ _ (mars_join) \
+ _ (mars_leave) \
+ _ (mars_nak) \
+ _ (mars_unserv) \
+ _ (mars_sjoin) \
+ _ (mars_sleave) \
+ _ (mars_grouplist_request) \
+ _ (mars_grouplist_reply) \
+ _ (mars_redirect_map) \
+ _ (mapos_unarp) \
+ _ (exp1) \
+ _ (exp2)
+
+typedef enum {
+#define _(n,f) ETHERNET_ARP_HARDWARE_TYPE_##f = (n),
+ foreach_ethernet_arp_hardware_type
+#undef _
+} ethernet_arp_hardware_type_t;
+
+typedef enum {
+#define _(f) ETHERNET_ARP_OPCODE_##f,
+ foreach_ethernet_arp_opcode
+#undef _
+ ETHERNET_ARP_N_OPCODE,
+} ethernet_arp_opcode_t;
+
+typedef enum {
+ IP4_ARP_NEXT_DROP,
+ IP4_ARP_N_NEXT,
+} ip4_arp_next_t;
+
+typedef enum {
+ IP4_ARP_ERROR_DROP,
+ IP4_ARP_ERROR_REQUEST_SENT,
+ IP4_ARP_ERROR_NON_ARP_ADJ,
+ IP4_ARP_ERROR_REPLICATE_DROP,
+ IP4_ARP_ERROR_REPLICATE_FAIL
+} ip4_arp_error_t;
+
+typedef CLIB_PACKED (struct {
+ u8 ethernet[6];
+ ip4_address_t ip4;
+}) ethernet_arp_ip4_over_ethernet_address_t;
+
+typedef struct {
+ u16 l2_type;
+ u16 l3_type;
+ u8 n_l2_address_bytes;
+ u8 n_l3_address_bytes;
+ u16 opcode;
+ union {
+ ethernet_arp_ip4_over_ethernet_address_t ip4_over_ethernet[2];
+
+ /* Others... */
+ u8 data[0];
+ };
+} ethernet_arp_header_t;
+
+#endif /* included_ethernet_arp_packet_h */
diff --git a/vnet/vnet/ethernet/cli.c b/vnet/vnet/ethernet/cli.c
new file mode 100644
index 00000000000..cbf240c7f68
--- /dev/null
+++ b/vnet/vnet/ethernet/cli.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * cli.c: ethernet CLI
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+
+VLIB_CLI_COMMAND (vlib_cli_ethernet_command, static) = {
+ .path = "ethernet",
+ .short_help = "Ethernet commands",
+};
+
+static clib_error_t *
+promiscuous_cmd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 hw_if_index;
+ u32 flags = ETHERNET_INTERFACE_FLAG_ACCEPT_ALL;
+
+ if (unformat (input, "on %U",
+ unformat_ethernet_interface, vnm, &hw_if_index))
+ {
+ ethernet_set_flags (vnm, hw_if_index, flags);
+ }
+ else if (unformat (input, "off %U",
+ unformat_ethernet_interface, vnm, &hw_if_index))
+ {
+ flags = 0;
+ ethernet_set_flags (vnm, hw_if_index, flags);
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ethernet_promiscuous_command, static) = {
+ .path = "ethernet promiscuous",
+ .short_help = "ethernet promiscuous [on | off] <intfc>",
+ .function = promiscuous_cmd,
+};
+
+static clib_error_t *
+mtu_cmd (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 hw_if_index, mtu;
+ u32 flags = ETHERNET_INTERFACE_FLAG_MTU;
+
+ if (unformat (input, "%d %U", &mtu,
+ unformat_ethernet_interface, vnm, &hw_if_index))
+ {
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ if (mtu < ETHERNET_MIN_PACKET_BYTES)
+ return clib_error_return (0, "Invalid mtu (%d): "
+ "must be >= min pkt bytes (%d)", mtu,
+ hi->min_packet_bytes);
+
+ if (mtu > ETHERNET_MAX_PACKET_BYTES)
+ return clib_error_return (0, "Invalid mtu (%d): must be <= 9216", mtu);
+
+ if (hi->max_packet_bytes != mtu)
+ {
+ hi->max_packet_bytes = mtu;
+ ethernet_set_flags (vnm, hw_if_index, flags);
+ }
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ethernet_mtu_command, static) = {
+ .path = "ethernet mtu",
+ .short_help = "ethernet mtu <64-9216> <intfc>",
+ .function = mtu_cmd,
+};
+
+clib_error_t *
+ethernet_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ethernet_cli_init);
diff --git a/vnet/vnet/ethernet/error.def b/vnet/vnet/ethernet/error.def
new file mode 100644
index 00000000000..36679c0ce1c
--- /dev/null
+++ b/vnet/vnet/ethernet/error.def
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet_error.def: ethernet errors
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ethernet_error (NONE, PUNT, "no error")
+ethernet_error (BAD_LLC_LENGTH, DROP, "llc length > packet length")
+ethernet_error (UNKNOWN_TYPE, PUNT, "unknown ethernet type")
+ethernet_error (UNKNOWN_VLAN, DROP, "unknown vlan")
+ethernet_error (L3_MAC_MISMATCH, DROP, "l3 mac mismatch")
+ethernet_error (DOWN, DROP, "subinterface down")
+
diff --git a/vnet/vnet/ethernet/ethernet.h b/vnet/vnet/ethernet/ethernet.h
new file mode 100644
index 00000000000..266e1d79afb
--- /dev/null
+++ b/vnet/vnet/ethernet/ethernet.h
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet.h: types/functions for ethernet.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ethernet_h
+#define included_ethernet_h
+
+#include <vnet/vnet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/pg/pg.h>
+
+always_inline u64
+ethernet_mac_address_u64 (u8 * a)
+{ return (((u64) a[0] << (u64) (5*8))
+ | ((u64) a[1] << (u64) (4*8))
+ | ((u64) a[2] << (u64) (3*8))
+ | ((u64) a[3] << (u64) (2*8))
+ | ((u64) a[4] << (u64) (1*8))
+ | ((u64) a[5] << (u64) (0*8))); }
+
+static inline int ethernet_mac_address_is_multicast_u64 (u64 a)
+{
+ return (a & (1ULL<<(5*8))) != 0;
+}
+
+/* Max. sized ethernet/vlan header for parsing. */
+typedef struct {
+ ethernet_header_t ethernet;
+
+ /* Allow up to 2 stacked vlan headers. */
+ ethernet_vlan_header_t vlan[2];
+} ethernet_max_header_t;
+
+struct vnet_hw_interface_t;
+/* Ethernet flag change callback. */
+typedef u32 (ethernet_flag_change_function_t)
+(vnet_main_t * vnm, struct vnet_hw_interface_t * hi, u32 flags);
+
+#define ETHERNET_MIN_PACKET_BYTES 64
+#define ETHERNET_MAX_PACKET_BYTES 9216
+
+/* Ethernet interface instance. */
+typedef struct ethernet_interface {
+
+ /* Accept all packets (promiscuous mode). */
+#define ETHERNET_INTERFACE_FLAG_ACCEPT_ALL (1 << 0)
+#define ETHERNET_INTERFACE_FLAG_CONFIG_PROMISC(flags) \
+ (((flags) & ~ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) == 0)
+
+ /* Change MTU on interface from hw interface structure */
+#define ETHERNET_INTERFACE_FLAG_MTU (1 << 1)
+#define ETHERNET_INTERFACE_FLAG_CONFIG_MTU(flags) \
+ ((flags) & ETHERNET_INTERFACE_FLAG_MTU)
+
+ /* Callback, e.g. to turn on/off promiscuous mode */
+ ethernet_flag_change_function_t * flag_change;
+
+ u32 driver_instance;
+
+ /* Ethernet (MAC) address for this interface. */
+ u8 address[6];
+} ethernet_interface_t;
+
+vnet_hw_interface_class_t ethernet_hw_interface_class;
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+ /* Ethernet type in host byte order. */
+ ethernet_type_t type;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} ethernet_type_info_t;
+
+typedef enum {
+#define ethernet_error(n,c,s) ETHERNET_ERROR_##n,
+#include <vnet/ethernet/error.def>
+#undef ethernet_error
+ ETHERNET_N_ERROR,
+} ethernet_error_t;
+
+
+// Structs used when parsing a packet to find its sw_if_index
+
+typedef struct {
+ u32 sw_if_index;
+ u32 flags;
+  // The flags encode: a config-entry is-valid bit, exact-match bits
+  // (one per tag count the entry matches: 0/1/2/3 tags), and the
+  // L2 vs L3 forwarding mode.
+#define SUBINT_CONFIG_MATCH_0_TAG (1<<0)
+#define SUBINT_CONFIG_MATCH_1_TAG (1<<1)
+#define SUBINT_CONFIG_MATCH_2_TAG (1<<2)
+#define SUBINT_CONFIG_MATCH_3_TAG (1<<3)
+#define SUBINT_CONFIG_VALID (1<<4)
+#define SUBINT_CONFIG_L2 (1<<5)
+
+} subint_config_t;
+
+always_inline u32
+eth_create_valid_subint_match_flags (u32 num_tags) {
+ return SUBINT_CONFIG_VALID | (1 << num_tags);
+}
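+
+/* Example: eth_create_valid_subint_match_flags (2) returns
+   SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_2_TAG, i.e. a valid
+   config entry that exact-matches double-tagged packets. */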
+
+
+typedef struct {
+ subint_config_t untagged_subint;
+ subint_config_t default_subint;
+ u16 dot1q_vlans; // pool id for vlan table
+ u16 dot1ad_vlans; // pool id for vlan table
+} main_intf_t;
+
+typedef struct {
+ subint_config_t single_tag_subint;
+ subint_config_t inner_any_subint;
+ u32 qinqs; // pool id for qinq table
+} vlan_intf_t;
+
+typedef struct {
+ vlan_intf_t vlans[ETHERNET_N_VLAN];
+} vlan_table_t;
+
+typedef struct {
+ subint_config_t subint;
+} qinq_intf_t;
+
+typedef struct {
+ qinq_intf_t vlans[ETHERNET_N_VLAN];
+} qinq_table_t;
+
+// Structure mapping to a next index based on ethertype.
+// Common ethertypes are stored explicitly, others are
+// stored in a sparse table.
+typedef struct {
+ /* Sparse vector mapping ethernet type in network byte order
+ to next index. */
+ u16 * input_next_by_type;
+ u32 * sparse_index_by_input_next_index;
+
+ /* cached next indexes for common ethertypes */
+ u32 input_next_ip4;
+ u32 input_next_ip6;
+ u32 input_next_mpls;
+} next_by_ethertype_t;
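+
+/* Lookup sketch (illustrative; mirrors the L3 path in ethernet-input):
+   uncommon ethertypes resolve through the sparse vector.
+
+     u32 i = sparse_vec_index (l3_next->input_next_by_type, type);
+     u32 next = vec_elt (l3_next->input_next_by_type, i);
+     // i == SPARSE_VEC_INVALID_INDEX => unknown ethertype
+*/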
+
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ /* next node index for the L3 input node of each ethertype */
+ next_by_ethertype_t l3_next;
+
+ /* next node index for L2 interfaces */
+ u32 l2_next;
+
+ /* flag and next node index for L3 redirect */
+ u32 redirect_l3;
+ u32 redirect_l3_next;
+
+ /* Pool of ethernet interface instances. */
+ ethernet_interface_t * interfaces;
+
+ ethernet_type_info_t * type_infos;
+
+ /* Hash tables mapping name/type to type info index. */
+ uword * type_info_by_name, * type_info_by_type;
+
+ // The root of the vlan parsing tables. A vector with one element
+ // for each main interface, indexed by hw_if_index.
+ main_intf_t * main_intfs;
+
+ // Pool of vlan tables
+ vlan_table_t * vlan_pool;
+
+  // Pool of qinq tables
+ qinq_table_t * qinq_pool;
+
+  /* Set to one to use aabb.ccdd.eeff instead of aa:bb:cc:dd:ee:ff as
+     the ethernet address format. */
+ int format_ethernet_address_16bit;
+
+} ethernet_main_t;
+
+ethernet_main_t ethernet_main;
+
+always_inline ethernet_type_info_t *
+ethernet_get_type_info (ethernet_main_t * em, ethernet_type_t type)
+{
+ uword * p = hash_get (em->type_info_by_type, type);
+ return p ? vec_elt_at_index (em->type_infos, p[0]) : 0;
+}
+
+ethernet_interface_t *
+ethernet_get_interface (ethernet_main_t * em, u32 hw_if_index);
+
+clib_error_t *
+ethernet_register_interface (vnet_main_t * vnm,
+ u32 dev_class_index,
+ u32 dev_instance,
+ u8 * address,
+ u32 * hw_if_index_return,
+ ethernet_flag_change_function_t flag_change);
+
+void ethernet_delete_interface (vnet_main_t * vnm, u32 hw_if_index);
+
+/* Register given node index to take input for given ethernet type. */
+void
+ethernet_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index);
+
+/* Register given node index to take input for packet from L2 interfaces. */
+void
+ethernet_register_l2_input (vlib_main_t * vm,
+ u32 node_index);
+
+/* Register given node index to take redirected L3 traffic, and enable L3 redirect */
+void
+ethernet_register_l3_redirect (vlib_main_t * vm,
+ u32 node_index);
+
+/* Formats ethernet address X:X:X:X:X:X */
+u8 * format_ethernet_address (u8 * s, va_list * args);
+u8 * format_ethernet_type (u8 * s, va_list * args);
+u8 * format_ethernet_header (u8 * s, va_list * args);
+u8 * format_ethernet_header_with_length (u8 * s, va_list * args);
+
+/* Parse ethernet address in either X:X:X:X:X:X unix or X.X.X cisco format. */
+uword
+unformat_ethernet_address (unformat_input_t * input, va_list * args);
+
+/* Parse ethernet type as 0xXXXX or a type name from ethernet/types.def,
+   in either host or network byte order. */
+uword
+unformat_ethernet_type_host_byte_order (unformat_input_t * input,
+ va_list * args);
+uword
+unformat_ethernet_type_net_byte_order (unformat_input_t * input,
+ va_list * args);
+
+/* Parse ethernet header. */
+uword
+unformat_ethernet_header (unformat_input_t * input, va_list * args);
+
+/* Parse ethernet interface name; return hw_if_index. */
+uword unformat_ethernet_interface (unformat_input_t * input, va_list * args);
+
+uword unformat_pg_ethernet_header (unformat_input_t * input, va_list * args);
+
+always_inline void
+ethernet_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_ethernet_header_with_length;
+ n->unformat_buffer = unformat_ethernet_header;
+ pn->unformat_edit = unformat_pg_ethernet_header;
+}
+
+always_inline ethernet_header_t *
+ethernet_buffer_get_header (vlib_buffer_t * b)
+{
+ return (void *)
+ (b->data
+ + vnet_buffer (b)->ethernet.start_of_ethernet_header);
+}
+
+ethernet_main_t * ethernet_get_main (vlib_main_t * vm);
+u32 ethernet_set_flags (vnet_main_t * vnm, u32 hw_if_index, u32 flags);
+void ethernet_sw_interface_set_l2_mode (vnet_main_t * vnm, u32 sw_if_index, u32 l2);
+void ethernet_set_rx_redirect (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 enable);
+
+int
+vnet_arp_set_ip4_over_ethernet (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 fib_index,
+ void *a_arg,
+ int is_static);
+
+int
+vnet_arp_unset_ip4_over_ethernet (vnet_main_t * vnm,
+ u32 sw_if_index, u32 fib_index,
+ void * a_arg);
+
+int vnet_proxy_arp_fib_reset (u32 fib_id);
+
+clib_error_t * next_by_ethertype_init (next_by_ethertype_t * l3_next);
+clib_error_t * next_by_ethertype_register (next_by_ethertype_t * l3_next,
+ u32 ethertype,
+ u32 next_index);
+
+int vnet_create_loopback_interface (u32 * sw_if_indexp, u8 *mac_address);
+int vnet_delete_loopback_interface (u32 sw_if_index);
+
+// Perform ethernet subinterface classification table lookups given
+// the port's sw_if_index and fields extracted from the ethernet header.
+// The resulting tables are used by identify_subint().
+always_inline void
+eth_vlan_table_lookups (ethernet_main_t *em,
+ vnet_main_t * vnm,
+ u32 port_sw_if_index0,
+ u16 first_ethertype,
+ u16 outer_id,
+ u16 inner_id,
+ vnet_hw_interface_t ** hi,
+ main_intf_t **main_intf,
+ vlan_intf_t **vlan_intf,
+ qinq_intf_t **qinq_intf)
+{
+ vlan_table_t *vlan_table;
+ qinq_table_t *qinq_table;
+ u32 vlan_table_id;
+
+ // Read the main, vlan, and qinq interface table entries
+ // TODO: Consider if/how to prefetch tables. Also consider
+ // single-entry cache to skip table lookups and identify_subint()
+ // processing.
+ *hi = vnet_get_sup_hw_interface (vnm, port_sw_if_index0);
+ *main_intf = vec_elt_at_index (em->main_intfs, (*hi)->hw_if_index);
+
+ // Always read the vlan and qinq tables, even if there are not that
+ // many tags on the packet. This makes the lookups and comparisons
+ // easier (and less branchy).
+ vlan_table_id = (first_ethertype == ETHERNET_TYPE_DOT1AD) ?
+ (*main_intf)->dot1ad_vlans :
+ (*main_intf)->dot1q_vlans;
+ vlan_table = vec_elt_at_index (em->vlan_pool, vlan_table_id);
+ *vlan_intf = &vlan_table->vlans[outer_id];
+
+ qinq_table = vec_elt_at_index (em->qinq_pool, (*vlan_intf)->qinqs);
+ *qinq_intf = &qinq_table->vlans[inner_id];
+}
+
+
+// Determine the subinterface for this packet, given the result of the
+// vlan table lookups and vlan header parsing. Check the most specific
+// matches first.
+// Returns 1 if a matching subinterface was found, otherwise returns 0.
+always_inline u32
+eth_identify_subint (vnet_hw_interface_t * hi,
+ vlib_buffer_t * b0,
+ u32 match_flags,
+ main_intf_t * main_intf,
+ vlan_intf_t * vlan_intf,
+ qinq_intf_t * qinq_intf,
+ u32 * new_sw_if_index,
+ u8 * error0,
+ u32 * is_l2)
+{
+ subint_config_t * subint;
+
+ // Each comparison is checking both the valid flag and the number of tags
+ // (incorporating exact-match/non-exact-match).
+
+ // check for specific double tag
+ subint = &qinq_intf->subint;
+ if ((subint->flags & match_flags) == match_flags) goto matched;
+
+ // check for specific outer and 'any' inner
+ subint = &vlan_intf->inner_any_subint;
+ if ((subint->flags & match_flags) == match_flags) goto matched;
+
+ // check for specific single tag
+ subint = &vlan_intf->single_tag_subint;
+ if ((subint->flags & match_flags) == match_flags) goto matched;
+
+ // check for untagged interface
+ subint = &main_intf->untagged_subint;
+ if ((subint->flags & match_flags) == match_flags) goto matched;
+
+ // check for default interface
+ subint = &main_intf->default_subint;
+ if ((subint->flags & match_flags) == match_flags) goto matched;
+
+ // No matching subinterface
+ *new_sw_if_index = ~0;
+ *error0 = ETHERNET_ERROR_UNKNOWN_VLAN;
+ *is_l2 = 0;
+ return 0;
+
+ matched:
+ *new_sw_if_index = subint->sw_if_index;
+ *is_l2 = subint->flags & SUBINT_CONFIG_L2;
+ return 1;
+}
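+
+/* Example: a single-tagged packet carries match_flags =
+   SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_1_TAG, so only a subint
+   whose flags include both bits (an exact-match single-tag
+   subinterface, or a default subinterface configured with all of the
+   match-N-tag bits) can satisfy
+   (subint->flags & match_flags) == match_flags. */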
+
+always_inline ethernet_main_t *
+vnet_get_ethernet_main (void)
+{
+ return &ethernet_main;
+}
+
+void vnet_register_ip4_arp_resolution_event (vnet_main_t * vnm,
+ void * address_arg,
+ uword node_index,
+ uword type_opaque,
+ uword data);
+
+
+int vnet_add_del_ip4_arp_change_event (vnet_main_t * vnm,
+ void * data_callback,
+ u32 pid,
+ void * address_arg,
+ uword node_index,
+ uword type_opaque,
+ uword data, int is_add);
+
+vlib_node_registration_t ethernet_input_node;
+
+#endif /* included_ethernet_h */
diff --git a/vnet/vnet/ethernet/format.c b/vnet/vnet/ethernet/format.c
new file mode 100644
index 00000000000..be456d56d71
--- /dev/null
+++ b/vnet/vnet/ethernet/format.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet_format.c: ethernet formatting/parsing.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+
+u8 * format_ethernet_address (u8 * s, va_list * args)
+{
+ ethernet_main_t * em = &ethernet_main;
+ u8 * a = va_arg (*args, u8 *);
+
+ if (em->format_ethernet_address_16bit)
+ return format (s, "%02x%02x.%02x%02x.%02x%02x",
+ a[0], a[1], a[2], a[3], a[4], a[5]);
+ else
+ return format (s, "%02x:%02x:%02x:%02x:%02x:%02x",
+ a[0], a[1], a[2], a[3], a[4], a[5]);
+}
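+
+/* Usage sketch (illustrative): format functions are invoked via "%U".
+   With format_ethernet_address_16bit clear this yields
+   "01:23:45:67:89:ab"; with it set, "0123.4567.89ab".
+
+     u8 mac[6] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab };
+     u8 * s = format (0, "%U", format_ethernet_address, mac);
+*/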
+
+u8 * format_ethernet_type (u8 * s, va_list * args)
+{
+ ethernet_type_t type = va_arg (*args, u32);
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_type_info_t * t = ethernet_get_type_info (em, type);
+
+ if (t)
+ s = format (s, "%s", t->name);
+ else
+ s = format (s, "0x%04x", type);
+
+ return s;
+}
+
+u8 * format_ethernet_header_with_length (u8 * s, va_list * args)
+{
+ ethernet_max_header_t * m = va_arg (*args, ethernet_max_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_header_t * e = &m->ethernet;
+ ethernet_vlan_header_t * v;
+ ethernet_type_t type = clib_net_to_host_u16 (e->type);
+ u32 n_vlan = 0, i, header_bytes;
+ uword indent;
+
+ while (type == ETHERNET_TYPE_VLAN
+ && n_vlan < ARRAY_LEN (m->vlan))
+ {
+ v = m->vlan + n_vlan;
+ type = clib_net_to_host_u16 (v->type);
+ n_vlan++;
+ }
+
+ header_bytes = sizeof (e[0]) + n_vlan * sizeof (v[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "ethernet header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "%U: %U -> %U",
+ format_ethernet_type, type,
+ format_ethernet_address, e->src_address,
+ format_ethernet_address, e->dst_address);
+
+ for (i = 0; i < n_vlan; i++)
+ {
+ u32 v = clib_net_to_host_u16 (m->vlan[i].priority_cfi_and_id);
+ u32 vid = (v & 0xfff);
+ u32 cfi = (v >> 12) & 1;
+ u32 pri = (v >> 13);
+
+ s = format (s, " vlan %d", vid);
+ if (pri != 0)
+ s = format (s, " priority %d", pri);
+ if (cfi != 0)
+ s = format (s, " cfi");
+ }
+
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ ethernet_type_info_t * ti;
+ vlib_node_t * node = 0;
+
+ ti = ethernet_get_type_info (em, type);
+ if (ti && ti->node_index != ~0)
+ node = vlib_get_node (em->vlib_main, ti->node_index);
+ if (node && node->format_buffer)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ node->format_buffer, (void *) m + header_bytes,
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+u8 * format_ethernet_header (u8 * s, va_list * args)
+{
+ ethernet_max_header_t * m = va_arg (*args, ethernet_max_header_t *);
+ return format (s, "%U", format_ethernet_header_with_length, m, 0);
+}
+
+/* Parse X:X:X:X:X:X unix style ethernet address. */
+static uword
+unformat_ethernet_address_unix (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ u32 i, a[6];
+
+ if (! unformat (input, "%_%x:%x:%x:%x:%x:%x%_",
+ &a[0], &a[1], &a[2], &a[3], &a[4], &a[5]))
+ return 0;
+
+ /* Check range. */
+ for (i = 0; i < ARRAY_LEN (a); i++)
+ if (a[i] >= (1 << 8))
+ return 0;
+
+ for (i = 0; i < ARRAY_LEN (a); i++)
+ result[i] = a[i];
+
+ return 1;
+}
+
+/* Parse X.X.X cisco style ethernet address. */
+static uword
+unformat_ethernet_address_cisco (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ u32 i, a[3];
+
+ if (! unformat (input, "%_%x.%x.%x%_", &a[0], &a[1], &a[2]))
+ return 0;
+
+ /* Check range. */
+ for (i = 0; i < ARRAY_LEN (a); i++)
+ if (a[i] >= (1 << 16))
+ return 0;
+
+ result[0] = (a[0] >> 8) & 0xff;
+ result[1] = (a[0] >> 0) & 0xff;
+ result[2] = (a[1] >> 8) & 0xff;
+ result[3] = (a[1] >> 0) & 0xff;
+ result[4] = (a[2] >> 8) & 0xff;
+ result[5] = (a[2] >> 0) & 0xff;
+
+ return 1;
+}
+
+/* Parse ethernet address; accept either unix or cisco style addresses. */
+uword
+unformat_ethernet_address (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ return (unformat_user (input, unformat_ethernet_address_unix, result)
+ || unformat_user (input, unformat_ethernet_address_cisco, result));
+}
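+
+/* Usage sketch (illustrative): both spellings parse into mac[6]:
+
+     u8 mac[6];
+     unformat (input, "%U", unformat_ethernet_address, mac);
+     // accepts "01:23:45:67:89:ab" or "0123.4567.89ab"
+*/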
+
+/* Returns ethernet type as an int in host byte order. */
+uword
+unformat_ethernet_type_host_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ ethernet_main_t * em = &ethernet_main;
+ int type, i;
+
+ /* Numeric type. */
+ if (unformat (input, "0x%x", &type)
+ || unformat (input, "%d", &type))
+ {
+ if (type >= (1 << 16))
+ return 0;
+ *result = type;
+ return 1;
+ }
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ em->type_info_by_name, &i))
+ {
+ ethernet_type_info_t * ti = vec_elt_at_index (em->type_infos, i);
+ *result = ti->type;
+ return 1;
+ }
+
+ return 0;
+}
+
+uword
+unformat_ethernet_type_net_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ if (! unformat_user (input, unformat_ethernet_type_host_byte_order, result))
+ return 0;
+
+ *result = clib_host_to_net_u16 ((u16) *result);
+ return 1;
+}
+
+uword
+unformat_ethernet_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ ethernet_max_header_t _m, * m = &_m;
+ ethernet_header_t * e = &m->ethernet;
+ u16 type;
+ u32 n_vlan;
+
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_ethernet_type_host_byte_order, &type,
+ unformat_ethernet_address, &e->src_address,
+ unformat_ethernet_address, &e->dst_address))
+ return 0;
+
+ n_vlan = 0;
+ while (unformat (input, "vlan"))
+ {
+ u32 id, priority;
+
+ if (! unformat_user (input, unformat_vlib_number, &id)
+ || id >= ETHERNET_N_VLAN)
+ return 0;
+
+ if (unformat (input, "priority %d", &priority))
+ {
+ if (priority >= 8)
+ return 0;
+ id |= priority << 13;
+ }
+
+ if (unformat (input, "cfi"))
+ id |= 1 << 12;
+
+ /* Too many vlans given. */
+ if (n_vlan >= ARRAY_LEN (m->vlan))
+ return 0;
+
+ m->vlan[n_vlan].priority_cfi_and_id = clib_host_to_net_u16 (id);
+ n_vlan++;
+ }
+
+ if (n_vlan == 0)
+ e->type = clib_host_to_net_u16 (type);
+ else
+ {
+ int i;
+
+ e->type = clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ for (i = 0; i < n_vlan - 1; i++)
+ m->vlan[i].type = clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ m->vlan[n_vlan - 1].type = clib_host_to_net_u16 (type);
+ }
+
+ /* Add header to result. */
+ {
+ void * p;
+ u32 n_bytes = sizeof (e[0]) + n_vlan * sizeof (m->vlan[0]);
+
+ vec_add2 (*result, p, n_bytes);
+ memcpy (p, m, n_bytes);
+ }
+
+ return 1;
+}
diff --git a/vnet/vnet/ethernet/init.c b/vnet/vnet/ethernet/init.c
new file mode 100644
index 00000000000..4ac14e208be
--- /dev/null
+++ b/vnet/vnet/ethernet/init.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet_init.c: ethernet initialization
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+
+/* Global main structure. */
+ethernet_main_t ethernet_main;
+
+static void add_type (ethernet_main_t * em,
+ ethernet_type_t type,
+ char * type_name)
+{
+ ethernet_type_info_t * ti;
+ u32 i;
+
+ vec_add2 (em->type_infos, ti, 1);
+ i = ti - em->type_infos;
+
+ ti->name = type_name;
+ ti->type = type;
+ ti->next_index = ti->node_index = ~0;
+
+ hash_set (em->type_info_by_type, type, i);
+ hash_set_mem (em->type_info_by_name, ti->name, i);
+}
+
+static clib_error_t * ethernet_init (vlib_main_t * vm)
+{
+ ethernet_main_t * em = &ethernet_main;
+ clib_error_t * error;
+
+ em->vlib_main = vm;
+
+ em->type_info_by_name = hash_create_string (0, sizeof (uword));
+ em->type_info_by_type = hash_create (0, sizeof (uword));
+
+#define ethernet_type(n,s) add_type (em, ETHERNET_TYPE_##s, #s);
+#include "types.def"
+#undef ethernet_type
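+
+/* For example, a types.def entry such as ethernet_type (0x0800, IP4)
+   expands to add_type (em, ETHERNET_TYPE_IP4, "IP4"), populating both
+   the by-name and by-type hash tables. */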
+
+ if ((error = vlib_call_init_function (vm, llc_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ethernet_input_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ethernet_cli_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (ethernet_init);
+
+ethernet_main_t * ethernet_get_main (vlib_main_t * vm)
+{
+ vlib_call_init_function (vm, ethernet_init);
+ return &ethernet_main;
+}
+
diff --git a/vnet/vnet/ethernet/interface.c b/vnet/vnet/ethernet/interface.c
new file mode 100644
index 00000000000..fe7eb76bf46
--- /dev/null
+++ b/vnet/vnet/ethernet/interface.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet_interface.c: ethernet interfaces
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+
+static uword ethernet_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ vnet_sw_interface_t * sub_sw = vnet_get_sw_interface (vnm, sw_if_index);
+ vnet_sw_interface_t * sup_sw = vnet_get_sup_sw_interface (vnm, sw_if_index);
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_interface_t * ei;
+ ethernet_header_t * h = rewrite;
+ ethernet_type_t type;
+ uword n_bytes = sizeof (h[0]);
+
+ if (sub_sw != sup_sw) {
+ if (sub_sw->sub.eth.flags.one_tag) {
+ n_bytes += sizeof (ethernet_vlan_header_t);
+ } else if (sub_sw->sub.eth.flags.two_tags) {
+ n_bytes += 2 * (sizeof (ethernet_vlan_header_t));
+ }
+ // Check for encaps that are not supported for L3 interfaces
+ if (!(sub_sw->sub.eth.flags.exact_match) ||
+ (sub_sw->sub.eth.flags.default_sub) ||
+ (sub_sw->sub.eth.flags.outer_vlan_id_any) ||
+ (sub_sw->sub.eth.flags.inner_vlan_id_any)) {
+ return 0;
+ }
+ }
+
+ if (n_bytes > max_rewrite_bytes)
+ return 0;
+
+ switch (l3_type) {
+#define _(a,b) case VNET_L3_PACKET_TYPE_##a: type = ETHERNET_TYPE_##b; break
+ _ (IP4, IP4);
+ _ (IP6, IP6);
+ _ (MPLS_UNICAST, MPLS_UNICAST);
+ _ (MPLS_MULTICAST, MPLS_MULTICAST);
+ _ (ARP, ARP);
+#undef _
+ default:
+ return 0;
+ }
+
+ ei = pool_elt_at_index (em->interfaces, hw->hw_instance);
+ memcpy (h->src_address, ei->address, sizeof (h->src_address));
+ if (dst_address)
+ memcpy (h->dst_address, dst_address, sizeof (h->dst_address));
+ else
+ memset (h->dst_address, ~0, sizeof (h->dst_address)); /* broadcast */
+
+ if (sub_sw->sub.eth.flags.one_tag) {
+ ethernet_vlan_header_t * outer = (void *) (h + 1);
+
+ h->type = sub_sw->sub.eth.flags.dot1ad ?
+ clib_host_to_net_u16 (ETHERNET_TYPE_DOT1AD) :
+ clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ outer->priority_cfi_and_id = clib_host_to_net_u16 (sub_sw->sub.eth.outer_vlan_id);
+ outer->type = clib_host_to_net_u16 (type);
+
+ } else if (sub_sw->sub.eth.flags.two_tags) {
+ ethernet_vlan_header_t * outer = (void *) (h + 1);
+ ethernet_vlan_header_t * inner = (void *) (outer + 1);
+
+ h->type = sub_sw->sub.eth.flags.dot1ad ?
+ clib_host_to_net_u16 (ETHERNET_TYPE_DOT1AD) :
+ clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ outer->priority_cfi_and_id = clib_host_to_net_u16 (sub_sw->sub.eth.outer_vlan_id);
+ outer->type = clib_host_to_net_u16 (ETHERNET_TYPE_VLAN);
+ inner->priority_cfi_and_id = clib_host_to_net_u16 (sub_sw->sub.eth.inner_vlan_id);
+ inner->type = clib_host_to_net_u16 (type);
+
+ } else {
+ h->type = clib_host_to_net_u16 (type);
+ }
+
+ return n_bytes;
+}
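+
+/* Rewrite layout example (illustrative): for a dot1q subinterface with
+   outer VLAN 100 carrying IP4, the rewrite built above is 18 bytes:
+   dst MAC (6) | src MAC (6) | 0x8100 | 0x0064 | 0x0800. */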
+
+VNET_HW_INTERFACE_CLASS (ethernet_hw_interface_class) = {
+ .name = "Ethernet",
+ .format_address = format_ethernet_address,
+ .format_header = format_ethernet_header_with_length,
+ .unformat_hw_address = unformat_ethernet_address,
+ .unformat_header = unformat_ethernet_header,
+ .set_rewrite = ethernet_set_rewrite,
+};
+
+uword unformat_ethernet_interface (unformat_input_t * input, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ u32 * result = va_arg (*args, u32 *);
+ u32 hw_if_index;
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_interface_t * eif;
+
+ if (! unformat_user (input, unformat_vnet_hw_interface, vnm, &hw_if_index))
+ return 0;
+
+ eif = ethernet_get_interface (em, hw_if_index);
+ if (eif)
+ {
+ *result = hw_if_index;
+ return 1;
+ }
+ return 0;
+}
+
+clib_error_t *
+ethernet_register_interface (vnet_main_t * vnm,
+ u32 dev_class_index,
+ u32 dev_instance,
+ u8 * address,
+ u32 * hw_if_index_return,
+ ethernet_flag_change_function_t flag_change)
+{
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_interface_t * ei;
+ vnet_hw_interface_t * hi;
+ clib_error_t * error = 0;
+ u32 hw_if_index;
+
+ pool_get (em->interfaces, ei);
+ ei->flag_change = flag_change;
+
+ hw_if_index = vnet_register_interface
+ (vnm,
+ dev_class_index, dev_instance,
+ ethernet_hw_interface_class.index,
+ ei - em->interfaces);
+ *hw_if_index_return = hw_if_index;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ ethernet_setup_node (vnm->vlib_main, hi->output_node_index);
+
+ hi->min_packet_bytes = ETHERNET_MIN_PACKET_BYTES;
+ hi->max_packet_bytes = ETHERNET_MAX_PACKET_BYTES;
+ hi->per_packet_overhead_bytes =
+ /* preamble */ 8 + /* inter frame gap */ 12;
+
+ /* Standard default ethernet MTU. */
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = 1500;
+
+ memcpy (ei->address, address, sizeof (ei->address));
+ vec_free (hi->hw_address);
+ vec_add (hi->hw_address, address, sizeof (ei->address));
+
+ if (error)
+ {
+ pool_put (em->interfaces, ei);
+ return error;
+ }
+ return error;
+}
+
+void
+ethernet_delete_interface (vnet_main_t * vnm, u32 hw_if_index)
+{
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_interface_t * ei;
+ vnet_hw_interface_t * hi;
+ main_intf_t * main_intf;
+ vlan_table_t * vlan_table;
+ u32 idx;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ ei = pool_elt_at_index (em->interfaces, hi->hw_instance);
+
+ /* Delete vlan mapping table for dot1q and dot1ad. */
+ main_intf = vec_elt_at_index (em->main_intfs, hi->hw_if_index);
+ if (main_intf->dot1q_vlans) {
+ vlan_table = vec_elt_at_index (em->vlan_pool, main_intf->dot1q_vlans);
+ for (idx=0; idx<ETHERNET_N_VLAN; idx++ ) {
+ if (vlan_table->vlans[idx].qinqs) {
+ pool_put_index(em->qinq_pool, vlan_table->vlans[idx].qinqs);
+ }
+ }
+ pool_put_index(em->vlan_pool, main_intf->dot1q_vlans);
+ }
+ if (main_intf->dot1ad_vlans) {
+ vlan_table = vec_elt_at_index (em->vlan_pool, main_intf->dot1ad_vlans);
+ for (idx=0; idx<ETHERNET_N_VLAN; idx++ ) {
+ if (vlan_table->vlans[idx].qinqs) {
+ pool_put_index(em->qinq_pool, vlan_table->vlans[idx].qinqs);
+ }
+ }
+ pool_put_index(em->vlan_pool, main_intf->dot1ad_vlans);
+ }
+
+ vnet_delete_hw_interface (vnm, hw_if_index);
+ pool_put (em->interfaces, ei);
+}
+
+u32
+ethernet_set_flags (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ ethernet_main_t * em = &ethernet_main;
+ vnet_hw_interface_t * hi;
+ ethernet_interface_t * ei;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ ASSERT (hi->hw_class_index == ethernet_hw_interface_class.index);
+
+ ei = pool_elt_at_index (em->interfaces, hi->hw_instance);
+ if (ei->flag_change)
+ return ei->flag_change (vnm, hi, flags);
+ return (u32)~0;
+}
+
+#define VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT VNET_INTERFACE_TX_N_NEXT
+
+/* Echo packets back to ethernet input. */
+static uword
+simulated_ethernet_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, n_left_to_next, n_copy, * from, * to_next;
+ u32 next_index = VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT;
+ u32 i;
+ vlib_buffer_t * b;
+
+ n_left_from = frame->n_vectors;
+ from = vlib_frame_args (frame);
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ n_copy = clib_min (n_left_from, n_left_to_next);
+
+ memcpy (to_next, from, n_copy * sizeof (from[0]));
+ n_left_to_next -= n_copy;
+ n_left_from -= n_copy;
+ for (i = 0; i < n_copy; i++)
+ {
+ b = vlib_get_buffer (vm, from[i]);
+ /* Set up RX and TX indices as if received from a real driver */
+ vnet_buffer (b)->sw_if_index[VLIB_RX] =
+ vnet_buffer (b)->sw_if_index[VLIB_TX];
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32) ~0;
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+  /* n_left_from is zero here; report the full vector count as processed. */
+  return frame->n_vectors;
+}
+
+static u8 * format_simulated_ethernet_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "loop%d", dev_instance);
+}
+
+VNET_DEVICE_CLASS (ethernet_simulated_device_class) = {
+ .name = "Loopback",
+ .format_device_name = format_simulated_ethernet_name,
+ .tx_function = simulated_ethernet_interface_tx,
+};
+
+int vnet_create_loopback_interface (u32 * sw_if_indexp, u8 *mac_address)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_main_t * vm = vlib_get_main();
+ clib_error_t * error;
+ static u32 instance;
+ u8 address[6];
+ u32 hw_if_index;
+ vnet_hw_interface_t * hw_if;
+ u32 slot;
+ int rv = 0;
+
+ ASSERT(sw_if_indexp);
+
+ *sw_if_indexp = (u32)~0;
+
+ memset (address, 0, sizeof (address));
+
+  /*
+   * A default MAC address (de:ad:00:00:00:00 + instance) is allocated
+   * if a zero mac_address is passed in. Otherwise, the user-supplied
+   * MAC address is programmed on the loopback interface.
+   */
+ if (memcmp (address, mac_address, sizeof (address)))
+ memcpy (address, mac_address, sizeof (address));
+ else
+ {
+ address[0] = 0xde;
+ address[1] = 0xad;
+ address[5] = instance;
+ }
+
+ error = ethernet_register_interface
+ (vnm,
+ ethernet_simulated_device_class.index,
+ instance++,
+ address,
+ &hw_if_index,
+ /* flag change */ 0);
+
+ if (error)
+ {
+ rv = VNET_API_ERROR_INVALID_REGISTRATION;
+ clib_error_report(error);
+ return rv;
+ }
+
+ hw_if = vnet_get_hw_interface (vnm, hw_if_index);
+ slot = vlib_node_add_named_next_with_slot
+ (vm, hw_if->tx_node_index,
+ "ethernet-input",
+ VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT);
+ ASSERT (slot == VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT);
+
+ {
+ vnet_sw_interface_t * si = vnet_get_hw_sw_interface (vnm, hw_if_index);
+ *sw_if_indexp = si->sw_if_index;
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+create_simulated_ethernet_interfaces (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int rv;
+ u32 sw_if_index;
+ u8 mac_address[6];
+
+ memset (mac_address, 0, sizeof (mac_address));
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "mac %U", unformat_ethernet_address, mac_address))
+ ;
+ else
+ break;
+ }
+
+ rv = vnet_create_loopback_interface (&sw_if_index, mac_address);
+
+ if (rv)
+ return clib_error_return (0, "vnet_create_loopback_interface failed");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_simulated_ethernet_interface_command, static) = {
+ .path = "loopback create-interface",
+ .short_help = "Create Loopback ethernet interface [mac <mac-addr>]",
+ .function = create_simulated_ethernet_interfaces,
+};
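+
+/* CLI usage (illustrative):
+ *   loopback create-interface mac de:ad:00:00:00:01
+ * creates "loop0" with the given MAC; omit "mac" to get the
+ * de:ad:00:00:00:<instance> default assigned above. */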
+
+ethernet_interface_t *
+ethernet_get_interface (ethernet_main_t * em, u32 hw_if_index)
+{
+ vnet_hw_interface_t * i = vnet_get_hw_interface (vnet_get_main(), hw_if_index);
+ return (i->hw_class_index == ethernet_hw_interface_class.index
+ ? pool_elt_at_index (em->interfaces, i->hw_instance)
+ : 0);
+}
+
+int vnet_delete_loopback_interface (u32 sw_if_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_sw_interface_t * si;
+
+ if (pool_is_free_index (vnm->interface_main.sw_interfaces,
+ sw_if_index))
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ si = vnet_get_sw_interface (vnm, sw_if_index);
+ ethernet_delete_interface (vnm, si->hw_if_index);
+
+ return 0;
+}
+
+static clib_error_t *
+delete_simulated_ethernet_interfaces (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int rv;
+ u32 sw_if_index = ~0;
+ vnet_main_t * vnm = vnet_get_main();
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ ;
+ else
+ break;
+ }
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "interface not specified");
+
+ rv = vnet_delete_loopback_interface (sw_if_index);
+
+ if (rv)
+ return clib_error_return (0, "vnet_delete_loopback_interface failed");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (delete_simulated_ethernet_interface_command, static) = {
+ .path = "loopback delete-interface",
+ .short_help = "Delete Loopback ethernet interface intfc <interface>",
+ .function = delete_simulated_ethernet_interfaces,
+};
diff --git a/vnet/vnet/ethernet/mac_swap.c b/vnet/vnet/ethernet/mac_swap.c
new file mode 100644
index 00000000000..6bb7f5b0b02
--- /dev/null
+++ b/vnet/vnet/ethernet/mac_swap.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/error.h>
+#include <vnet/devices/pci/ige.h>
+#include <vnet/devices/pci/ixge.h>
+#include <vnet/devices/pci/ixgev.h>
+
+typedef struct {
+ u32 cached_next_index;
+ u32 cached_sw_if_index;
+
+ /* Hash table to map sw_if_index to next node index */
+ uword * next_node_index_by_sw_if_index;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} mac_swap_main_t;
+
+typedef struct {
+ u8 src[6];
+ u8 dst[6];
+ u32 sw_if_index;
+ u32 next_index;
+} swap_trace_t;
+
+/* packet trace format function */
+static u8 * format_swap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ swap_trace_t * t = va_arg (*args, swap_trace_t *);
+
+ s = format (s, "SWAP: dst now %U src now %U sw_if_index %d next_index %d",
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src,
+ t->sw_if_index,
+ t->next_index);
+ return s;
+}
+
+#define foreach_hw_driver_next \
+ _(IP4) \
+ _(IP6) \
+ _(ETHERNET)
+
+mac_swap_main_t mac_swap_main;
+
+static vlib_node_registration_t mac_swap_node;
+
+#define foreach_mac_swap_error \
+_(SWAPS, "mac addresses swapped")
+
+typedef enum {
+#define _(sym,str) MAC_SWAP_ERROR_##sym,
+ foreach_mac_swap_error
+#undef _
+ MAC_SWAP_N_ERROR,
+} mac_swap_error_t;
+
+static char * mac_swap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_mac_swap_error
+#undef _
+};
+
+/*
+ * To drop a pkt and increment one of the above counters:
+ *
+ * set b0->error = error_node->errors[MAC_SWAP_ERROR_SWAPS];
+ * set next0 to a disposition index bound to "error-drop".
+ *
+ * To manually increment the specific counter MAC_SWAP_ERROR_SWAPS:
+ *
+ * vlib_node_t *n = vlib_get_node (vm, mac_swap_node.index);
+ * u32 node_counter_base_index = n->error_heap_index;
+ * vlib_error_main_t * em = &vm->error_main;
+ * em->counters[node_counter_base_index + MAC_SWAP_ERROR_SWAPS] += 1;
+ */
+
+typedef enum {
+ MAC_SWAP_NEXT_DROP,
+ MAC_SWAP_N_NEXT,
+} mac_swap_next_t;
+
+static uword
+mac_swap_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ mac_swap_next_t next_index;
+ mac_swap_main_t * msm = &mac_swap_main;
+ vlib_node_t *n = vlib_get_node (vm, mac_swap_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ uword * p0, * p1;
+ u64 tmp0a, tmp0b;
+ u64 tmp1a, tmp1b;
+ ethernet_header_t * h0, *h1;
+
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = msm->cached_next_index;
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ next1 = msm->cached_next_index;
+
+ if (PREDICT_FALSE (msm->cached_sw_if_index != sw_if_index0))
+ {
+ p0 = hash_get (msm->next_node_index_by_sw_if_index, sw_if_index0);
+ if (p0 == 0)
+ {
+ vnet_hw_interface_t *hw0;
+
+ hw0 = vnet_get_sup_hw_interface (msm->vnet_main,
+ sw_if_index0);
+
+ next0 = vlib_node_add_next (msm->vlib_main,
+ mac_swap_node.index,
+ hw0->output_node_index);
+ hash_set (msm->next_node_index_by_sw_if_index,
+ sw_if_index0, next0);
+ }
+ else
+ next0 = p0[0];
+ msm->cached_sw_if_index = sw_if_index0;
+ msm->cached_next_index = next0;
+ next1 = next0;
+ }
+ if (PREDICT_FALSE (msm->cached_sw_if_index != sw_if_index1))
+ {
+ p1 = hash_get (msm->next_node_index_by_sw_if_index, sw_if_index1);
+ if (p1 == 0)
+ {
+ vnet_hw_interface_t *hw1;
+
+ hw1 = vnet_get_sup_hw_interface (msm->vnet_main,
+ sw_if_index1);
+
+ next1 = vlib_node_add_next (msm->vlib_main,
+ mac_swap_node.index,
+ hw1->output_node_index);
+ hash_set (msm->next_node_index_by_sw_if_index,
+ sw_if_index1, next1);
+ }
+ else
+ next1 = p1[0];
+ msm->cached_sw_if_index = sw_if_index1;
+ msm->cached_next_index = next1;
+ }
+
+ em->counters[node_counter_base_index + MAC_SWAP_ERROR_SWAPS] += 2;
+
+ /* reset buffer so we always point at the MAC hdr */
+ vlib_buffer_reset (b0);
+ vlib_buffer_reset (b1);
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ /* Swap 2 x src and dst mac addresses using 8-byte load/stores */
+ tmp0a = clib_net_to_host_u64(((u64 *)(h0->dst_address))[0]);
+ tmp1a = clib_net_to_host_u64(((u64 *)(h1->dst_address))[0]);
+ tmp0b = clib_net_to_host_u64(((u64 *)(h0->src_address))[0]);
+ tmp1b = clib_net_to_host_u64(((u64 *)(h1->src_address))[0]);
+ ((u64 *)(h0->dst_address))[0] = clib_host_to_net_u64(tmp0b);
+ ((u64 *)(h1->dst_address))[0] = clib_host_to_net_u64(tmp1b);
+ /* Move the ethertype from "b" to "a" */
+ tmp0a &= ~(0xFFFF);
+ tmp1a &= ~(0xFFFF);
+ tmp0a |= tmp0b & 0xFFFF;
+ ((u64 *)(h0->src_address))[0] = clib_host_to_net_u64(tmp0a);
+ tmp1a |= tmp1b & 0xFFFF;
+ ((u64 *)(h1->src_address))[0] = clib_host_to_net_u64(tmp1a);
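+	  /* Each 8-byte access spans the 6-byte MAC plus the 2 bytes that
+	     follow it, so the store to dst_address briefly clobbers the
+	     start of src_address; splicing the ethertype (low 16 bits of
+	     "b") into "a" above leaves the ethertype intact after the
+	     final store. */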
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ swap_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ memcpy (t->src, h0->src_address, 6);
+ memcpy (t->dst, h0->dst_address, 6);
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ swap_trace_t *t = vlib_add_trace (vm, node, b1, sizeof (*t));
+ memcpy (t->src, h1->src_address, 6);
+ memcpy (t->dst, h1->dst_address, 6);
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ uword * p0;
+ u64 tmp0a, tmp0b;
+ ethernet_header_t * h0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = msm->cached_next_index;
+
+ if (PREDICT_FALSE (msm->cached_sw_if_index != sw_if_index0))
+ {
+ p0 = hash_get (msm->next_node_index_by_sw_if_index, sw_if_index0);
+ if (p0 == 0)
+ {
+ vnet_hw_interface_t *hw0;
+
+ hw0 = vnet_get_sup_hw_interface (msm->vnet_main,
+ sw_if_index0);
+
+ next0 = vlib_node_add_next (msm->vlib_main,
+ mac_swap_node.index,
+ hw0->output_node_index);
+ hash_set (msm->next_node_index_by_sw_if_index,
+ sw_if_index0, next0);
+ }
+ else
+ next0 = p0[0];
+ msm->cached_sw_if_index = sw_if_index0;
+ msm->cached_next_index = next0;
+ }
+
+ em->counters[node_counter_base_index + MAC_SWAP_ERROR_SWAPS] += 1;
+
+ /* reset buffer so we always point at the MAC hdr */
+ vlib_buffer_reset (b0);
+ h0 = vlib_buffer_get_current (b0);
+
+ /* Exchange src and dst, preserve the ethertype */
+ tmp0a = clib_net_to_host_u64(((u64 *)(h0->dst_address))[0]);
+ tmp0b = clib_net_to_host_u64(((u64 *)(h0->src_address))[0]);
+ ((u64 *)(h0->dst_address))[0] = clib_host_to_net_u64(tmp0b);
+ tmp0a &= ~(0xFFFF);
+ tmp0a |= tmp0b & 0xFFFF;
+ ((u64 *)(h0->src_address))[0] = clib_host_to_net_u64(tmp0a);
+
+ /* ship it */
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ swap_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ memcpy (t->src, h0->src_address, 6);
+ memcpy (t->dst, h0->dst_address, 6);
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (mac_swap_node,static) = {
+ .function = mac_swap_node_fn,
+ .name = "mac-swap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_swap_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(mac_swap_error_strings),
+ .error_strings = mac_swap_error_strings,
+
+ .n_next_nodes = MAC_SWAP_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [MAC_SWAP_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *mac_swap_init (vlib_main_t *vm)
+{
+ mac_swap_main_t * msm = &mac_swap_main;
+
+ msm->next_node_index_by_sw_if_index = hash_create (0, sizeof (uword));
+ msm->cached_next_index = (u32)~0;
+ msm->cached_sw_if_index = (u32)~0;
+ msm->vlib_main = vm;
+ msm->vnet_main = vnet_get_main();
+
+ /* Driver RX nodes send pkts here... */
+#define _(a) ixge_set_next_node (IXGE_RX_NEXT_##a##_INPUT, "mac-swap");
+ foreach_hw_driver_next
+#undef _
+#define _(a) ixgev_set_next_node (IXGEV_RX_NEXT_##a##_INPUT, "mac-swap");
+ foreach_hw_driver_next
+#undef _
+#define _(a) ige_set_next_node (IGE_RX_NEXT_##a##_INPUT, "mac-swap");
+ foreach_hw_driver_next
+#undef _
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (mac_swap_init);
+
diff --git a/vnet/vnet/ethernet/node.c b/vnet/vnet/ethernet/node.c
new file mode 100644
index 00000000000..9c943992b68
--- /dev/null
+++ b/vnet/vnet/ethernet/node.c
@@ -0,0 +1,1112 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet_node.c: ethernet packet processing
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/sparse_vec.h>
+#include <vnet/l2/l2_bvi.h>
+
+
+#define foreach_ethernet_input_next \
+ _ (PUNT, "error-punt") \
+ _ (DROP, "error-drop") \
+ _ (LLC, "llc-input")
+
+typedef enum {
+#define _(s,n) ETHERNET_INPUT_NEXT_##s,
+ foreach_ethernet_input_next
+#undef _
+ ETHERNET_INPUT_N_NEXT,
+} ethernet_input_next_t;
+
+typedef struct {
+ u8 packet_data[32];
+} ethernet_input_trace_t;
+
+static u8 * format_ethernet_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ethernet_input_trace_t * t = va_arg (*va, ethernet_input_trace_t *);
+
+ s = format (s, "%U", format_ethernet_header, t->packet_data);
+
+ return s;
+}
+
+vlib_node_registration_t ethernet_input_node;
+
+typedef enum {
+ ETHERNET_INPUT_VARIANT_ETHERNET,
+ ETHERNET_INPUT_VARIANT_ETHERNET_TYPE,
+ ETHERNET_INPUT_VARIANT_VLAN,
+ ETHERNET_INPUT_VARIANT_NOT_L2,
+} ethernet_input_variant_t;
+
+
+// Compare two ethernet macs. Return 1 if they are the same, 0 if different
+static_always_inline u32
+eth_mac_equal (u8 * mac1, u8 * mac2) {
+ return (*((u32 *)(mac1+0)) == *((u32 *)(mac2+0)) &&
+ *((u32 *)(mac1+2)) == *((u32 *)(mac2+2)));
+}
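+
+// Note: the two overlapping 4-byte compares (bytes 0-3 and bytes 2-5)
+// cover all 6 bytes of the MAC without needing a third 2-byte compare.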
+
+
+// Parse the ethernet header to extract vlan tags and innermost ethertype
+static_always_inline void
+parse_header (ethernet_input_variant_t variant,
+ vlib_buffer_t * b0,
+ u16 * type,
+ u16 * orig_type,
+ u16 * outer_id,
+ u16 * inner_id,
+ u32 * match_flags) {
+
+ if (variant == ETHERNET_INPUT_VARIANT_ETHERNET
+ || variant == ETHERNET_INPUT_VARIANT_NOT_L2) {
+ ethernet_header_t * e0;
+
+ e0 = (void *) (b0->data + b0->current_data);
+
+ vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data;
+
+ vlib_buffer_advance (b0, sizeof (e0[0]));
+
+ *type = clib_net_to_host_u16(e0->type);
+ } else if (variant == ETHERNET_INPUT_VARIANT_ETHERNET_TYPE) {
+ // here when prior node was LLC/SNAP processing
+ u16 * e0;
+
+ e0 = (void *) (b0->data + b0->current_data);
+
+ vlib_buffer_advance (b0, sizeof (e0[0]));
+
+ *type = clib_net_to_host_u16(e0[0]);
+ }
+
+ // save for distinguishing between dot1q and dot1ad later
+ *orig_type = *type;
+
+ // default the tags to 0 (used if there is no corresponding tag)
+ *outer_id = 0;
+ *inner_id = 0;
+
+ *match_flags = SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_0_TAG;
+
+ // check for vlan encaps
+ if ((*type == ETHERNET_TYPE_VLAN) ||
+ (*type == ETHERNET_TYPE_DOT1AD) ||
+ (*type == ETHERNET_TYPE_VLAN_9100) ||
+ (*type == ETHERNET_TYPE_VLAN_9200))
+ {
+ ethernet_vlan_header_t * h0;
+ u16 tag;
+
+ *match_flags = SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_1_TAG;
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ tag = clib_net_to_host_u16 (h0->priority_cfi_and_id);
+
+ *outer_id = tag & 0xfff;
+
+ *type = clib_net_to_host_u16(h0->type);
+
+ vlib_buffer_advance (b0, sizeof (h0[0]));
+
+ if (*type == ETHERNET_TYPE_VLAN) {
+ // Double tagged packet
+ *match_flags = SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_2_TAG;
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ tag = clib_net_to_host_u16 (h0->priority_cfi_and_id);
+
+ *inner_id = tag & 0xfff;
+
+ *type = clib_net_to_host_u16(h0->type);
+
+ vlib_buffer_advance (b0, sizeof (h0[0]));
+
+ if (*type == ETHERNET_TYPE_VLAN) {
+ // More than double tagged packet
+ *match_flags = SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_3_TAG;
+ }
+ }
+ }
+}
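+
+// Example: a dot1ad double-tagged IP4 packet (outer 0x88a8, inner
+// 0x8100) leaves type = 0x0800 (IP4), orig_type = 0x88a8, outer_id and
+// inner_id set to the two VLAN IDs, and match_flags =
+// SUBINT_CONFIG_VALID | SUBINT_CONFIG_MATCH_2_TAG.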
+
+// Determine the subinterface for this packet, given the result of the
+// vlan table lookups and vlan header parsing. Check the most specific
+// matches first.
+static_always_inline void
+identify_subint (vnet_hw_interface_t * hi,
+ vlib_buffer_t * b0,
+ u32 match_flags,
+ main_intf_t * main_intf,
+ vlan_intf_t * vlan_intf,
+ qinq_intf_t * qinq_intf,
+ u32 * new_sw_if_index,
+ u8 * error0,
+ u32 * is_l2)
+{
+ u32 matched;
+
+ matched = eth_identify_subint (hi, b0, match_flags,
+ main_intf, vlan_intf, qinq_intf,
+ new_sw_if_index, error0, is_l2);
+
+ if (matched) {
+
+ // Perform L3 my-mac filter
+    // A unicast packet arriving on an L3 interface must have a dmac matching the interface mac.
+    // This check matters when the interface is in promiscuous mode:
+    // without it we would forward packets that are not addressed to us.
+ if (!(*is_l2)) {
+ ethernet_header_t * e0;
+ e0 = (void *) (b0->data + vnet_buffer (b0)->ethernet.start_of_ethernet_header);
+
+ if (!(ethernet_address_cast(e0->dst_address))) {
+ if (!eth_mac_equal((u8 *)e0, hi->hw_address)) {
+ *error0 = ETHERNET_ERROR_L3_MAC_MISMATCH;
+ }
+ }
+ }
+
+ // Check for down subinterface
+ *error0 = (*new_sw_if_index) != ~0 ? (*error0) : ETHERNET_ERROR_DOWN;
+ }
+}
+
+static_always_inline void
+determine_next_node (ethernet_main_t * em,
+ ethernet_input_variant_t variant,
+ u32 is_l20,
+ u32 type0,
+ vlib_buffer_t * b0,
+ u8 * error0,
+ u8 * next0)
+{
+ if (PREDICT_FALSE (*error0 != ETHERNET_ERROR_NONE)) {
+ // some error occurred
+ *next0 = ETHERNET_INPUT_NEXT_DROP;
+ } else if (is_l20) {
+ *next0 = em->l2_next;
+ // record the L2 len and reset the buffer so the L2 header is preserved
+ vnet_buffer(b0)->l2.l2_len = b0->current_data;
+ vlib_buffer_advance (b0, -(b0->current_data));
+
+ // check for common IP/MPLS ethertypes
+ } else if (type0 == ETHERNET_TYPE_IP4) {
+ *next0 = em->l3_next.input_next_ip4;
+ } else if (type0 == ETHERNET_TYPE_IP6) {
+ *next0 = em->l3_next.input_next_ip6;
+ } else if (type0 == ETHERNET_TYPE_MPLS_UNICAST) {
+ *next0 = em->l3_next.input_next_mpls;
+
+ } else if (em->redirect_l3) {
+ // L3 Redirect is on, the cached common next nodes will be
+ // pointing to the redirect node, catch the uncommon types here
+ *next0 = em->redirect_l3_next;
+ } else {
+ // uncommon ethertype, check table
+ u32 i0;
+ i0 = sparse_vec_index (em->l3_next.input_next_by_type, type0);
+ *next0 = vec_elt (em->l3_next.input_next_by_type, i0);
+ *error0 = i0 == SPARSE_VEC_INVALID_INDEX ? ETHERNET_ERROR_UNKNOWN_TYPE : *error0;
+
+ // The table is not populated with LLC values, so check that now.
+    // Redirect to llc-input only for the plain ethernet variant; the
+    // ethernet-type variant came from LLC processing, so don't send it
+    // back there and instead keep the drop/bad table result.
+ if ((type0 < 0x600) && (variant == ETHERNET_INPUT_VARIANT_ETHERNET)) {
+ *next0 = ETHERNET_INPUT_NEXT_LLC;
+ }
+ }
+}
+
+static_always_inline uword
+ethernet_input_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame,
+ ethernet_input_variant_t variant)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ethernet_main_t * em = &ethernet_main;
+ vlib_node_runtime_t * error_node;
+ u32 n_left_from, next_index, * from, * to_next;
+ u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
+ u32 cpu_index = os_get_cpu_number();
+
+ if (variant != ETHERNET_INPUT_VARIANT_ETHERNET)
+ error_node = vlib_node_get_runtime (vm, ethernet_input_node.index);
+ else
+ error_node = node;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (ethernet_input_trace_t));
+
+ next_index = node->cached_next_index;
+ stats_sw_if_index = node->runtime_data[0];
+ stats_n_packets = stats_n_bytes = 0;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u8 next0, next1, error0, error1;
+ u16 type0, orig_type0, type1, orig_type1;
+ u16 outer_id0, inner_id0, outer_id1, inner_id1;
+ u32 match_flags0, match_flags1;
+ u32 old_sw_if_index0, new_sw_if_index0, len0, old_sw_if_index1, new_sw_if_index1, len1;
+ vnet_hw_interface_t * hi0, * hi1;
+ main_intf_t * main_intf0, * main_intf1;
+ vlan_intf_t * vlan_intf0, * vlan_intf1;
+ qinq_intf_t * qinq_intf0, * qinq_intf1;
+ u32 is_l20, is_l21;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, STORE);
+ vlib_prefetch_buffer_header (b3, STORE);
+
+ CLIB_PREFETCH (b2->data, sizeof (ethernet_header_t), LOAD);
+ CLIB_PREFETCH (b3->data, sizeof (ethernet_header_t), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ error0 = error1 = ETHERNET_ERROR_NONE;
+
+ parse_header (variant,
+ b0,
+ &type0,
+ &orig_type0,
+ &outer_id0,
+ &inner_id0,
+ &match_flags0);
+
+ parse_header (variant,
+ b1,
+ &type1,
+ &orig_type1,
+ &outer_id1,
+ &inner_id1,
+ &match_flags1);
+
+ old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ old_sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+
+ eth_vlan_table_lookups (em,
+ vnm,
+ old_sw_if_index0,
+ orig_type0,
+ outer_id0,
+ inner_id0,
+ &hi0,
+ &main_intf0,
+ &vlan_intf0,
+ &qinq_intf0);
+
+ eth_vlan_table_lookups (em,
+ vnm,
+ old_sw_if_index1,
+ orig_type1,
+ outer_id1,
+ inner_id1,
+ &hi1,
+ &main_intf1,
+ &vlan_intf1,
+ &qinq_intf1);
+
+ identify_subint (hi0,
+ b0,
+ match_flags0,
+ main_intf0,
+ vlan_intf0,
+ qinq_intf0,
+ &new_sw_if_index0,
+ &error0,
+ &is_l20);
+
+ identify_subint (hi1,
+ b1,
+ match_flags1,
+ main_intf1,
+ vlan_intf1,
+ qinq_intf1,
+ &new_sw_if_index1,
+ &error1,
+ &is_l21);
+
+ // Save RX sw_if_index for later nodes
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = error0 != ETHERNET_ERROR_NONE ? old_sw_if_index0 : new_sw_if_index0;
+ vnet_buffer (b1)->sw_if_index[VLIB_RX] = error1 != ETHERNET_ERROR_NONE ? old_sw_if_index1 : new_sw_if_index1;
+
+ // Check if there is a stat to take (valid and non-main sw_if_index for pkt 0 or pkt 1)
+ if (((new_sw_if_index0 != ~0) && (new_sw_if_index0 != old_sw_if_index0)) ||
+ ((new_sw_if_index1 != ~0) && (new_sw_if_index1 != old_sw_if_index1))) {
+
+ len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
+ - vnet_buffer (b0)->ethernet.start_of_ethernet_header;
+ len1 = vlib_buffer_length_in_chain (vm, b1) + b1->current_data
+ - vnet_buffer (b1)->ethernet.start_of_ethernet_header;
+
+ stats_n_packets += 2;
+ stats_n_bytes += len0 + len1;
+
+ if (PREDICT_FALSE (! (new_sw_if_index0 == stats_sw_if_index && new_sw_if_index1 == stats_sw_if_index)))
+ {
+ stats_n_packets -= 2;
+ stats_n_bytes -= len0 + len1;
+
+ if (new_sw_if_index0 != old_sw_if_index0 && new_sw_if_index0 != ~0)
+ vlib_increment_combined_counter
+ (vnm->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ new_sw_if_index0,
+ 1,
+ len0);
+ if (new_sw_if_index1 != old_sw_if_index1 && new_sw_if_index1 != ~0)
+ vlib_increment_combined_counter
+ (vnm->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ new_sw_if_index1,
+ 1,
+ len1);
+
+ if (new_sw_if_index0 == new_sw_if_index1)
+ {
+ if (stats_n_packets > 0)
+ {
+ vlib_increment_combined_counter
+ (vnm->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ stats_sw_if_index,
+ stats_n_packets,
+ stats_n_bytes);
+ stats_n_packets = stats_n_bytes = 0;
+ }
+ stats_sw_if_index = new_sw_if_index0;
+ }
+ }
+ }
+
+ if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
+ is_l20 = is_l21 = 0;
+
+ determine_next_node(em, variant, is_l20, type0, b0, &error0, &next0);
+ determine_next_node(em, variant, is_l21, type1, b1, &error1, &next1);
+
+ b0->error = error_node->errors[error0];
+ b1->error = error_node->errors[error1];
+
+ // verify speculative enqueue
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u8 error0, next0;
+ u16 type0, orig_type0;
+ u16 outer_id0, inner_id0;
+ u32 match_flags0;
+ u32 old_sw_if_index0, new_sw_if_index0, len0;
+ vnet_hw_interface_t * hi0;
+ main_intf_t * main_intf0;
+ vlan_intf_t * vlan_intf0;
+ qinq_intf_t * qinq_intf0;
+ u32 is_l20;
+
+ // Prefetch next iteration
+ if (n_left_from > 1) {
+ vlib_buffer_t * p2;
+
+ p2 = vlib_get_buffer (vm, from[1]);
+ vlib_prefetch_buffer_header (p2, STORE);
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ error0 = ETHERNET_ERROR_NONE;
+
+ parse_header (variant,
+ b0,
+ &type0,
+ &orig_type0,
+ &outer_id0,
+ &inner_id0,
+ &match_flags0);
+
+ old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ eth_vlan_table_lookups (em,
+ vnm,
+ old_sw_if_index0,
+ orig_type0,
+ outer_id0,
+ inner_id0,
+ &hi0,
+ &main_intf0,
+ &vlan_intf0,
+ &qinq_intf0);
+
+ identify_subint (hi0,
+ b0,
+ match_flags0,
+ main_intf0,
+ vlan_intf0,
+ qinq_intf0,
+ &new_sw_if_index0,
+ &error0,
+ &is_l20);
+
+ // Save RX sw_if_index for later nodes
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = error0 != ETHERNET_ERROR_NONE ? old_sw_if_index0 : new_sw_if_index0;
+
+ // Increment subinterface stats
+ // Note that interface-level counters have already been incremented
+ // prior to calling this function. Thus only subinterface counters
+ // are incremented here.
+ //
+ // Interface level counters include packets received on the main
+ // interface and all subinterfaces. Subinterface level counters
+ // include only those packets received on that subinterface.
+ // Increment stats if the subint is valid and it is not the main intf.
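+ //
+ // Worked example (illustrative): for packets arriving on subints
+ // 5, 5, 5, 7, the three subint-5 packets are folded into one
+ // combined counter update (3 packets, summed bytes), flushed when
+ // the subint-7 packet changes stats_sw_if_index.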
+ if ((new_sw_if_index0 != ~0) && (new_sw_if_index0 != old_sw_if_index0)) {
+
+ len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
+ - vnet_buffer (b0)->ethernet.start_of_ethernet_header;
+
+ stats_n_packets += 1;
+ stats_n_bytes += len0;
+
+ // Batch stat increments from the same subinterface so counters
+ // don't need to be incremented for every packet.
+ if (PREDICT_FALSE (new_sw_if_index0 != stats_sw_if_index)) {
+ stats_n_packets -= 1;
+ stats_n_bytes -= len0;
+
+ if (new_sw_if_index0 != ~0)
+ vlib_increment_combined_counter
+ (vnm->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ new_sw_if_index0,
+ 1,
+ len0);
+ if (stats_n_packets > 0) {
+ vlib_increment_combined_counter
+ (vnm->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ stats_sw_if_index,
+ stats_n_packets,
+ stats_n_bytes);
+ stats_n_packets = stats_n_bytes = 0;
+ }
+ stats_sw_if_index = new_sw_if_index0;
+ }
+ }
+
+ if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
+ is_l20 = 0;
+
+ determine_next_node(em, variant, is_l20, type0, b0, &error0, &next0);
+
+ b0->error = error_node->errors[error0];
+
+ // verify speculative enqueue
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ // Increment any remaining batched stats
+ if (stats_n_packets > 0)
+ {
+ vlib_increment_combined_counter
+ (vnm->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ stats_sw_if_index,
+ stats_n_packets,
+ stats_n_bytes);
+ node->runtime_data[0] = stats_sw_if_index;
+ }
+
+ return from_frame->n_vectors;
+}
+
+static uword
+ethernet_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{ return ethernet_input_inline (vm, node, from_frame, ETHERNET_INPUT_VARIANT_ETHERNET); }
+
+static uword
+ethernet_input_type (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{ return ethernet_input_inline (vm, node, from_frame, ETHERNET_INPUT_VARIANT_ETHERNET_TYPE); }
+
+static uword
+ethernet_input_not_l2 (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{ return ethernet_input_inline (vm, node, from_frame, ETHERNET_INPUT_VARIANT_NOT_L2); }
+
+
+// Return the subinterface config struct for the given sw_if_index
+// Also return via parameter the appropriate match flags for the
+// configured number of tags.
+// On error (unsupported or not ethernet) return 0.
+static subint_config_t *
+ethernet_sw_interface_get_config (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 * flags,
+ u32 * unsupported) {
+ ethernet_main_t * em = &ethernet_main;
+ vnet_hw_interface_t * hi;
+ vnet_sw_interface_t * si;
+ main_intf_t * main_intf;
+ vlan_table_t * vlan_table;
+ qinq_table_t * qinq_table;
+ subint_config_t * subint = 0;
+
+ hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ if (!hi || (hi->hw_class_index != ethernet_hw_interface_class.index)) {
+ *unsupported = 0;
+ goto done; // non-ethernet interface
+ }
+
+ // ensure there's an entry for the main intf (shouldn't really be necessary)
+ vec_validate (em->main_intfs, hi->hw_if_index);
+ main_intf = vec_elt_at_index (em->main_intfs, hi->hw_if_index);
+
+ // Locate the subint for the given ethernet config
+ si = vnet_get_sw_interface (vnm, sw_if_index);
+
+ if (si->sub.eth.flags.default_sub) {
+ subint = &main_intf->default_subint;
+ *flags = SUBINT_CONFIG_MATCH_0_TAG |
+ SUBINT_CONFIG_MATCH_1_TAG |
+ SUBINT_CONFIG_MATCH_2_TAG |
+ SUBINT_CONFIG_MATCH_3_TAG;
+ } else if ((si->sub.eth.flags.no_tags) ||
+ (si->sub.eth.raw_flags == 0)) {
+ // if no flags are set then this is a main interface
+ // so treat as untagged
+ subint = &main_intf->untagged_subint;
+ *flags = SUBINT_CONFIG_MATCH_0_TAG;
+ } else {
+ // one or two tags
+ // first get the vlan table
+ if (si->sub.eth.flags.dot1ad) {
+ if (main_intf->dot1ad_vlans == 0) {
+ // Allocate a vlan table from the pool
+ pool_get(em->vlan_pool, vlan_table);
+ main_intf->dot1ad_vlans = vlan_table - em->vlan_pool;
+ } else {
+ // Get ptr to existing vlan table
+ vlan_table = vec_elt_at_index (em->vlan_pool, main_intf->dot1ad_vlans);
+ }
+ } else { // dot1q
+ if (main_intf->dot1q_vlans == 0) {
+ // Allocate a vlan table from the pool
+ pool_get(em->vlan_pool, vlan_table);
+ main_intf->dot1q_vlans = vlan_table - em->vlan_pool;
+ } else {
+ // Get ptr to existing vlan table
+ vlan_table = vec_elt_at_index (em->vlan_pool, main_intf->dot1q_vlans);
+ }
+ }
+
+ if (si->sub.eth.flags.one_tag) {
+ *flags = si->sub.eth.flags.exact_match ?
+ SUBINT_CONFIG_MATCH_1_TAG :
+ (SUBINT_CONFIG_MATCH_1_TAG |
+ SUBINT_CONFIG_MATCH_2_TAG |
+ SUBINT_CONFIG_MATCH_3_TAG);
+
+ if (si->sub.eth.flags.outer_vlan_id_any) {
+ // not implemented yet
+ *unsupported = 1;
+ goto done;
+ } else {
+ // a single vlan, a common case
+ subint = &vlan_table->vlans[si->sub.eth.outer_vlan_id].single_tag_subint;
+ }
+
+ } else {
+ // Two tags
+ *flags = si->sub.eth.flags.exact_match ?
+ SUBINT_CONFIG_MATCH_2_TAG :
+ (SUBINT_CONFIG_MATCH_2_TAG |
+ SUBINT_CONFIG_MATCH_3_TAG);
+
+ if (si->sub.eth.flags.outer_vlan_id_any && si->sub.eth.flags.inner_vlan_id_any) {
+ // not implemented yet
+ *unsupported = 1;
+ goto done;
+ }
+
+ if (si->sub.eth.flags.inner_vlan_id_any) {
+ // a specific outer and "any" inner
+ // don't need a qinq table for this
+ subint = &vlan_table->vlans[si->sub.eth.outer_vlan_id].inner_any_subint;
+ if (si->sub.eth.flags.exact_match) {
+ *flags = SUBINT_CONFIG_MATCH_2_TAG;
+ } else {
+ *flags = SUBINT_CONFIG_MATCH_2_TAG |
+ SUBINT_CONFIG_MATCH_3_TAG;
+ }
+ } else {
+ // a specific outer + specific inner vlan id, a common case
+
+ // get the qinq table
+ if (vlan_table->vlans[si->sub.eth.outer_vlan_id].qinqs == 0) {
+ // Allocate a qinq table from the pool
+ pool_get(em->qinq_pool, qinq_table);
+ vlan_table->vlans[si->sub.eth.outer_vlan_id].qinqs = qinq_table - em->qinq_pool;
+ } else {
+ // Get ptr to existing qinq table
+ qinq_table = vec_elt_at_index (em->qinq_pool, vlan_table->vlans[si->sub.eth.outer_vlan_id].qinqs);
+ }
+ subint = &qinq_table->vlans[si->sub.eth.inner_vlan_id].subint;
+ }
+ }
+ }
+
+ done:
+ return subint;
+}
+
+clib_error_t *
+ethernet_sw_interface_up_down (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 flags)
+{
+ subint_config_t * subint;
+ u32 dummy_flags;
+ u32 dummy_unsup;
+ clib_error_t * error = 0;
+
+ // Find the config for this subinterface
+ subint = ethernet_sw_interface_get_config (vnm, sw_if_index, &dummy_flags, &dummy_unsup);
+
+ if (subint == 0) {
+ // not implemented yet or not ethernet
+ goto done;
+ }
+
+ subint->sw_if_index =
+ ((flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? sw_if_index : ~0);
+
+ done:
+ return error;
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ethernet_sw_interface_up_down);
+
+
+// Set the L2/L3 mode for the subinterface
+void
+ethernet_sw_interface_set_l2_mode (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l2)
+{
+ subint_config_t *subint;
+ u32 dummy_flags;
+ u32 dummy_unsup;
+ int is_port;
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, sw_if_index);
+
+ is_port = !(sw->type == VNET_SW_INTERFACE_TYPE_SUB);
+
+ // Find the config for this subinterface
+ subint = ethernet_sw_interface_get_config (vnm, sw_if_index, &dummy_flags, &dummy_unsup);
+
+ if (subint == 0) {
+ // unimplemented or not ethernet
+ goto done;
+ }
+
+ // Double check that the config we found is for our interface (or the interface is down)
+ ASSERT ((subint->sw_if_index == sw_if_index) || (subint->sw_if_index == ~0));
+
+ if (l2) {
+ subint->flags |= SUBINT_CONFIG_L2;
+ if (is_port)
+ subint->flags |=
+ SUBINT_CONFIG_MATCH_0_TAG | SUBINT_CONFIG_MATCH_1_TAG
+ | SUBINT_CONFIG_MATCH_2_TAG | SUBINT_CONFIG_MATCH_3_TAG;
+ } else {
+ subint->flags &= ~SUBINT_CONFIG_L2;
+ if (is_port)
+ subint->flags &=
+ ~(SUBINT_CONFIG_MATCH_1_TAG | SUBINT_CONFIG_MATCH_2_TAG
+ | SUBINT_CONFIG_MATCH_3_TAG);
+ }
+
+ done:
+ return;
+}
+
+
+static clib_error_t *
+ethernet_sw_interface_add_del (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 is_create)
+{
+ clib_error_t * error = 0;
+ subint_config_t *subint;
+ u32 match_flags;
+ u32 unsupported;
+
+ // Find the config for this subinterface
+ subint = ethernet_sw_interface_get_config (vnm, sw_if_index, &match_flags, &unsupported);
+
+ if (subint == 0) {
+ // not implemented yet or not ethernet
+ if (unsupported) {
+ // this is the NYI case
+ error = clib_error_return (0, "not implemented yet");
+ }
+ goto done;
+ }
+
+ if (!is_create) {
+ subint->flags = 0;
+ return error;
+ }
+
+ // Initialize the subint
+ if (subint->flags & SUBINT_CONFIG_VALID) {
+ // Error: vlan already in use
+ error = clib_error_return (0, "vlan is already in use");
+ } else {
+ // Note that config is L3 by default
+ subint->flags = SUBINT_CONFIG_VALID | match_flags;
+ subint->sw_if_index = ~0; // because interfaces are initially down
+ }
+
+ done:
+ return error;
+}
+
+VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ethernet_sw_interface_add_del);
+
+static char * ethernet_error_strings[] = {
+#define ethernet_error(n,c,s) s,
+#include "error.def"
+#undef ethernet_error
+};
+
+VLIB_REGISTER_NODE (ethernet_input_node) = {
+ .function = ethernet_input,
+ .name = "ethernet-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = ETHERNET_N_ERROR,
+ .error_strings = ethernet_error_strings,
+
+ .n_next_nodes = ETHERNET_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ETHERNET_INPUT_NEXT_##s] = n,
+ foreach_ethernet_input_next
+#undef _
+ },
+
+ .format_buffer = format_ethernet_header_with_length,
+ .format_trace = format_ethernet_input_trace,
+ .unformat_buffer = unformat_ethernet_header,
+};
+
+VLIB_REGISTER_NODE (ethernet_input_type_node,static) = {
+ .function = ethernet_input_type,
+ .name = "ethernet-input-type",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = ETHERNET_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ETHERNET_INPUT_NEXT_##s] = n,
+ foreach_ethernet_input_next
+#undef _
+ },
+};
+
+VLIB_REGISTER_NODE (ethernet_input_not_l2_node,static) = {
+ .function = ethernet_input_not_l2,
+ .name = "ethernet-input-not-l2",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = ETHERNET_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ETHERNET_INPUT_NEXT_##s] = n,
+ foreach_ethernet_input_next
+#undef _
+ },
+};
+
+void ethernet_set_rx_redirect (vnet_main_t * vnm,
+ vnet_hw_interface_t * hi,
+ u32 enable)
+{
+ // Ensure all packets go to ethernet-input (i.e. untagged ipv4 packets
+ // don't go directly to ip4-input)
+ vnet_hw_interface_rx_redirect_to_node
+ (vnm, hi->hw_if_index, enable ? ethernet_input_node.index : ~0);
+}
+
+
+/*
+ * Initialization and registration for the next_by_ethernet structure
+ */
+
+clib_error_t * next_by_ethertype_init (next_by_ethertype_t * l3_next)
+{
+ l3_next->input_next_by_type = sparse_vec_new
+ (/* elt bytes */ sizeof (l3_next->input_next_by_type[0]),
+ /* bits in index */ BITS (((ethernet_header_t *) 0)->type));
+
+ vec_validate (l3_next->sparse_index_by_input_next_index, ETHERNET_INPUT_NEXT_DROP);
+ vec_validate (l3_next->sparse_index_by_input_next_index, ETHERNET_INPUT_NEXT_PUNT);
+ l3_next->sparse_index_by_input_next_index[ETHERNET_INPUT_NEXT_DROP]
+ = SPARSE_VEC_INVALID_INDEX;
+ l3_next->sparse_index_by_input_next_index[ETHERNET_INPUT_NEXT_PUNT]
+ = SPARSE_VEC_INVALID_INDEX;
+
+ return 0;
+}
+
+// Add an ethertype -> next index mapping to the structure
+clib_error_t * next_by_ethertype_register (next_by_ethertype_t * l3_next,
+ u32 ethertype,
+ u32 next_index)
+{
+ u32 i;
+ u16 * n;
+ ethernet_main_t * em = &ethernet_main;
+
+ /* Setup ethernet type -> next index sparse vector mapping. */
+ n = sparse_vec_validate (l3_next->input_next_by_type, ethertype);
+ n[0] = next_index;
+
+ /* Rebuild next index -> sparse index inverse mapping when sparse vector
+ is updated. */
+ vec_validate (l3_next->sparse_index_by_input_next_index, next_index);
+ for (i = 1; i < vec_len (l3_next->input_next_by_type); i++)
+ l3_next->sparse_index_by_input_next_index[l3_next->input_next_by_type[i]] = i;
+
+ // do not allow the cached next indices to be updated if L3
+ // redirect is enabled, as it will have overwritten them
+ if (!em->redirect_l3) {
+ // Cache common ethertypes directly
+ if (ethertype == ETHERNET_TYPE_IP4) {
+ l3_next->input_next_ip4 = next_index;
+ } else if (ethertype == ETHERNET_TYPE_IP6) {
+ l3_next->input_next_ip6 = next_index;
+ } else if (ethertype == ETHERNET_TYPE_MPLS_UNICAST) {
+ l3_next->input_next_mpls = next_index;
+ }
+ }
+ return 0;
+}
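+
+// Usage sketch (illustrative, not from the original source; the node
+// "my_ip6_input_node" is hypothetical): a feature that already has an
+// arc from ethernet-input can register its ethertype directly. Note
+// that the cached input_next_ip6 shortcut is also updated here unless
+// L3 redirect is active.
+#if 0
+u32 next_index = vlib_node_add_next (vm, ethernet_input_node.index,
+ my_ip6_input_node.index);
+next_by_ethertype_register (&ethernet_main.l3_next, ETHERNET_TYPE_IP6,
+ next_index);
+#endif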
+
+
+static clib_error_t * ethernet_input_init (vlib_main_t * vm)
+{
+ ethernet_main_t * em = &ethernet_main;
+ __attribute__((unused)) vlan_table_t * invalid_vlan_table;
+ __attribute__((unused)) qinq_table_t * invalid_qinq_table;
+
+ ethernet_setup_node (vm, ethernet_input_node.index);
+ ethernet_setup_node (vm, ethernet_input_type_node.index);
+ ethernet_setup_node (vm, ethernet_input_not_l2_node.index);
+
+ next_by_ethertype_init (&em->l3_next);
+
+ // Initialize pools and vector for vlan parsing
+ vec_validate (em->main_intfs, 10); // 10 main interfaces
+ pool_alloc(em->vlan_pool, 10);
+ pool_alloc(em->qinq_pool, 1);
+
+ // The first vlan pool will always be reserved for an invalid table
+ pool_get(em->vlan_pool, invalid_vlan_table); // first id = 0
+ // The first qinq pool will always be reserved for an invalid table
+ pool_get(em->qinq_pool, invalid_qinq_table); // first id = 0
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ethernet_input_init);
+
+void
+ethernet_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index)
+{
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_type_info_t * ti;
+ u32 i;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, ethernet_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ ti = ethernet_get_type_info (em, type);
+ ti->node_index = node_index;
+ ti->next_index = vlib_node_add_next (vm,
+ ethernet_input_node.index,
+ node_index);
+ i = vlib_node_add_next (vm,
+ ethernet_input_type_node.index,
+ node_index);
+ ASSERT (i == ti->next_index);
+
+ i = vlib_node_add_next (vm,
+ ethernet_input_not_l2_node.index,
+ node_index);
+ ASSERT (i == ti->next_index);
+
+ // Add the L3 node for this ethertype to the next nodes structure
+ next_by_ethertype_register (&em->l3_next, type, ti->next_index);
+
+ // Call the registration functions for other nodes that want a mapping
+ l2bvi_register_input_type (vm, type, node_index);
+}
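+
+// Usage sketch (illustrative; "my_arp_input_node" is a hypothetical
+// registration): protocol nodes call this from their init path so that
+// all three ethernet input nodes share the same next index for the type.
+#if 0
+ethernet_register_input_type (vm, ETHERNET_TYPE_ARP,
+ my_arp_input_node.index);
+#endif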
+
+void
+ethernet_register_l2_input (vlib_main_t * vm,
+ u32 node_index)
+{
+ ethernet_main_t * em = &ethernet_main;
+ u32 i;
+
+ em->l2_next = vlib_node_add_next (vm, ethernet_input_node.index, node_index);
+
+ /*
+ * Even if we never use these arcs, we have to align the next indices...
+ */
+ i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
+
+ ASSERT (i == em->l2_next);
+
+ i = vlib_node_add_next (vm,
+ ethernet_input_not_l2_node.index,
+ node_index);
+ ASSERT (i == em->l2_next);
+}
+
+// Register a next node for L3 redirect, and enable L3 redirect
+void
+ethernet_register_l3_redirect (vlib_main_t * vm,
+ u32 node_index)
+{
+ ethernet_main_t * em = &ethernet_main;
+ u32 i;
+
+ em->redirect_l3 = 1;
+ em->redirect_l3_next = vlib_node_add_next(vm,
+ ethernet_input_node.index,
+ node_index);
+ /*
+ * Change the cached next nodes to the redirect node
+ */
+ em->l3_next.input_next_ip4 = em->redirect_l3_next;
+ em->l3_next.input_next_ip6 = em->redirect_l3_next;
+ em->l3_next.input_next_mpls = em->redirect_l3_next;
+
+ /*
+ * Even if we never use these arcs, we have to align the next indices...
+ */
+ i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);
+
+ ASSERT (i == em->redirect_l3_next);
+}
diff --git a/vnet/vnet/ethernet/packet.h b/vnet/vnet/ethernet/packet.h
new file mode 100644
index 00000000000..b5d1dcba931
--- /dev/null
+++ b/vnet/vnet/ethernet/packet.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet/packet.h: ethernet packet format.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ethernet_packet_h
+#define included_ethernet_packet_h
+
+typedef enum {
+#define ethernet_type(n,s) ETHERNET_TYPE_##s = n,
+#include <vnet/ethernet/types.def>
+#undef ethernet_type
+} ethernet_type_t;
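+
+/* For example, the types.def entry "ethernet_type (0x800, IP4)"
+ expands above to ETHERNET_TYPE_IP4 = 0x800. */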
+
+typedef struct {
+ /* Source/destination address. */
+ u8 dst_address[6];
+ u8 src_address[6];
+
+ /* Ethernet type. */
+ u16 type;
+} ethernet_header_t;
+
+#define ETHERNET_ADDRESS_UNICAST 0
+#define ETHERNET_ADDRESS_MULTICAST 1
+
+/* I/G bit: individual (unicast)/group (broadcast/multicast). */
+always_inline uword
+ethernet_address_cast (u8 * a)
+{ return (a[0] >> 0) & 1; }
+
+always_inline uword
+ethernet_address_is_locally_administered (u8 * a)
+{ return (a[0] >> 1) & 1; }
+
+always_inline void
+ethernet_address_set_locally_administered (u8 * a)
+{ a[0] |= 1 << 1; }
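+
+/* Example (illustrative): broadcast and multicast addresses have the
+ I/G bit set, unicast addresses do not. */
+#if 0
+u8 bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ASSERT (ethernet_address_cast (bcast) == ETHERNET_ADDRESS_MULTICAST);
+#endif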
+
+/* For VLAN ethernet type. */
+typedef struct {
+ /* 3 bit priority, 1 bit CFI and 12 bit vlan id. */
+ u16 priority_cfi_and_id;
+
+#define ETHERNET_N_VLAN (1 << 12)
+
+ /* Inner ethernet type. */
+ u16 type;
+} ethernet_vlan_header_t;
+
+
+/* VLAN with ethertype first and vlan id second */
+typedef struct {
+ /* vlan type */
+ u16 type;
+
+ /* 3 bit priority, 1 bit CFI and 12 bit vlan id. */
+ u16 priority_cfi_and_id;
+} ethernet_vlan_header_tv_t;
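+
+/* Accessor sketch (illustrative helpers, not part of this header;
+ assumes priority_cfi_and_id holds the tag in network byte order as
+ received from the wire): */
+#if 0
+always_inline u16
+ethernet_vlan_header_get_id (ethernet_vlan_header_t * h)
+{ return clib_net_to_host_u16 (h->priority_cfi_and_id) & (ETHERNET_N_VLAN - 1); }
+
+always_inline u8
+ethernet_vlan_header_get_priority (ethernet_vlan_header_t * h)
+{ return clib_net_to_host_u16 (h->priority_cfi_and_id) >> 13; }
+#endif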
+
+#endif /* included_ethernet_packet_h */
diff --git a/vnet/vnet/ethernet/pg.c b/vnet/vnet/ethernet/pg.c
new file mode 100644
index 00000000000..838688db451
--- /dev/null
+++ b/vnet/vnet/ethernet/pg.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ethernet_pg.c: packet generator ethernet interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+
+typedef struct {
+ pg_edit_t type;
+ pg_edit_t src_address;
+ pg_edit_t dst_address;
+} pg_ethernet_header_t;
+
+static inline void
+pg_ethernet_header_init (pg_ethernet_header_t * e)
+{
+ pg_edit_init (&e->type, ethernet_header_t, type);
+ pg_edit_init (&e->src_address, ethernet_header_t, src_address);
+ pg_edit_init (&e->dst_address, ethernet_header_t, dst_address);
+}
+
+typedef struct {
+ pg_edit_t type;
+ pg_edit_t id;
+ pg_edit_t cfi;
+ pg_edit_t priority;
+} pg_ethernet_vlan_header_t;
+
+static inline void
+pg_ethernet_vlan_header_init (pg_ethernet_vlan_header_t * v,
+ int vlan_index)
+{
+ ASSERT (vlan_index < ARRAY_LEN (((ethernet_max_header_t *) 0)->vlan));
+ pg_edit_init (&v->type, ethernet_max_header_t, vlan[vlan_index].type);
+
+ pg_edit_init_bitfield (&v->id, ethernet_max_header_t,
+ vlan[vlan_index].priority_cfi_and_id,
+ 0, 12);
+ pg_edit_init_bitfield (&v->cfi, ethernet_max_header_t,
+ vlan[vlan_index].priority_cfi_and_id,
+ 12, 1);
+ pg_edit_init_bitfield (&v->priority, ethernet_max_header_t,
+ vlan[vlan_index].priority_cfi_and_id,
+ 13, 3);
+}
+
+uword
+unformat_pg_ethernet_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_ethernet_header_t * e;
+ pg_ethernet_vlan_header_t * v;
+ pg_edit_t * ether_type_edit;
+ u32 n_vlan, error, group_index;
+
+ e = pg_create_edit_group (s, sizeof (e[0]), sizeof (ethernet_header_t),
+ &group_index);
+ pg_ethernet_header_init (e);
+ error = 1;
+
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_pg_edit,
+ unformat_ethernet_type_net_byte_order, &e->type,
+ unformat_pg_edit,
+ unformat_ethernet_address, &e->src_address,
+ unformat_pg_edit,
+ unformat_ethernet_address, &e->dst_address))
+ goto done;
+
+ n_vlan = 0;
+ while (unformat (input, "vlan"))
+ {
+ v = pg_add_edits (s, sizeof (v[0]), sizeof (ethernet_vlan_header_t),
+ group_index);
+ pg_ethernet_vlan_header_init (v, n_vlan);
+
+ if (! unformat_user (input, unformat_pg_edit,
+ unformat_pg_number, &v->id))
+ goto done;
+
+ if (! unformat (input, "priority %U", unformat_pg_edit,
+ unformat_pg_number, &v->priority))
+ pg_edit_set_fixed (&v->priority, 0);
+
+ if (! unformat (input, "cfi %U", unformat_pg_edit,
+ unformat_pg_number, &v->cfi))
+ pg_edit_set_fixed (&v->cfi, 0);
+
+ /* Too many vlans given. */
+ if (n_vlan >= 2)
+ goto done;
+
+ n_vlan++;
+ }
+
+ /* Address of e may have changed due to vlan edits being added */
+ e = pg_get_edit_group (s, group_index);
+ v = (void *) (e + 1);
+
+ /* Correct types for vlan packets. */
+ ether_type_edit = &e->type;
+ if (n_vlan > 0)
+ {
+ int i;
+
+ ether_type_edit = &v[n_vlan - 1].type;
+ pg_edit_copy_type_and_values (ether_type_edit, &e->type);
+ pg_edit_set_fixed (&e->type, ETHERNET_TYPE_VLAN);
+
+ for (i = 0; i < n_vlan - 1; i++)
+ pg_edit_set_fixed (&v[i].type, ETHERNET_TYPE_VLAN);
+ }
+
+ {
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_type_info_t * ti = 0;
+ pg_node_t * pg_node = 0;
+
+ if (ether_type_edit->type == PG_EDIT_FIXED)
+ {
+ u16 t = *(u16 *) ether_type_edit->values[PG_EDIT_LO];
+ ti = ethernet_get_type_info (em, clib_net_to_host_u16 (t));
+ if (ti && ti->node_index != ~0)
+ pg_node = pg_get_node (ti->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto done;
+ }
+
+ error = 0;
+
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
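+
+/* Example stream fragment parsed above (illustrative):
+ * "IP4: 00:01:02:03:04:05 -> 06:07:08:09:0a:0b vlan 100"
+ * sets the src/dst MACs, pushes one 802.1q tag with id 100 (priority
+ * and cfi default to 0), and rewrites the outer ethertype to VLAN
+ * while IP4 becomes the inner type. */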
+
diff --git a/vnet/vnet/ethernet/types.def b/vnet/vnet/ethernet/types.def
new file mode 100644
index 00000000000..37c97f62cdb
--- /dev/null
+++ b/vnet/vnet/ethernet/types.def
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Emacs editing mode -*-C-*- Ethernet types. */
+
+/*
+ * ethernet types
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* Types < 0x600 (1536) are LLC packet lengths. */
+ethernet_type (0x600, LLC_LENGTH)
+
+ethernet_type (0x600, XNS_IDP)
+ethernet_type (0x800, IP4)
+ethernet_type (0x806, ARP)
+ethernet_type (0x0BAD, VINES_IP)
+ethernet_type (0x0BAE, VINES_LOOPBACK)
+ethernet_type (0x0BAF, VINES_ECHO)
+ethernet_type (0x1984, TRAIN)
+ethernet_type (0x2000, CDP)
+ethernet_type (0x2001, CGMP)
+ethernet_type (0x2007, SRP_CONTROL)
+ethernet_type (0x2452, CENTRINO_PROMISC)
+ethernet_type (0x6000, DECNET)
+ethernet_type (0x6001, DECNET_DUMP_LOAD)
+ethernet_type (0x6002, DECNET_REMOTE_CONSOLE)
+ethernet_type (0x6003, DECNET_ROUTE)
+ethernet_type (0x6004, DEC_LAT)
+ethernet_type (0x6005, DEC_DIAGNOSTIC)
+ethernet_type (0x6006, DEC_CUSTOMER)
+ethernet_type (0x6007, DEC_SCA)
+ethernet_type (0x6558, TRANSPARENT_BRIDGING)
+ethernet_type (0x6559, RAW_FRAME_RELAY)
+ethernet_type (0x8035, REVERSE_ARP)
+ethernet_type (0x8038, DEC_LAN_BRIDGE)
+ethernet_type (0x803D, DEC_ETHERNET_ENCRYPTION)
+ethernet_type (0x803F, DEC_LAN_TRAFFIC_MONITOR)
+ethernet_type (0x8041, DEC_LAST)
+ethernet_type (0x809B, APPLETALK)
+ethernet_type (0x80D5, IBM_SNA)
+ethernet_type (0x80F3, APPLETALK_AARP)
+ethernet_type (0x80FF, WELLFLEET_COMPRESSION)
+ethernet_type (0x8100, VLAN)
+ethernet_type (0x8137, IPX)
+ethernet_type (0x814C, SNMP)
+ethernet_type (0x81FD, CABLETRON_ISMP)
+ethernet_type (0x81FF, CABLETRON_ISMP_TBFLOOD)
+ethernet_type (0x86DD, IP6)
+ethernet_type (0x86DF, ATOMIC)
+ethernet_type (0x876B, TCP_IP_COMPRESSION)
+ethernet_type (0x876C, IP_AUTONOMOUS_SYSTEMS)
+ethernet_type (0x876D, SECURE_DATA)
+ethernet_type (0x8808, MAC_CONTROL)
+ethernet_type (0x8809, SLOW_PROTOCOLS)
+ethernet_type (0x880B, PPP)
+ethernet_type (0x8847, MPLS_UNICAST)
+ethernet_type (0x8848, MPLS_MULTICAST)
+ethernet_type (0x8863, PPPOE_DISCOVERY)
+ethernet_type (0x8864, PPPOE_SESSION)
+ethernet_type (0x886D, INTEL_ANS)
+ethernet_type (0x886F, MICROSOFT_NLB_HEARTBEAT)
+ethernet_type (0x8881, CDMA_2000)
+ethernet_type (0x888e, 802_1X_AUTHENTICATION)
+ethernet_type (0x8892, PROFINET)
+ethernet_type (0x889a, HYPERSCSI)
+ethernet_type (0x88a2, AOE)
+ethernet_type (0x88a8, DOT1AD)
+ethernet_type (0x88AE, BRDWALK)
+ethernet_type (0x88B7, 802_OUI_EXTENDED)
+ethernet_type (0x88c7, 802_11I_PRE_AUTHENTICATION)
+ethernet_type (0x88cc, 802_1_LLDP)
+ethernet_type (0x894f, VPATH_3)
+ethernet_type (0x9000, LOOPBACK)
+ethernet_type (0x9021, RTNET_MAC)
+ethernet_type (0x9022, RTNET_CONFIG)
+ethernet_type (0x9100, VLAN_9100)
+ethernet_type (0x9200, VLAN_9200)
+ethernet_type (0x9999, PGLAN)
+ethernet_type (0xFEFE, SRP_ISIS)
+ethernet_type (0xFFFF, RESERVED)
diff --git a/vnet/vnet/flow/flow_report.c b/vnet/vnet/flow/flow_report.c
new file mode 100644
index 00000000000..9c0cbb6dd7d
--- /dev/null
+++ b/vnet/vnet/flow/flow_report.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * flow_report.c
+ */
+#include <vnet/flow/flow_report.h>
+#include <vnet/api_errno.h>
+
+int send_template_packet (flow_report_main_t *frm,
+ flow_report_t *fr,
+ u32 * buffer_indexp)
+{
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip4_ipfix_template_packet_t * tp;
+ ipfix_message_header_t * h;
+ ip4_header_t * ip;
+ udp_header_t * udp;
+ vlib_main_t * vm = frm->vlib_main;
+
+ ASSERT (buffer_indexp);
+
+ if (fr->update_rewrite || fr->rewrite == 0)
+ {
+ if (frm->ipfix_collector.as_u32 == 0
+ || frm->src_address.as_u32 == 0)
+ {
+ clib_warning ("no collector: disabling flow collector process");
+ vlib_node_set_state (frm->vlib_main, flow_report_process_node.index,
+ VLIB_NODE_STATE_DISABLED);
+ return -1;
+ }
+ vec_free (fr->rewrite);
+ fr->update_rewrite = 1;
+ }
+
+ if (fr->update_rewrite)
+ {
+ fr->rewrite = fr->rewrite_callback (frm, fr,
+ &frm->ipfix_collector,
+ &frm->src_address);
+ fr->update_rewrite = 0;
+ }
+
+ if (vlib_buffer_alloc (vm, &bi0, 1) != 1)
+ return -1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ ASSERT (vec_len (fr->rewrite) < VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES);
+
+ memcpy (b0->data, fr->rewrite, vec_len (fr->rewrite));
+ b0->current_data = 0;
+ b0->current_length = vec_len (fr->rewrite);
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+ /* $$$ for now, look up in fib-0. Later: arbitrary TX fib */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+
+ tp = vlib_buffer_get_current (b0);
+ ip = (ip4_header_t *) &tp->ip4;
+ udp = (udp_header_t *) (ip+1);
+ h = (ipfix_message_header_t *)(udp+1);
+
+ /* FIXUP: message header export_time */
+ h->export_time = (u32)
+ (((f64)frm->unix_time_0) +
+ (vlib_time_now(frm->vlib_main) - frm->vlib_time_0));
+ h->export_time = clib_host_to_net_u32(h->export_time);
+
+ /* FIXUP: message header sequence_number. Templates do not increase it */
+ h->sequence_number = clib_host_to_net_u32(fr->sequence_number);
+
+ /* FIXUP: udp length */
+ udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip));
+
+ *buffer_indexp = bi0;
+ return 0;
+}
+
+static uword
+flow_report_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ flow_report_main_t * frm = &flow_report_main;
+ flow_report_t * fr;
+ u32 ip4_lookup_node_index;
+ vlib_node_t * ip4_lookup_node;
+ vlib_frame_t * nf = 0;
+ u32 template_bi;
+ u32 * to_next;
+ int send_template;
+ f64 now;
+ int rv;
+ uword event_type;
+ uword *event_data = 0;
+
+ /* Wait for Godot... */
+ vlib_process_wait_for_event_or_clock (vm, 1e9);
+ event_type = vlib_process_get_events (vm, &event_data);
+ if (event_type != 1)
+ clib_warning ("bogus kickoff event received, %d", event_type);
+ vec_reset_length (event_data);
+
+ /* Enqueue pkts to ip4-lookup */
+ ip4_lookup_node = vlib_get_node_by_name (vm, (u8 *) "ip4-lookup");
+ ip4_lookup_node_index = ip4_lookup_node->index;
+
+ while (1)
+ {
+ vlib_process_suspend (vm, 5.0);
+
+ vec_foreach (fr, frm->reports)
+ {
+ now = vlib_time_now (vm);
+
+ /* Need to send a template packet? */
+ send_template = now > (fr->last_template_sent + 20.0);
+ send_template += fr->last_template_sent == 0;
+ template_bi = ~0;
+ rv = 0;
+
+ if (send_template)
+ rv = send_template_packet (frm, fr, &template_bi);
+
+ if (rv < 0)
+ continue;
+
+ nf = vlib_get_frame_to_node (vm, ip4_lookup_node_index);
+ nf->n_vectors = 0;
+ to_next = vlib_frame_vector_args (nf);
+
+ if (template_bi != ~0)
+ {
+ to_next[0] = template_bi;
+ to_next++;
+ nf->n_vectors++;
+ }
+
+ nf = fr->flow_data_callback (frm, fr,
+ nf, to_next, ip4_lookup_node_index);
+ if (nf)
+ vlib_put_frame_to_node (vm, ip4_lookup_node_index, nf);
+ }
+ }
+
+ return 0; /* not so much */
+}
+
+VLIB_REGISTER_NODE (flow_report_process_node) = {
+ .function = flow_report_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "flow-report-process",
+};
+
+int vnet_flow_report_add_del (flow_report_main_t *frm,
+ vnet_flow_report_add_del_args_t *a)
+{
+ int i;
+ int found_index = ~0;
+ flow_report_t *fr;
+
+ for (i = 0; i < vec_len(frm->reports); i++)
+ {
+ fr = vec_elt_at_index (frm->reports, i);
+ if (fr->opaque == a->opaque
+ && fr->rewrite_callback == a->rewrite_callback
+ && fr->flow_data_callback == a->flow_data_callback)
+ {
+ found_index = i;
+ break;
+ }
+ }
+
+ if (a->is_add == 0)
+ {
+ if (found_index != ~0)
+ {
+ vec_delete (frm->reports, 1, found_index);
+ return 0;
+ }
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+ }
+
+ vec_add2 (frm->reports, fr, 1);
+
+ fr->sequence_number = 0;
+ fr->domain_id = a->domain_id;
+ fr->update_rewrite = 1;
+ fr->opaque = a->opaque;
+ fr->rewrite_callback = a->rewrite_callback;
+ fr->flow_data_callback = a->flow_data_callback;
+
+ return 0;
+}
+
+static clib_error_t *
+set_ipfix_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ flow_report_main_t * frm = &flow_report_main;
+ ip4_address_t collector, src;
+
+ collector.as_u32 = 0;
+ src.as_u32 = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "collector %U", unformat_ip4_address, &collector))
+ ;
+ else if (unformat (input, "src %U", unformat_ip4_address, &src))
+ ;
+ else
+ break;
+ }
+
+ if (collector.as_u32 == 0)
+ return clib_error_return (0, "collector address required");
+
+ if (src.as_u32 == 0)
+ return clib_error_return (0, "src address required");
+
+ frm->ipfix_collector.as_u32 = collector.as_u32;
+ frm->src_address.as_u32 = src.as_u32;
+
+ vlib_cli_output (vm, "Collector %U, src address %U",
+ format_ip4_address, &frm->ipfix_collector,
+ format_ip4_address, &frm->src_address);
+
+ /* Turn on the flow reporting process */
+ vlib_process_signal_event (vm, flow_report_process_node.index,
+ 1, 0);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ipfix_command, static) = {
+ .path = "set ipfix",
+ .short_help = "set ipfix collector <ip4-address> src <ip4-address>",
+ .function = set_ipfix_command_fn,
+};
+
+static clib_error_t *
+flow_report_init (vlib_main_t *vm)
+{
+ flow_report_main_t * frm = &flow_report_main;
+
+ frm->vlib_main = vm;
+ frm->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (flow_report_init);
diff --git a/vnet/vnet/flow/flow_report.h b/vnet/vnet/flow/flow_report.h
new file mode 100644
index 00000000000..14185bf6798
--- /dev/null
+++ b/vnet/vnet/flow/flow_report.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_vnet_flow_report_h__
+#define __included_vnet_flow_report_h__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/ip/udp.h>
+#include <vlib/cli.h>
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+
+#include <vnet/flow/ipfix_packet.h>
+
+/* Used to build the rewrite */
+typedef struct {
+ ip4_header_t ip4;
+ udp_header_t udp;
+ ipfix_template_packet_t ipfix;
+} ip4_ipfix_template_packet_t;
+
+struct flow_report_main;
+struct flow_report;
+
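+/* Callback contracts (as exercised by flow_report.c): the rewrite
+ callback builds and returns the vector holding the IPFIX template
+ packet rewrite; the flow data callback appends flow-record buffers
+ to the supplied frame and returns the frame left for the caller to
+ enqueue (possibly a fresh one if it filled the original). */
+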
+typedef u8 * (vnet_flow_rewrite_callback_t)(struct flow_report_main *,
+ struct flow_report *,
+ ip4_address_t *,
+ ip4_address_t *);
+
+typedef vlib_frame_t * (vnet_flow_data_callback_t) (struct flow_report_main *,
+ struct flow_report *,
+ vlib_frame_t *, u32 *,
+ u32);
+typedef struct flow_report {
+ /* ipfix rewrite, set by callback */
+ u8 * rewrite;
+ u32 sequence_number;
+ u32 domain_id;
+ f64 last_template_sent;
+ int update_rewrite;
+
+ /* Bitmap of fields to send */
+ uword * fields_to_send;
+
+ /* Opaque data */
+ void * opaque;
+
+ /* build-the-rewrite callback */
+ vnet_flow_rewrite_callback_t *rewrite_callback;
+
+ /* Send-flow-data callback */
+ vnet_flow_data_callback_t *flow_data_callback;
+} flow_report_t;
+
+typedef struct flow_report_main {
+ flow_report_t * reports;
+
+ /* ipfix collector, our ip address */
+ ip4_address_t ipfix_collector;
+ ip4_address_t src_address;
+
+ /* time scale transform. Joy. */
+ u32 unix_time_0;
+ f64 vlib_time_0;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} flow_report_main_t;
+
+flow_report_main_t flow_report_main;
+
+vlib_node_registration_t flow_report_process_node;
+
+int vnet_flow_report_enable_disable (u32 sw_if_index, u32 table_index,
+ int enable_disable);
+typedef struct {
+ vnet_flow_data_callback_t *flow_data_callback;
+ vnet_flow_rewrite_callback_t *rewrite_callback;
+ void * opaque;
+ int is_add;
+ u32 domain_id;
+} vnet_flow_report_add_del_args_t;
+
+int vnet_flow_report_add_del (flow_report_main_t *frm,
+ vnet_flow_report_add_del_args_t *a);
+
+#endif /* __included_vnet_flow_report_h__ */
diff --git a/vnet/vnet/flow/flow_report_sample.c b/vnet/vnet/flow/flow_report_sample.c
new file mode 100644
index 00000000000..eb2fbfced1d
--- /dev/null
+++ b/vnet/vnet/flow/flow_report_sample.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/flow/flow_report.h>
+#include <vnet/flow/flow_report_sample.h>
+#include <vnet/api_errno.h>
+
+typedef struct {
+ u32 classify_table_index;
+} flow_report_sample_main_t;
+
+flow_report_sample_main_t flow_report_sample_main;
+
+static u8 * template_rewrite (flow_report_main_t * frm,
+ flow_report_t * fr,
+ ip4_address_t * collector_address,
+ ip4_address_t * src_address)
+{
+ vnet_classify_table_t * tblp;
+ vnet_classify_main_t * vcm = &vnet_classify_main;
+ flow_report_sample_main_t *fsm =
+ (flow_report_sample_main_t *) fr->opaque;
+ ip4_header_t * ip;
+ udp_header_t * udp;
+ ipfix_message_header_t * h;
+ ipfix_set_header_t * s;
+ ipfix_template_header_t * t;
+ ipfix_field_specifier_t * f;
+ ipfix_field_specifier_t * first_field;
+ u8 * rewrite = 0;
+ ip4_ipfix_template_packet_t * tp;
+ i32 l3_offset = -2; /* sizeof (ethernet_header_t) - sizeof (u32x4) */
+ u32 field_count = 0;
+ u32 field_index = 0;
+
+ tblp = pool_elt_at_index (vcm->tables, fsm->classify_table_index);
+
+ /*
+ * Mumble, assumes that we're not classifying on L2 or first 2 octets
+ * of L3.
+ */
+
+ ip = (ip4_header_t *)(((u8 *)(tblp->mask)) + l3_offset);
+ udp = (udp_header_t *)(ip+1);
+
+ /* Determine field count */
+#define _(field,mask,item,length) \
+ if ((field) == (mask)) \
+ { \
+ field_count++; \
+ \
+ fr->fields_to_send = clib_bitmap_set (fr->fields_to_send, \
+ field_index, 1); \
+ } \
+ field_index++;
+
+ foreach_ipfix_field;
+#undef _
+ /* Add packetTotalCount manually */
+ field_count += 1;
+
+ /* $$$ enterprise fields, at some later date */
+
+ /* allocate rewrite space */
+ vec_validate_aligned (rewrite,
+ sizeof (ip4_ipfix_template_packet_t)
+ + field_count * sizeof (ipfix_field_specifier_t) - 1,
+ CLIB_CACHE_LINE_BYTES);
+
+ tp = (ip4_ipfix_template_packet_t *) rewrite;
+ ip = (ip4_header_t *) &tp->ip4;
+ udp = (udp_header_t *) (ip+1);
+ h = (ipfix_message_header_t *)(udp+1);
+ s = (ipfix_set_header_t *)(h+1);
+ t = (ipfix_template_header_t *)(s+1);
+ first_field = f = (ipfix_field_specifier_t *)(t+1);
+
+ ip->ip_version_and_header_length = 0x45;
+ ip->ttl = 254;
+ ip->protocol = IP_PROTOCOL_UDP;
+ ip->src_address.as_u32 = src_address->as_u32;
+ ip->dst_address.as_u32 = collector_address->as_u32;
+ udp->src_port = clib_host_to_net_u16 (4739 /* $$FIXME */);
+ udp->dst_port = clib_host_to_net_u16 (UDP_DST_PORT_ipfix);
+ udp->length = clib_host_to_net_u16 (vec_len(rewrite) - sizeof (*ip));
+
+ /* FIXUP: message header export_time */
+ /* FIXUP: message header sequence_number */
+ h->domain_id = clib_host_to_net_u32 (fr->domain_id);
+
+ /* Take another trip through the mask and build the template */
+ ip = (ip4_header_t *)(((u8 *)(tblp->mask)) + l3_offset);
+ udp = (udp_header_t *)(ip+1);
+#define _(field,mask,item,length) \
+ if ((field) == (mask)) \
+ { \
+ f->e_id_length = ipfix_e_id_length (0 /* enterprise */, \
+ item, length); \
+ f++; \
+ }
+ foreach_ipfix_field;
+#undef _
+
+ /* Add packetTotalCount manually */
+ f->e_id_length = ipfix_e_id_length (0 /* enterprise */, packetTotalCount, 8);
+ f++;
+
+ /* Back to the template packet... */
+ ip = (ip4_header_t *) &tp->ip4;
+ udp = (udp_header_t *) (ip+1);
+
+ ASSERT (f - first_field);
+ /* Field count in this template */
+ t->id_count = ipfix_id_count (256 /* template_id */, f - first_field);
+
+ /* set length in octets */
+ s->set_id_length = ipfix_set_id_length (2 /* set_id */, (u8 *) f - (u8 *)s);
+
+ /* message length in octets */
+ h->version_length = version_length ((u8 *)f - (u8 *)h);
+
+ ip->length = clib_host_to_net_u16 ((u8 *)f - (u8 *)ip);
+ ip->checksum = ip4_header_checksum (ip);
+
+ return rewrite;
+}
+
+static vlib_frame_t * send_flows (flow_report_main_t * frm,
+ flow_report_t * fr,
+ vlib_frame_t * f, u32 * to_next,
+ u32 node_index)
+{
+ vnet_classify_main_t * vcm = &vnet_classify_main;
+ flow_report_sample_main_t * fsm =
+ (flow_report_sample_main_t *) fr->opaque;
+ vnet_classify_table_t * t =
+ pool_elt_at_index (vcm->tables, fsm->classify_table_index);
+ vnet_classify_bucket_t * b;
+ vnet_classify_entry_t * v, * save_v;
+ vlib_buffer_t *b0 = 0;
+ u32 next_offset = 0;
+ u32 bi0 = ~0;
+ int i, j, k;
+ ip4_ipfix_template_packet_t * tp;
+ ipfix_message_header_t * h;
+ ipfix_set_header_t * s = 0;
+ ip4_header_t * ip;
+ udp_header_t * udp;
+ int field_index;
+ ip4_header_t * match;
+ u32 records_this_buffer;
+ u16 new_l0, old_l0;
+ ip_csum_t sum0;
+ vlib_main_t * vm = frm->vlib_main;
+
+ while (__sync_lock_test_and_set (t->writer_lock, 1))
+ ;
+
+ for (i = 0; i < t->nbuckets; i++)
+ {
+ b = &t->buckets [i];
+ if (b->offset == 0)
+ continue;
+
+ save_v = vnet_classify_get_entry (t, b->offset);
+ for (j = 0; j < (1<<b->log2_pages); j++)
+ {
+ for (k = 0; k < t->entries_per_page; k++)
+ {
+ v = vnet_classify_entry_at_index
+ (t, save_v, j*t->entries_per_page + k);
+
+ if (vnet_classify_entry_is_free (v))
+ continue;
+
+ /* OK, we have something to send... */
+ if (PREDICT_FALSE (b0 == 0))
+ {
+ if (vlib_buffer_alloc (vm, &bi0, 1) != 1)
+ goto flush;
+ b0 = vlib_get_buffer (vm, bi0);
+
+ memcpy (b0->data, fr->rewrite, vec_len (fr->rewrite));
+ b0->current_data = 0;
+ b0->current_length = vec_len (fr->rewrite);
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
+ /* $$$ for now, look up in fib-0. Later: arbitrary TX fib */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
+
+ tp = vlib_buffer_get_current (b0);
+ ip = (ip4_header_t *) &tp->ip4;
+ udp = (udp_header_t *) (ip+1);
+ h = (ipfix_message_header_t *)(udp+1);
+ s = (ipfix_set_header_t *)(h+1);
+
+ /* FIXUP: message header export_time */
+ h->export_time = (u32)
+ (((f64)frm->unix_time_0) +
+ (vlib_time_now(frm->vlib_main) - frm->vlib_time_0));
+ h->export_time = clib_host_to_net_u32(h->export_time);
+
+ /* FIXUP: message header sequence_number */
+ h->sequence_number = fr->sequence_number++;
+ h->sequence_number = clib_host_to_net_u32 (h->sequence_number);
+ next_offset = (u32) (((u8 *)(s+1)) - (u8 *)tp);
+ records_this_buffer = 0;
+ }
+
+ field_index = 0;
+ match = (ip4_header_t *) (((u8 *)v->key) - 2);
+ ip = match;
+ udp = (udp_header_t * )(ip+1);
+
+#define _(field,mask,item,length) \
+ if (clib_bitmap_get (fr->fields_to_send, field_index)) \
+ { \
+ memcpy (b0->data + next_offset, &field, \
+ length); \
+ next_offset += length; \
+ } \
+ field_index++;
+ foreach_ipfix_field;
+#undef _
+
+ /* Add packetTotalCount manually */
+ {
+ u64 packets = clib_host_to_net_u64 (v->hits);
+ memcpy (b0->data + next_offset, &packets, sizeof (packets));
+ next_offset += sizeof (packets);
+ }
+ records_this_buffer++;
+
+ if (next_offset > 1450)
+ {
+ s->set_id_length = ipfix_set_id_length (256 /* template ID*/,
+ next_offset -
+ (sizeof (*ip) + sizeof (*udp) +
+ sizeof (*h)));
+ b0->current_length = next_offset;
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ tp = vlib_buffer_get_current (b0);
+ ip = (ip4_header_t *) &tp->ip4;
+ udp = (udp_header_t *) (ip+1);
+
+ sum0 = ip->checksum;
+ old_l0 = ip->length; /* network order, matching new_l0 for csum update */
+ new_l0 =
+ clib_host_to_net_u16 ((u16)next_offset);
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+
+ ip->checksum = ip_csum_fold (sum0);
+ ip->length = new_l0;
+ udp->length =
+ clib_host_to_net_u16 (b0->current_length - sizeof (*ip));
+
+ to_next[0] = bi0;
+ f->n_vectors++;
+ to_next++;
+
+ if (f->n_vectors == VLIB_FRAME_SIZE)
+ {
+ vlib_put_frame_to_node (vm, node_index, f);
+ f = vlib_get_frame_to_node (vm, node_index);
+ f->n_vectors = 0;
+ to_next = vlib_frame_vector_args (f);
+ }
+ b0 = 0;
+ bi0 = ~0;
+ }
+ }
+ }
+ }
+
+ flush:
+ if (b0)
+ {
+ s->set_id_length = ipfix_set_id_length (256 /* template ID*/,
+ next_offset -
+ (sizeof (*ip) + sizeof (*udp) +
+ sizeof (*h)));
+ b0->current_length = next_offset;
+ b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ tp = vlib_buffer_get_current (b0);
+ ip = (ip4_header_t *) &tp->ip4;
+ udp = (udp_header_t *) (ip+1);
+
+ sum0 = ip->checksum;
+ old_l0 = ip->length;
+ new_l0 = clib_host_to_net_u16 ((u16)next_offset);
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+
+ ip->checksum = ip_csum_fold (sum0);
+ ip->length = new_l0;
+ udp->length = clib_host_to_net_u16 (b0->current_length - sizeof (*ip));
+
+ ASSERT (ip->checksum == ip4_header_checksum (ip));
+
+ to_next[0] = bi0;
+ f->n_vectors++;
+
+ b0 = 0;
+ bi0 = ~0;
+ }
+
+ *(t->writer_lock) = 0;
+ return f;
+}
+
+
+static clib_error_t *
+flow_sample_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ flow_report_sample_main_t *fsm = &flow_report_sample_main;
+ flow_report_main_t *frm = &flow_report_main;
+ vnet_flow_report_add_del_args_t args;
+ int rv;
+ int is_add = 1;
+ u32 domain_id = 0;
+
+ domain_id = 0;
+ fsm->classify_table_index = ~0;
+ memset (&args, 0, sizeof (args));
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "table %d", &fsm->classify_table_index))
+ ;
+ else if (unformat (input, "domain %d", &domain_id))
+ ;
+ else if (unformat (input, "del"))
+ is_add = 0;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (fsm->classify_table_index == ~0)
+ return clib_error_return (0, "classifier table not specified");
+
+ args.opaque = (void *) fsm;
+ args.rewrite_callback = template_rewrite;
+ args.flow_data_callback = send_flows;
+ args.is_add = is_add;
+ args.domain_id = domain_id;
+
+ rv = vnet_flow_report_add_del (frm, &args);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "registration not found...");
+ default:
+ return clib_error_return (0, "vnet_flow_report_add_del returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (flow_sample_command, static) = {
+ .path = "flow sample",
+ .short_help = "flow sample",
+ .function = flow_sample_command_fn,
+};
+
+static clib_error_t *
+flow_report_sample_init (vlib_main_t *vm)
+{
+ clib_error_t * error;
+
+ if ((error = vlib_call_init_function (vm, flow_report_init)))
+ return error;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (flow_report_sample_init);
diff --git a/vnet/vnet/flow/flow_report_sample.h b/vnet/vnet/flow/flow_report_sample.h
new file mode 100644
index 00000000000..945beba1897
--- /dev/null
+++ b/vnet/vnet/flow/flow_report_sample.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_flow_report_sample_h__
+#define __included_flow_report_sample_h__
+
+/* Note: add +2 to udp (src,dst) port enum values to get TCP values */
+#define foreach_ipfix_field \
+_(ip->src_address.as_u32, 0xffffffff, sourceIPv4Address, 4) \
+_(ip->dst_address.as_u32, 0xffffffff, destinationIPv4Address, 4) \
+_(ip->protocol, 0xFF, protocolIdentifier, 1) \
+_(udp->src_port, 0xFFFF, udpSourcePort, 2) \
+_(udp->dst_port, 0xFFFF, udpDestinationPort, 2)
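+
+/*
+ * Editorial sketch (hypothetical, not part of the original patch):
+ * the .c file expands each _(field, mask, element, length) entry,
+ * e.g. a rewrite callback can emit one IPFIX field specifier per
+ * entry, assuming f walks the template's field array:
+ *
+ *   #define _(field, mask, elt, len)                     \
+ *     f->e_id_length = ipfix_e_id_length (0, elt, len);  \
+ *     f++;
+ *   foreach_ipfix_field;
+ *   #undef _
+ */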
+
+#endif /* __included_flow_report_sample_h__ */
diff --git a/vnet/vnet/flow/ipfix_info_elements.h b/vnet/vnet/flow/ipfix_info_elements.h
new file mode 100644
index 00000000000..5d7e935dabb
--- /dev/null
+++ b/vnet/vnet/flow/ipfix_info_elements.h
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ipfix_info_elements_h__
+#define __included_ipfix_info_elements_h__
+
+#define foreach_ipfix_info_element_t \
+_(octetDeltaCount, 1, u64) \
+_(packetDeltaCount, 2, u64) \
+_(deltaFlowCount, 3, u64) \
+_(protocolIdentifier, 4, u8) \
+_(ipClassOfService, 5, u8) \
+_(tcpControlBits, 6, u16) \
+_(sourceTransportPort, 7, u16) \
+_(sourceIPv4Address, 8, ip4_address_t) \
+_(sourceIPv4PrefixLength, 9, u8) \
+_(ingressInterface, 10, u32) \
+_(destinationTransportPort, 11, u16) \
+_(destinationIPv4Address, 12, ip4_address_t) \
+_(destinationIPv4PrefixLength, 13, u8) \
+_(egressInterface, 14, u32) \
+_(ipNextHopIPv4Address, 15, ip4_address_t) \
+_(bgpSourceAsNumber, 16, u32) \
+_(bgpDestinationAsNumber, 17, u32) \
+_(bgpNextHopIPv4Address, 18, ip4_address_t) \
+_(postMCastPacketDeltaCount, 19, u64) \
+_(postMCastOctetDeltaCount, 20, u64) \
+_(flowEndSysUpTime, 21, u32) \
+_(flowStartSysUpTime, 22, u32) \
+_(postOctetDeltaCount, 23, u64) \
+_(postPacketDeltaCount, 24, u64) \
+_(minimumIpTotalLength, 25, u64) \
+_(maximumIpTotalLength, 26, u64) \
+_(sourceIPv6Address, 27, ip6_address_t) \
+_(destinationIPv6Address, 28, ip6_address_t) \
+_(sourceIPv6PrefixLength, 29, u8) \
+_(destinationIPv6PrefixLength, 30, u8) \
+_(flowLabelIPv6, 31, u32) \
+_(icmpTypeCodeIPv4, 32, u16) \
+_(igmpType, 33, u8) \
+_(samplingInterval, 34, u32) \
+_(samplingAlgorithm, 35, u8) \
+_(flowActiveTimeout, 36, u16) \
+_(flowIdleTimeout, 37, u16) \
+_(engineType, 38, u8) \
+_(engineId, 39, u8) \
+_(exportedOctetTotalCount, 40, u64) \
+_(exportedMessageTotalCount, 41, u64) \
+_(exportedFlowRecordTotalCount, 42, u64) \
+_(ipv4RouterSc, 43, ip4_address_t) \
+_(sourceIPv4Prefix, 44, ip4_address_t) \
+_(destinationIPv4Prefix, 45, ip4_address_t) \
+_(mplsTopLabelType, 46, u8) \
+_(mplsTopLabelIPv4Address, 47, ip4_address_t) \
+_(samplerId, 48, u8) \
+_(samplerMode, 49, u8) \
+_(samplerRandomInterval, 50, u32) \
+_(classId, 51, u8) \
+_(minimumTTL, 52, u8) \
+_(maximumTTL, 53, u8) \
+_(fragmentIdentification, 54, u32) \
+_(postIpClassOfService, 55, u8) \
+_(sourceMacAddress, 56, macAddress) \
+_(postDestinationMacAddress, 57, macAddress) \
+_(vlanId, 58, u16) \
+_(postVlanId, 59, u16) \
+_(ipVersion, 60, u8) \
+_(flowDirection, 61, u8) \
+_(ipNextHopIPv6Address, 62, ip6_address_t) \
+_(bgpNextHopIPv6Address, 63, ip6_address_t) \
+_(ipv6ExtensionHeaders, 64, u32) \
+_(mplsTopLabelStackSection, 70, octetArray) \
+_(mplsLabelStackSection2, 71, octetArray) \
+_(mplsLabelStackSection3, 72, octetArray) \
+_(mplsLabelStackSection4, 73, octetArray) \
+_(mplsLabelStackSection5, 74, octetArray) \
+_(mplsLabelStackSection6, 75, octetArray) \
+_(mplsLabelStackSection7, 76, octetArray) \
+_(mplsLabelStackSection8, 77, octetArray) \
+_(mplsLabelStackSection9, 78, octetArray) \
+_(mplsLabelStackSection10, 79, octetArray) \
+_(destinationMacAddress, 80, macAddress) \
+_(postSourceMacAddress, 81, macAddress) \
+_(interfaceName, 82, string) \
+_(interfaceDescription, 83, string) \
+_(samplerName, 84, string) \
+_(octetTotalCount, 85, u64) \
+_(packetTotalCount, 86, u64) \
+_(flagsAndSamplerId, 87, u32) \
+_(fragmentOffset, 88, u16) \
+_(forwardingStatus, 89, u32) \
+_(mplsVpnRouteDistinguisher, 90, octetArray) \
+_(mplsTopLabelPrefixLength, 91, u8) \
+_(srcTrafficIndex, 92, u32) \
+_(dstTrafficIndex, 93, u32) \
+_(applicationDescription, 94, string) \
+_(applicationId, 95, octetArray) \
+_(applicationName, 96, string) \
+_(Assigned, 97, for NetFlow v9 compatibility ) \
+_(postIpDiffServCodePoint, 98, u8) \
+_(multicastReplicationFactor, 99, u32) \
+_(className, 100, string) \
+_(classificationEngineId, 101, u8) \
+_(layer2packetSectionOffset, 102, u16) \
+_(layer2packetSectionSize, 103, u16) \
+_(layer2packetSectionData, 104, octetArray) \
+_(bgpNextAdjacentAsNumber, 128, u32) \
+_(bgpPrevAdjacentAsNumber, 129, u32) \
+_(exporterIPv4Address, 130, ip4_address_t) \
+_(exporterIPv6Address, 131, ip6_address_t) \
+_(droppedOctetDeltaCount, 132, u64) \
+_(droppedPacketDeltaCount, 133, u64) \
+_(droppedOctetTotalCount, 134, u64) \
+_(droppedPacketTotalCount, 135, u64) \
+_(flowEndReason, 136, u8) \
+_(commonPropertiesId, 137, u64) \
+_(observationPointId, 138, u64) \
+_(icmpTypeCodeIPv6, 139, u16) \
+_(mplsTopLabelIPv6Address, 140, ip6_address_t) \
+_(lineCardId, 141, u32) \
+_(portId, 142, u32) \
+_(meteringProcessId, 143, u32) \
+_(exportingProcessId, 144, u32) \
+_(templateId, 145, u16) \
+_(wlanChannelId, 146, u8) \
+_(wlanSSID, 147, string) \
+_(flowId, 148, u64) \
+_(observationDomainId, 149, u32) \
+_(flowStartSeconds, 150, dateTimeSeconds) \
+_(flowEndSeconds, 151, dateTimeSeconds) \
+_(flowStartMilliseconds, 152, dateTimeMilliseconds) \
+_(flowEndMilliseconds, 153, dateTimeMilliseconds) \
+_(flowStartMicroseconds, 154, dateTimeMicroseconds) \
+_(flowEndMicroseconds, 155, dateTimeMicroseconds) \
+_(flowStartNanoseconds, 156, dateTimeNanoseconds) \
+_(flowEndNanoseconds, 157, dateTimeNanoseconds) \
+_(flowStartDeltaMicroseconds, 158, u32) \
+_(flowEndDeltaMicroseconds, 159, u32) \
+_(systemInitTimeMilliseconds, 160, dateTimeMilliseconds) \
+_(flowDurationMilliseconds, 161, u32) \
+_(flowDurationMicroseconds, 162, u32) \
+_(observedFlowTotalCount, 163, u64) \
+_(ignoredPacketTotalCount, 164, u64) \
+_(ignoredOctetTotalCount, 165, u64) \
+_(notSentFlowTotalCount, 166, u64) \
+_(notSentPacketTotalCount, 167, u64) \
+_(notSentOctetTotalCount, 168, u64) \
+_(destinationIPv6Prefix, 169, ip6_address_t) \
+_(sourceIPv6Prefix, 170, ip6_address_t) \
+_(postOctetTotalCount, 171, u64) \
+_(postPacketTotalCount, 172, u64) \
+_(flowKeyIndicator, 173, u64) \
+_(postMCastPacketTotalCount, 174, u64) \
+_(postMCastOctetTotalCount, 175, u64) \
+_(icmpTypeIPv4, 176, u8) \
+_(icmpCodeIPv4, 177, u8) \
+_(icmpTypeIPv6, 178, u8) \
+_(icmpCodeIPv6, 179, u8) \
+_(udpSourcePort, 180, u16) \
+_(udpDestinationPort, 181, u16) \
+_(tcpSourcePort, 182, u16) \
+_(tcpDestinationPort, 183, u16) \
+_(tcpSequenceNumber, 184, u32) \
+_(tcpAcknowledgementNumber, 185, u32) \
+_(tcpWindowSize, 186, u16) \
+_(tcpUrgentPointer, 187, u16) \
+_(tcpHeaderLength, 188, u8) \
+_(ipHeaderLength, 189, u8) \
+_(totalLengthIPv4, 190, u16) \
+_(payloadLengthIPv6, 191, u16) \
+_(ipTTL, 192, u8) \
+_(nextHeaderIPv6, 193, u8) \
+_(mplsPayloadLength, 194, u32) \
+_(ipDiffServCodePoint, 195, u8) \
+_(ipPrecedence, 196, u8) \
+_(fragmentFlags, 197, u8) \
+_(octetDeltaSumOfSquares, 198, u64) \
+_(octetTotalSumOfSquares, 199, u64) \
+_(mplsTopLabelTTL, 200, u8) \
+_(mplsLabelStackLength, 201, u32) \
+_(mplsLabelStackDepth, 202, u32) \
+_(mplsTopLabelExp, 203, u8) \
+_(ipPayloadLength, 204, u32) \
+_(udpMessageLength, 205, u16) \
+_(isMulticast, 206, u8) \
+_(ipv4IHL, 207, u8) \
+_(ipv4Options, 208, u32) \
+_(tcpOptions, 209, u64) \
+_(paddingOctets, 210, octetArray) \
+_(collectorIPv4Address, 211, ip4_address_t) \
+_(collectorIPv6Address, 212, ip6_address_t) \
+_(exportInterface, 213, u32) \
+_(exportProtocolVersion, 214, u8) \
+_(exportTransportProtocol, 215, u8) \
+_(collectorTransportPort, 216, u16) \
+_(exporterTransportPort, 217, u16) \
+_(tcpSynTotalCount, 218, u64) \
+_(tcpFinTotalCount, 219, u64) \
+_(tcpRstTotalCount, 220, u64) \
+_(tcpPshTotalCount, 221, u64) \
+_(tcpAckTotalCount, 222, u64) \
+_(tcpUrgTotalCount, 223, u64) \
+_(ipTotalLength, 224, u64) \
+_(postNATSourceIPv4Address, 225, ip4_address_t) \
+_(postNATDestinationIPv4Address, 226, ip4_address_t) \
+_(postNAPTSourceTransportPort, 227, u16) \
+_(postNAPTDestinationTransportPort, 228, u16) \
+_(natOriginatingAddressRealm, 229, u8) \
+_(natEvent, 230, u8) \
+_(initiatorOctets, 231, u64) \
+_(responderOctets, 232, u64) \
+_(firewallEvent, 233, u8) \
+_(ingressVRFID, 234, u32) \
+_(egressVRFID, 235, u32) \
+_(VRFname, 236, string) \
+_(postMplsTopLabelExp, 237, u8) \
+_(tcpWindowScale, 238, u16) \
+_(biflowDirection, 239, u8) \
+_(ethernetHeaderLength, 240, u8) \
+_(ethernetPayloadLength, 241, u16) \
+_(ethernetTotalLength, 242, u16) \
+_(dot1qVlanId, 243, u16) \
+_(dot1qPriority, 244, u8) \
+_(dot1qCustomerVlanId, 245, u16) \
+_(dot1qCustomerPriority, 246, u8) \
+_(metroEvcId, 247, string) \
+_(metroEvcType, 248, u8) \
+_(pseudoWireId, 249, u32) \
+_(pseudoWireType, 250, u16) \
+_(pseudoWireControlWord, 251, u32) \
+_(ingressPhysicalInterface, 252, u32) \
+_(egressPhysicalInterface, 253, u32) \
+_(postDot1qVlanId, 254, u16) \
+_(postDot1qCustomerVlanId, 255, u16) \
+_(ethernetType, 256, u16) \
+_(postIpPrecedence, 257, u8) \
+_(collectionTimeMilliseconds, 258, dateTimeMilliseconds) \
+_(exportSctpStreamId, 259, u16) \
+_(maxExportSeconds, 260, dateTimeSeconds) \
+_(maxFlowEndSeconds, 261, dateTimeSeconds) \
+_(messageMD5Checksum, 262, octetArray) \
+_(messageScope, 263, u8) \
+_(minExportSeconds, 264, dateTimeSeconds) \
+_(minFlowStartSeconds, 265, dateTimeSeconds) \
+_(opaqueOctets, 266, octetArray) \
+_(sessionScope, 267, u8) \
+_(maxFlowEndMicroseconds, 268, dateTimeMicroseconds) \
+_(maxFlowEndMilliseconds, 269, dateTimeMilliseconds) \
+_(maxFlowEndNanoseconds, 270, dateTimeNanoseconds) \
+_(minFlowStartMicroseconds, 271, dateTimeMicroseconds) \
+_(minFlowStartMilliseconds, 272, dateTimeMilliseconds) \
+_(minFlowStartNanoseconds, 273, dateTimeNanoseconds) \
+_(collectorCertificate, 274, octetArray) \
+_(exporterCertificate, 275, octetArray) \
+_(dataRecordsReliability, 276, boolean) \
+_(observationPointType, 277, u8) \
+_(newConnectionDeltaCount, 278, u32) \
+_(connectionSumDurationSeconds, 279, u64) \
+_(connectionTransactionId, 280, u64) \
+_(postNATSourceIPv6Address, 281, ip6_address_t) \
+_(postNATDestinationIPv6Address, 282, ip6_address_t) \
+_(natPoolId, 283, u32) \
+_(natPoolName, 284, string) \
+_(anonymizationFlags, 285, u16) \
+_(anonymizationTechnique, 286, u16) \
+_(informationElementIndex, 287, u16) \
+_(p2pTechnology, 288, string) \
+_(tunnelTechnology, 289, string) \
+_(encryptedTechnology, 290, string) \
+_(basicList, 291, basicList) \
+_(subTemplateList, 292, subTemplateList) \
+_(subTemplateMultiList, 293, subTemplateMultiList) \
+_(bgpValidityState, 294, u8) \
+_(IPSecSPI, 295, u32) \
+_(greKey, 296, u32) \
+_(natType, 297, u8) \
+_(initiatorPackets, 298, u64) \
+_(responderPackets, 299, u64) \
+_(observationDomainName, 300, string) \
+_(selectionSequenceId, 301, u64) \
+_(selectorId, 302, u64) \
+_(informationElementId, 303, u16) \
+_(selectorAlgorithm, 304, u16) \
+_(samplingPacketInterval, 305, u32) \
+_(samplingPacketSpace, 306, u32) \
+_(samplingTimeInterval, 307, u32) \
+_(samplingTimeSpace, 308, u32) \
+_(samplingSize, 309, u32) \
+_(samplingPopulation, 310, u32) \
+_(samplingProbability, 311, float64) \
+_(dataLinkFrameSize, 312, u16) \
+_(ipHeaderPacketSection, 313, octetArray) \
+_(ipPayloadPacketSection, 314, octetArray) \
+_(dataLinkFrameSection, 315, octetArray) \
+_(mplsLabelStackSection, 316, octetArray) \
+_(mplsPayloadPacketSection, 317, octetArray) \
+_(selectorIdTotalPktsObserved, 318, u64) \
+_(selectorIdTotalPktsSelected, 319, u64) \
+_(absoluteError, 320, float64) \
+_(relativeError, 321, float64) \
+_(observationTimeSeconds, 322, dateTimeSeconds) \
+_(observationTimeMilliseconds, 323, dateTimeMilliseconds) \
+_(observationTimeMicroseconds, 324, dateTimeMicroseconds) \
+_(observationTimeNanoseconds, 325, dateTimeNanoseconds) \
+_(digestHashValue, 326, u64) \
+_(hashIPPayloadOffset, 327, u64) \
+_(hashIPPayloadSize, 328, u64) \
+_(hashOutputRangeMin, 329, u64) \
+_(hashOutputRangeMax, 330, u64) \
+_(hashSelectedRangeMin, 331, u64) \
+_(hashSelectedRangeMax, 332, u64) \
+_(hashDigestOutput, 333, boolean) \
+_(hashInitialiserValue, 334, u64) \
+_(selectorName, 335, string) \
+_(upperCILimit, 336, float64) \
+_(lowerCILimit, 337, float64) \
+_(confidenceLevel, 338, float64) \
+_(informationElementDataType, 339, u8) \
+_(informationElementDescription, 340, string) \
+_(informationElementName, 341, string) \
+_(informationElementRangeBegin, 342, u64) \
+_(informationElementRangeEnd, 343, u64) \
+_(informationElementSemantics, 344, u8) \
+_(informationElementUnits, 345, u16) \
+_(privateEnterpriseNumber, 346, u32) \
+_(virtualStationInterfaceId, 347, octetArray) \
+_(virtualStationInterfaceName, 348, string) \
+_(virtualStationUUID, 349, octetArray) \
+_(virtualStationName, 350, string) \
+_(layer2SegmentId, 351, u64) \
+_(layer2OctetDeltaCount, 352, u64) \
+_(layer2OctetTotalCount, 353, u64) \
+_(ingressUnicastPacketTotalCount, 354, u64) \
+_(ingressMulticastPacketTotalCount, 355, u64) \
+_(ingressBroadcastPacketTotalCount, 356, u64) \
+_(egressUnicastPacketTotalCount, 357, u64) \
+_(egressBroadcastPacketTotalCount, 358, u64) \
+_(monitoringIntervalStartMilliSeconds, 359, dateTimeMilliseconds) \
+_(monitoringIntervalEndMilliSeconds, 360, dateTimeMilliseconds) \
+_(portRangeStart, 361, u16) \
+_(portRangeEnd, 362, u16) \
+_(portRangeStepSize, 363, u16) \
+_(portRangeNumPorts, 364, u16) \
+_(staMacAddress, 365, macAddress) \
+_(staIPv4Address, 366, ip4_address_t) \
+_(wtpMacAddress, 367, macAddress ) \
+_(ingressInterfaceType, 368, u32) \
+_(egressInterfaceType, 369, u32) \
+_(rtpSequenceNumber, 370, u16) \
+_(userName, 371, string) \
+_(applicationCategoryName, 372, string) \
+_(applicationSubCategoryName, 373, string) \
+_(applicationGroupName, 374, string) \
+_(originalFlowsPresent, 375, u64) \
+_(originalFlowsInitiated, 376, u64) \
+_(originalFlowsCompleted, 377, u64) \
+_(distinctCountOfSourceIPAddress, 378, u64) \
+_(distinctCountOfDestinationIPAddress, 379, u64) \
+_(distinctCountOfSourceIPv4Address, 380, u32) \
+_(distinctCountOfDestinationIPv4Address, 381, u32) \
+_(distinctCountOfSourceIPv6Address, 382, u64) \
+_(distinctCountOfDestinationIPv6Address, 383, u64) \
+_(valueDistributionMethod, 384, u8) \
+_(rfc3550JitterMilliseconds, 385, u32) \
+_(rfc3550JitterMicroseconds, 386, u32) \
+_(rfc3550JitterNanoseconds, 387, u32) \
+_(dot1qDEI, 388, boolean) \
+_(dot1qCustomerDEI, 389, boolean) \
+_(flowSelectorAlgorithm, 390, u16) \
+_(flowSelectedOctetDeltaCount, 391, u64) \
+_(flowSelectedPacketDeltaCount, 392, u64) \
+_(flowSelectedFlowDeltaCount, 393, u64) \
+_(selectorIDTotalFlowsObserved, 394, u64) \
+_(selectorIDTotalFlowsSelected, 395, u64) \
+_(samplingFlowInterval, 396, u64) \
+_(samplingFlowSpacing, 397, u64) \
+_(flowSamplingTimeInterval, 398, u64) \
+_(flowSamplingTimeSpacing, 399, u64) \
+_(hashFlowDomain, 400, u16) \
+_(transportOctetDeltaCount, 401, u64) \
+_(transportPacketDeltaCount, 402, u64) \
+_(originalExporterIPv4Address, 403, ip4_address_t) \
+_(originalExporterIPv6Address, 404, ip6_address_t) \
+_(originalObservationDomainId, 405, u32) \
+_(intermediateProcessId, 406, u32) \
+_(ignoredDataRecordTotalCount, 407, u64) \
+_(dataLinkFrameType, 408, u16) \
+_(sectionOffset, 409, u16) \
+_(sectionExportedOctets, 410, u16) \
+_(dot1qServiceInstanceTag, 411, octetArray) \
+_(dot1qServiceInstanceId, 412, u32) \
+_(dot1qServiceInstancePriority, 413, u8) \
+_(dot1qCustomerSourceMacAddress, 414, macAddress) \
+_(dot1qCustomerDestinationMacAddress, 415, macAddress) \
+_(postLayer2OctetDeltaCount, 417, u64) \
+_(postMCastLayer2OctetDeltaCount, 418, u64) \
+_(postLayer2OctetTotalCount, 420, u64) \
+_(postMCastLayer2OctetTotalCount, 421, u64) \
+_(minimumLayer2TotalLength, 422, u64) \
+_(maximumLayer2TotalLength, 423, u64) \
+_(droppedLayer2OctetDeltaCount, 424, u64) \
+_(droppedLayer2OctetTotalCount, 425, u64) \
+_(ignoredLayer2OctetTotalCount, 426, u64) \
+_(notSentLayer2OctetTotalCount, 427, u64) \
+_(layer2OctetDeltaSumOfSquares, 428, u64) \
+_(layer2OctetTotalSumOfSquares, 429, u64) \
+_(layer2FrameDeltaCount, 430, u64) \
+_(layer2FrameTotalCount, 431, u64) \
+_(pseudoWireDestinationIPv4Address, 432, ip4_address_t) \
+_(ignoredLayer2FrameTotalCount, 433, u64)
+
+typedef enum {
+#define _(n,v,t) n = v,
+ foreach_ipfix_info_element_t
+#undef _
+} ipfix_info_element_id_t;
+
+#endif /* __included_ipfix_info_elements_h__ */
diff --git a/vnet/vnet/flow/ipfix_packet.h b/vnet/vnet/flow/ipfix_packet.h
new file mode 100644
index 00000000000..329796191de
--- /dev/null
+++ b/vnet/vnet/flow/ipfix_packet.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ipfix_packet_h__
+#define __included_ipfix_packet_h__
+
+#include <vnet/flow/ipfix_info_elements.h>
+
+/* From RFC-7011:
+ * https://tools.ietf.org/html/rfc7011
+ */
+
+typedef struct {
+ u32 version_length;
+ u32 export_time;
+ u32 sequence_number;
+ u32 domain_id;
+} ipfix_message_header_t;
+
+static inline u32 version_length (u16 length)
+{
+ return clib_host_to_net_u32 (0x000a0000 | length);
+}
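+
+/*
+ * Editorial sketch (assumed helper, not part of the original patch):
+ * filling the message header for an IPFIX message of msg_len octets,
+ * with all fields in network byte order.
+ */
+static inline void
+example_fill_ipfix_message_header (ipfix_message_header_t * h,
+                                   u16 msg_len, u32 export_time,
+                                   u32 sequence, u32 domain)
+{
+  h->version_length = version_length (msg_len); /* version 10 + length */
+  h->export_time = clib_host_to_net_u32 (export_time);
+  h->sequence_number = clib_host_to_net_u32 (sequence);
+  h->domain_id = clib_host_to_net_u32 (domain);
+}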
+
+
+/*
+ * The Field Specifier format is shown in Figure G.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |E| Information Element ident. | Field Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Enterprise Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Figure G: Field Specifier Format
+ *
+ * Where:
+ *
+ * E
+ *
+ * Enterprise bit. This is the first bit of the Field Specifier. If
+ * this bit is zero, the Information Element identifier identifies an
+ * Information Element in [IANA-IPFIX], and the four-octet Enterprise
+ * Number field MUST NOT be present. If this bit is one, the
+ * Information Element identifier identifies an enterprise-specific
+ * Information Element, and the Enterprise Number field MUST be
+ * present.
+ */
+
+typedef struct {
+ u32 e_id_length;
+ u32 enterprise;
+} ipfix_enterprise_field_specifier_t;
+
+typedef struct {
+ u32 e_id_length;
+} ipfix_field_specifier_t;
+
+static inline u32 ipfix_e_id_length (int e, u16 id, u16 length)
+{
+ u32 value;
+ value = (e<<31) | ((id&0x7FFF) <<16) | length;
+ return clib_host_to_net_u32 (value);
+}
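+
+/*
+ * Editorial example: an IANA specifier (enterprise bit clear) for the
+ * 4-octet sourceIPv4Address element (id 8) is
+ *   ipfix_e_id_length (0, 8, 4)
+ * which puts 0x00080004 on the wire.
+ */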
+
+/*
+ * Every Set contains a common header. This header is defined in
+ * Figure I.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Set ID | Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Figure I: Set Header Format
+ *
+ * Each Set Header field is exported in network format. The fields are
+ * defined as follows:
+ *
+ * Set ID
+ *
+ * Identifies the Set. A value of 2 is reserved for Template Sets.
+ * A value of 3 is reserved for Options Template Sets. Values from 4
+ * to 255 are reserved for future use. Values 256 and above are used
+ * for Data Sets. The Set ID values of 0 and 1 are not used, for
+ * historical reasons [RFC3954].
+ *
+ * Length
+ *
+ * Total length of the Set, in octets, including the Set Header, all
+ * records, and the optional padding. Because an individual Set MAY
+ * contain multiple records, the Length value MUST be used to
+ * determine the position of the next Set.
+ */
+
+typedef struct {
+ u32 set_id_length;
+} ipfix_set_header_t;
+
+static inline u32 ipfix_set_id_length (u16 set_id, u16 length)
+{
+ return clib_host_to_net_u32 ((set_id<<16) | length);
+}
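+
+/*
+ * Editorial example: a Data Set using Template ID 256 that carries 20
+ * octets of records has ipfix_set_id_length (256, 24), i.e. 0x01000018
+ * on the wire (the length includes the 4-octet set header itself).
+ */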
+
+/*
+ * The format of the Template Record is shown in Figure J. It consists
+ * of a Template Record Header and one or more Field Specifiers. Field
+ * Specifiers are defined in Figure G above.
+ *
+ * +--------------------------------------------------+
+ * | Template Record Header |
+ * +--------------------------------------------------+
+ * | Field Specifier |
+ * +--------------------------------------------------+
+ * | Field Specifier |
+ * +--------------------------------------------------+
+ * ...
+ * +--------------------------------------------------+
+ * | Field Specifier |
+ * +--------------------------------------------------+
+ *
+ * Figure J: Template Record Format
+ *
+ * The format of the Template Record Header is shown in Figure K.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Template ID (> 255) | Field Count |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Figure K: Template Record Header Format
+ *
+ * The Template Record Header Field definitions are as follows:
+ *
+ * Template ID
+ *
+ * Each Template Record is given a unique Template ID in the range
+ * 256 to 65535. This uniqueness is local to the Transport Session
+ * and Observation Domain that generated the Template ID. Since
+ * Template IDs are used as Set IDs in the Sets they describe (see
+ * Section 3.4.3), values 0-255 are reserved for special Set types
+ * (e.g., Template Sets themselves), and Templates and Options
+ * Templates (see Section 3.4.2) cannot share Template IDs within a
+ * Transport Session and Observation Domain. There are no
+ * constraints regarding the order of the Template ID allocation. As
+ * Exporting Processes are free to allocate Template IDs as they see
+ * fit, Collecting Processes MUST NOT assume incremental Template
+ * IDs, or anything about the contents of a Template based on its
+ * Template ID alone.
+ *
+ * Field Count
+ *
+ * Number of fields in this Template Record.
+ */
+
+typedef struct {
+ u32 id_count;
+} ipfix_template_header_t;
+
+static inline u32 ipfix_id_count (u16 id, u16 count)
+{
+ return clib_host_to_net_u32 ((id<<16) | count);
+}
+
+/* Template packet */
+typedef struct {
+ ipfix_message_header_t h;
+ ipfix_set_header_t s;
+ ipfix_template_header_t t;
+ ipfix_field_specifier_t fields[0];
+} ipfix_template_packet_t;
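+
+/*
+ * Editorial sketch (assumed helper, not part of the original patch):
+ * total size of a template packet carrying n_fields IANA field
+ * specifiers (no enterprise numbers, so 4 octets per specifier).
+ */
+static inline u32
+example_template_packet_bytes (u32 n_fields)
+{
+  return sizeof (ipfix_template_packet_t)
+    + n_fields * sizeof (ipfix_field_specifier_t);
+}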
+
+#endif /* __included_ipfix_packet_h__ */
diff --git a/vnet/vnet/global_funcs.h b/vnet/vnet/global_funcs.h
new file mode 100644
index 00000000000..3958d88fc1e
--- /dev/null
+++ b/vnet/vnet/global_funcs.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * global_funcs.h: global data structure access functions
+ */
+
+#ifndef included_vnet_global_funcs_h_
+#define included_vnet_global_funcs_h_
+
+vnet_main_t * vnet_get_main (void);
+
+#endif /* included_vnet_global_funcs_h_ */
diff --git a/vnet/vnet/gre/error.def b/vnet/vnet/gre/error.def
new file mode 100644
index 00000000000..161ecc1d874
--- /dev/null
+++ b/vnet/vnet/gre/error.def
@@ -0,0 +1,23 @@
+/*
+ * gre_error.def: gre errors
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+gre_error (NONE, "no error")
+gre_error (UNKNOWN_PROTOCOL, "unknown protocol")
+gre_error (UNSUPPORTED_VERSION, "unsupported version")
+gre_error (PKTS_DECAP, "GRE input packets decapsulated")
+gre_error (PKTS_ENCAP, "GRE output packets encapsulated")
+gre_error (NO_SUCH_TUNNEL, "GRE input packets dropped due to missing tunnel")
diff --git a/vnet/vnet/gre/gre.c b/vnet/vnet/gre/gre.c
new file mode 100644
index 00000000000..c09816a2962
--- /dev/null
+++ b/vnet/vnet/gre/gre.c
@@ -0,0 +1,512 @@
+/*
+ * gre.c: gre
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/gre/gre.h>
+
+gre_main_t gre_main;
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4;
+ gre_header_t gre;
+}) ip4_and_gre_header_t;
+
+typedef struct {
+ union {
+ ip4_and_gre_header_t ip4_and_gre;
+ u64 as_u64[3];
+ };
+} ip4_and_gre_union_t;
+
+
+/* Packet trace structure */
+typedef struct {
+ /* Tunnel-id / index in tunnel vector */
+ u32 tunnel_id;
+
+ /* pkt length */
+ u32 length;
+
+ /* tunnel ip4 addresses */
+ ip4_address_t src;
+ ip4_address_t dst;
+} gre_tx_trace_t;
+
+u8 * format_gre_tx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ gre_tx_trace_t * t = va_arg (*args, gre_tx_trace_t *);
+
+ s = format (s, "GRE: tunnel %d len %d src %U dst %U",
+ t->tunnel_id, clib_net_to_host_u16 (t->length),
+ format_ip4_address, &t->src.as_u8,
+ format_ip4_address, &t->dst.as_u8);
+ return s;
+}
+
+u8 * format_gre_protocol (u8 * s, va_list * args)
+{
+ gre_protocol_t p = va_arg (*args, u32);
+ gre_main_t * gm = &gre_main;
+ gre_protocol_info_t * pi = gre_get_protocol_info (gm, p);
+
+ if (pi)
+ s = format (s, "%s", pi->name);
+ else
+ s = format (s, "0x%04x", p);
+
+ return s;
+}
+
+u8 * format_gre_header_with_length (u8 * s, va_list * args)
+{
+ gre_main_t * gm = &gre_main;
+ gre_header_t * h = va_arg (*args, gre_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ gre_protocol_t p = clib_net_to_host_u16 (h->protocol);
+ uword indent, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "gre header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "GRE %U", format_gre_protocol, p);
+
+  /* Format the payload only when bytes remain beyond the GRE header;
+     the truncated case already returned above. */
+  if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+    {
+      gre_protocol_info_t * pi = gre_get_protocol_info (gm, p);
+
+      /* pi may be 0 for an unregistered protocol */
+      if (pi && pi->node_index != (u32) ~0)
+        {
+          vlib_node_t * node = vlib_get_node (gm->vlib_main, pi->node_index);
+          if (node->format_buffer)
+            s = format (s, "\n%U%U",
+                        format_white_space, indent,
+                        node->format_buffer, (void *) (h + 1),
+                        max_header_bytes - header_bytes);
+        }
+    }
+
+ return s;
+}
+
+u8 * format_gre_header (u8 * s, va_list * args)
+{
+ gre_header_t * h = va_arg (*args, gre_header_t *);
+ return format (s, "%U", format_gre_header_with_length, h, 0);
+}
+
+/* Returns gre protocol as an int in host byte order. */
+uword
+unformat_gre_protocol_host_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ gre_main_t * gm = &gre_main;
+ int i;
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ gm->protocol_info_by_name, &i))
+ {
+ gre_protocol_info_t * pi = vec_elt_at_index (gm->protocol_infos, i);
+ *result = pi->protocol;
+ return 1;
+ }
+
+ return 0;
+}
+
+uword
+unformat_gre_protocol_net_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ if (! unformat_user (input, unformat_gre_protocol_host_byte_order, result))
+ return 0;
+ *result = clib_host_to_net_u16 ((u16) *result);
+ return 1;
+}
+
+uword
+unformat_gre_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ gre_header_t _h, * h = &_h;
+ u16 p;
+
+ if (! unformat (input, "%U",
+ unformat_gre_protocol_host_byte_order, &p))
+ return 0;
+
+ h->protocol = clib_host_to_net_u16 (p);
+
+ /* Add header to result. */
+ {
+    void * hp; /* renamed from 'p', which shadowed the protocol above */
+    u32 n_bytes = sizeof (h[0]);
+
+    vec_add2 (*result, hp, n_bytes);
+    memcpy (hp, h, n_bytes);
+ }
+
+ return 1;
+}
+
+static uword gre_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ /*
+ * Conundrum: packets from tun/tap destined for the tunnel
+ * actually have this rewrite applied. Transit packets do not.
+ * To make the two cases equivalent, don't generate a
+ * rewrite here, build the entire header in the fast path.
+ */
+ return 0;
+
+#ifdef THINGS_WORKED_AS_ONE_MIGHT_LIKE
+ ip4_and_gre_header_t * h = rewrite;
+ gre_protocol_t protocol;
+
+ if (max_rewrite_bytes < sizeof (h[0]))
+ return 0;
+
+ switch (l3_type) {
+#define _(a,b) case VNET_L3_PACKET_TYPE_##a: protocol = GRE_PROTOCOL_##b; break
+ _ (IP4, ip4);
+ _ (IP6, ip6);
+#undef _
+ default:
+ return 0;
+ }
+
+ memset (h, 0, sizeof (*h));
+ h->ip4.ip_version_and_header_length = 0x45;
+ h->ip4.ttl = 64;
+ h->ip4.protocol = IP_PROTOCOL_GRE;
+ h->gre.protocol = clib_host_to_net_u16 (protocol);
+
+ return sizeof (h[0]);
+#endif
+}
+
+static uword
+gre_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ gre_main_t * gm = &gre_main;
+ u32 next_index;
+ u32 * from, * to_next, n_left_from, n_left_to_next;
+ vnet_interface_output_runtime_t * rd = (void *) node->runtime_data;
+ gre_tunnel_t *t = pool_elt_at_index (gm->tunnels, rd->dev_instance);
+
+ /* Vector of buffer / pkt indices we're supposed to process */
+ from = vlib_frame_vector_args (frame);
+
+ /* Number of buffers / pkts */
+ n_left_from = frame->n_vectors;
+
+ /* Speculatively send the first buffer to the last disposition we used */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ /* set up to enqueue to our disposition with index = next_index */
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ /*
+ * As long as we have enough pkts left to process two pkts
+ * and prefetch two pkts...
+ */
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ ip4_header_t * ip0, * ip1;
+ ip4_and_gre_union_t * h0, * h1;
+ u32 bi0, next0, bi1, next1;
+ __attribute__((unused)) u8 error0, error1;
+ u16 gre_protocol0, gre_protocol1;
+
+ /* Prefetch the next iteration */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ /*
+ * Prefetch packet data. We expect to overwrite
+ * the inbound L2 header with an ip header and a
+ * gre header. Might want to prefetch the last line
+ * of rewrite space as well; need profile data
+ */
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* Pick up the next two buffer indices */
+ bi0 = from[0];
+ bi1 = from[1];
+
+ /* Speculatively enqueue them where we sent the last buffer */
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = t->outer_fib_index;
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = t->outer_fib_index;
+
+ ip0 = vlib_buffer_get_current (b0);
+          /* GRE protocol in network byte order, chosen from the inner
+             IP version nibble; the ip6 branch previously skipped the
+             byte swap. */
+          gre_protocol0 = clib_host_to_net_u16 (0x800);
+          gre_protocol0 =
+            ((ip0->ip_version_and_header_length & 0xF0) == 0x60) ?
+            clib_host_to_net_u16 (0x86DD) : gre_protocol0;
+
+ ip1 = vlib_buffer_get_current (b1);
+          gre_protocol1 = clib_host_to_net_u16 (0x800);
+          gre_protocol1 =
+            ((ip1->ip_version_and_header_length & 0xF0) == 0x60) ?
+            clib_host_to_net_u16 (0x86DD) : gre_protocol1;
+
+ vlib_buffer_advance (b0, -sizeof(*h0));
+ vlib_buffer_advance (b1, -sizeof(*h1));
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+ h0->as_u64[0] = 0;
+ h0->as_u64[1] = 0;
+ h0->as_u64[2] = 0;
+
+ h1->as_u64[0] = 0;
+ h1->as_u64[1] = 0;
+ h1->as_u64[2] = 0;
+
+ ip0 = &h0->ip4_and_gre.ip4;
+ h0->ip4_and_gre.gre.protocol = gre_protocol0;
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_GRE;
+
+ ip1 = &h1->ip4_and_gre.ip4;
+ h1->ip4_and_gre.gre.protocol = gre_protocol1;
+ ip1->ip_version_and_header_length = 0x45;
+ ip1->ttl = 254;
+ ip1->protocol = IP_PROTOCOL_GRE;
+
+ ip0->length =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+ ip1->length =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
+ ip0->src_address.as_u32 = t->tunnel_src.as_u32;
+ ip1->src_address.as_u32 = t->tunnel_src.as_u32;
+ ip0->dst_address.as_u32 = t->tunnel_dst.as_u32;
+ ip1->dst_address.as_u32 = t->tunnel_dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+ ip1->checksum = ip4_header_checksum (ip1);
+
+ /* ip4_lookup will route to the tunnel partner */
+ next0 = GRE_OUTPUT_NEXT_LOOKUP;
+ next1 = GRE_OUTPUT_NEXT_LOOKUP;
+ error0 = GRE_ERROR_NONE;
+ error1 = GRE_ERROR_NONE;
+
+ /*
+ * Enqueue 2 pkts. This macro deals with next0 != next1,
+ * acquiring enqueue rights to the indicated next
+ * node input frame, etc.
+ */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * b0;
+ ip4_header_t * ip0;
+ ip4_and_gre_union_t * h0;
+ u32 bi0, next0;
+ __attribute__((unused)) u8 error0;
+ u16 gre_protocol0;
+
+ bi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = t->outer_fib_index;
+ ip0 = vlib_buffer_get_current (b0);
+          gre_protocol0 = clib_host_to_net_u16 (0x800);
+          gre_protocol0 =
+            ((ip0->ip_version_and_header_length & 0xF0) == 0x60) ?
+            clib_host_to_net_u16 (0x86DD) : gre_protocol0;
+
+ vlib_buffer_advance (b0, -sizeof(*h0));
+
+ h0 = vlib_buffer_get_current (b0);
+ h0->as_u64[0] = 0;
+ h0->as_u64[1] = 0;
+ h0->as_u64[2] = 0;
+
+ ip0 = &h0->ip4_and_gre.ip4;
+ h0->ip4_and_gre.gre.protocol = gre_protocol0;
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_GRE;
+ ip0->length =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+ ip0->src_address.as_u32 = t->tunnel_src.as_u32;
+ ip0->dst_address.as_u32 = t->tunnel_dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+
+ next0 = GRE_OUTPUT_NEXT_LOOKUP;
+ error0 = GRE_ERROR_NONE;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ gre_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_id = t - gm->tunnels;
+ tr->length = ip0->length;
+ tr->src.as_u32 = ip0->src_address.as_u32;
+ tr->dst.as_u32 = ip0->dst_address.as_u32;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
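+  /* Editorial note: encap counters are deliberately kept on the
+     gre-input node, which owns the GRE error/counter strings. */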
+ vlib_node_increment_counter (vm, gre_input_node.index,
+ GRE_ERROR_PKTS_ENCAP, frame->n_vectors);
+
+ return frame->n_vectors;
+}
+
+static u8 * format_gre_tunnel_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "gre%d", dev_instance);
+}
+
+static u8 * format_gre_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ CLIB_UNUSED (int verbose) = va_arg (*args, int);
+
+ s = format (s, "GRE tunnel: id %d\n", dev_instance);
+ return s;
+}
+
+VNET_DEVICE_CLASS (gre_device_class) = {
+ .name = "GRE tunnel device",
+ .format_device_name = format_gre_tunnel_name,
+ .format_device = format_gre_device,
+ .format_tx_trace = format_gre_tx_trace,
+ .tx_function = gre_interface_tx,
+#ifdef SOON
+  .clear_counters = 0,
+  .admin_up_down_function = 0,
+#endif
+};
+
+
+VNET_HW_INTERFACE_CLASS (gre_hw_interface_class) = {
+ .name = "GRE",
+ .format_header = format_gre_header_with_length,
+ .unformat_header = unformat_gre_header,
+ .set_rewrite = gre_set_rewrite,
+};
+
+static void add_protocol (gre_main_t * gm,
+ gre_protocol_t protocol,
+ char * protocol_name)
+{
+ gre_protocol_info_t * pi;
+ u32 i;
+
+ vec_add2 (gm->protocol_infos, pi, 1);
+ i = pi - gm->protocol_infos;
+
+ pi->name = protocol_name;
+ pi->protocol = protocol;
+ pi->next_index = pi->node_index = ~0;
+
+ hash_set (gm->protocol_info_by_protocol, protocol, i);
+ hash_set_mem (gm->protocol_info_by_name, pi->name, i);
+}
+
+static clib_error_t * gre_init (vlib_main_t * vm)
+{
+ gre_main_t * gm = &gre_main;
+ clib_error_t * error;
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+
+ memset (gm, 0, sizeof (gm[0]));
+ gm->vlib_main = vm;
+ gm->vnet_main = vnet_get_main();
+
+ if ((error = vlib_call_init_function (vm, ip_main_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip4_lookup_init)))
+ return error;
+
+ /* Set up the ip packet generator */
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_GRE);
+ pi->format_header = format_gre_header;
+ pi->unformat_pg_edit = unformat_pg_gre_header;
+
+ gm->protocol_info_by_name = hash_create_string (0, sizeof (uword));
+ gm->protocol_info_by_protocol = hash_create (0, sizeof (uword));
+ gm->tunnel_by_key = hash_create (0, sizeof (uword));
+
+#define _(n,s) add_protocol (gm, GRE_PROTOCOL_##s, #s);
+ foreach_gre_protocol
+#undef _
+
+ return vlib_call_init_function (vm, gre_input_init);
+}
+
+VLIB_INIT_FUNCTION (gre_init);
+
+gre_main_t * gre_get_main (vlib_main_t * vm)
+{
+ vlib_call_init_function (vm, gre_init);
+ return &gre_main;
+}
+
diff --git a/vnet/vnet/gre/gre.h b/vnet/vnet/gre/gre.h
new file mode 100644
index 00000000000..c0689f60ddf
--- /dev/null
+++ b/vnet/vnet/gre/gre.h
@@ -0,0 +1,118 @@
+/*
+ * gre.h: types/functions for gre.
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_gre_h
+#define included_gre_h
+
+#include <vnet/vnet.h>
+#include <vnet/gre/packet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ip/format.h>
+
+vnet_hw_interface_class_t gre_hw_interface_class;
+
+typedef enum {
+#define gre_error(n,s) GRE_ERROR_##n,
+#include <vnet/gre/error.def>
+#undef gre_error
+ GRE_N_ERROR,
+} gre_error_t;
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+ /* GRE protocol type in host byte order. */
+ gre_protocol_t protocol;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} gre_protocol_info_t;
+
+typedef struct {
+ ip4_address_t tunnel_src;
+ ip4_address_t tunnel_dst;
+ u32 outer_fib_index;
+ u32 hw_if_index;
+} gre_tunnel_t;
+
+typedef struct {
+ /* pool of tunnel instances */
+ gre_tunnel_t *tunnels;
+
+ gre_protocol_info_t * protocol_infos;
+
+ /* Hash tables mapping name/protocol to protocol info index. */
+ uword * protocol_info_by_name, * protocol_info_by_protocol;
+ /* Hash mapping src/dst addr pair to tunnel */
+ uword * tunnel_by_key;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} gre_main_t;
+
+always_inline gre_protocol_info_t *
+gre_get_protocol_info (gre_main_t * em, gre_protocol_t protocol)
+{
+ uword * p = hash_get (em->protocol_info_by_protocol, protocol);
+ return p ? vec_elt_at_index (em->protocol_infos, p[0]) : 0;
+}
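+
+/*
+ * Editorial usage note: callers pass the protocol in host byte order,
+ * e.g.
+ *   gre_get_protocol_info (&gre_main,
+ *                          clib_net_to_host_u16 (h->protocol));
+ * and must handle a 0 return for unregistered protocols.
+ */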
+
+extern gre_main_t gre_main;
+
+/* Register given node index to take input for given gre type. */
+void
+gre_register_input_type (vlib_main_t * vm,
+ gre_protocol_t protocol,
+ u32 node_index);
+
+void gre_set_adjacency (vnet_rewrite_header_t * rw,
+ uword max_data_bytes,
+ gre_protocol_t protocol);
+
+format_function_t format_gre_protocol;
+format_function_t format_gre_header;
+format_function_t format_gre_header_with_length;
+
+vlib_node_registration_t gre_input_node;
+vnet_device_class_t gre_device_class;
+
+/* Parse gre protocol as 0xXXXX or protocol name.
+ In either host or network byte order. */
+unformat_function_t unformat_gre_protocol_host_byte_order;
+unformat_function_t unformat_gre_protocol_net_byte_order;
+
+/* Parse gre header. */
+unformat_function_t unformat_gre_header;
+unformat_function_t unformat_pg_gre_header;
+
+void
+gre_register_input_protocol (vlib_main_t * vm,
+ gre_protocol_t protocol,
+ u32 node_index);
+
+/* manually added to the tunnel tx node's next slots in interface.c */
+#define GRE_OUTPUT_NEXT_LOOKUP 1
+
+#endif /* included_gre_h */
diff --git a/vnet/vnet/gre/interface.c b/vnet/vnet/gre/interface.c
new file mode 100644
index 00000000000..72ce0962fc7
--- /dev/null
+++ b/vnet/vnet/gre/interface.c
@@ -0,0 +1,150 @@
+/*
+ * gre_interface.c: gre interfaces
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/gre/gre.h>
+
+int
+gre_register_interface (vnet_main_t * vnm,
+ u32 dev_class_index,
+ ip4_address_t *tunnel_src,
+ ip4_address_t *tunnel_dst,
+ u32 outer_fib_id,
+ u32 * gi_index_return)
+{
+ gre_main_t * gm = &gre_main;
+ ip4_main_t * im = &ip4_main;
+ gre_tunnel_t * t;
+ vnet_hw_interface_t * hi;
+ u32 hw_if_index;
+ u32 slot;
+ u32 outer_fib_index;
+ uword * p;
+
+ u64 key = (u64)tunnel_src->as_u32 << 32 | (u64)tunnel_dst->as_u32;
+
+ /* check if same src/dst pair exists */
+ if (hash_get (gm->tunnel_by_key, key))
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ p = hash_get (im->fib_index_by_table_id, outer_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ outer_fib_index = p[0];
+
+ pool_get (gm->tunnels, t);
+ memset (t, 0, sizeof (*t));
+
+ hw_if_index = vnet_register_interface
+ (vnm, gre_device_class.index, t - gm->tunnels,
+ gre_hw_interface_class.index,
+ t - gm->tunnels);
+
+ *gi_index_return = t - gm->tunnels;
+
+ t->hw_if_index = hw_if_index;
+ t->outer_fib_index = outer_fib_index;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ hi->min_packet_bytes = 64 + sizeof (gre_header_t) + sizeof (ip4_header_t);
+ hi->per_packet_overhead_bytes =
+ /* preamble */ 8 + /* inter frame gap */ 12;
+
+ /* Standard default gre MTU. */
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = 1500;
+
+ memcpy (&t->tunnel_src, tunnel_src, sizeof (t->tunnel_src));
+ memcpy (&t->tunnel_dst, tunnel_dst, sizeof (t->tunnel_dst));
+
+ hash_set (gm->tunnel_by_key, key, t - gm->tunnels);
+
+ slot = vlib_node_add_named_next_with_slot
+ (vnm->vlib_main, hi->tx_node_index, "ip4-lookup", GRE_OUTPUT_NEXT_LOOKUP);
+
+ ASSERT (slot == GRE_OUTPUT_NEXT_LOOKUP);
+
+ return 0;
+}
+
+
+static clib_error_t *
+create_gre_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t src, dst;
+ u32 outer_fib_id = 0;
+ int rv;
+ u32 gi_index;
+ u32 num_m_args = 0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "src %U", unformat_ip4_address, &src))
+ num_m_args++;
+ else if (unformat (line_input, "dst %U", unformat_ip4_address, &dst))
+ num_m_args++;
+ else if (unformat (line_input, "outer-fib-id %d", &outer_fib_id))
+ ;
+ else
+      return clib_error_return (0, "unknown input `%U'",
+                                format_unformat_error, line_input);
+ }
+ unformat_free (line_input);
+
+ if (num_m_args < 2)
+ return clib_error_return (0, "mandatory argument(s) missing");
+
+ rv = gre_register_interface (vnm, gre_hw_interface_class.index,
+ &src, &dst, outer_fib_id, &gi_index);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_VALUE:
+ return clib_error_return (0, "GRE tunnel already exists...");
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "outer fib ID %d doesn't exist\n",
+ outer_fib_id);
+ default:
+ return clib_error_return (0, "gre_register_interface returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_gre_tunnel_command, static) = {
+ .path = "create gre tunnel",
+ .short_help = "create gre tunnel src <addr> dst <addr> [outer-fib-id <fib>]",
+ .function = create_gre_tunnel_command_fn,
+};
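+
+/*
+ * Editorial usage example (hypothetical addresses):
+ *   vpp# create gre tunnel src 10.0.0.1 dst 10.0.0.2 outer-fib-id 0
+ */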
+
+/* force inclusion from application's main.c */
+clib_error_t *gre_interface_init (vlib_main_t *vm)
+{
+ return 0;
+}
+VLIB_INIT_FUNCTION(gre_interface_init);
diff --git a/vnet/vnet/gre/node.c b/vnet/vnet/gre/node.c
new file mode 100644
index 00000000000..7d07223fc71
--- /dev/null
+++ b/vnet/vnet/gre/node.c
@@ -0,0 +1,533 @@
+/*
+ * node.c: gre packet processing
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/gre/gre.h>
+#include <vppinfra/sparse_vec.h>
+
+#define foreach_gre_input_next \
+_(PUNT, "error-punt") \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input")
+
+typedef enum {
+#define _(s,n) GRE_INPUT_NEXT_##s,
+ foreach_gre_input_next
+#undef _
+ GRE_INPUT_N_NEXT,
+} gre_input_next_t;
+
+typedef struct {
+ u32 tunnel_id;
+ u32 length;
+ ip4_address_t src;
+ ip4_address_t dst;
+} gre_rx_trace_t;
+
+u8 * format_gre_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ gre_rx_trace_t * t = va_arg (*args, gre_rx_trace_t *);
+
+ s = format (s, "GRE: tunnel %d len %d src %U dst %U",
+ t->tunnel_id, clib_net_to_host_u16(t->length),
+ format_ip4_address, &t->src.as_u8,
+ format_ip4_address, &t->dst.as_u8);
+ return s;
+}
+
+typedef struct {
+ /* Sparse vector mapping gre protocol in network byte order
+ to next index. */
+ u16 * next_by_protocol;
+
+ u32 * sparse_index_by_next_index;
+} gre_input_runtime_t;
+
+static uword
+gre_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ gre_main_t * gm = &gre_main;
+ gre_input_runtime_t * rt = (void *) node->runtime_data;
+ __attribute__((unused)) u32 n_left_from, next_index, i_next, * from, * to_next;
+ u64 cached_tunnel_key = (u64) ~0;
+ u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index;
+ u32 cached_tunnel_fib_index = 0, tunnel_fib_index;
+
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+ i_next = vec_elt (rt->sparse_index_by_next_index, next_index);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ gre_header_t * h0, * h1;
+ u16 version0, version1;
+ int verr0, verr1;
+ u32 i0, i1, next0, next1, protocol0, protocol1;
+ ip4_header_t *ip0, *ip1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* ip4_local hands us the ip header, not the gre header */
+ ip0 = vlib_buffer_get_current (b0);
+ ip1 = vlib_buffer_get_current (b1);
+
+ /* Save src + dst ip4 address, e.g. for mpls-o-gre */
+ vnet_buffer(b0)->gre.src = ip0->src_address.as_u32;
+ vnet_buffer(b0)->gre.dst = ip0->dst_address.as_u32;
+ vnet_buffer(b1)->gre.src = ip1->src_address.as_u32;
+ vnet_buffer(b1)->gre.dst = ip1->dst_address.as_u32;
+
+ vlib_buffer_advance (b0, sizeof (*ip0));
+ vlib_buffer_advance (b1, sizeof (*ip1));
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ /* Index sparse array with network byte order. */
+ protocol0 = h0->protocol;
+ protocol1 = h1->protocol;
+ sparse_vec_index2 (rt->next_by_protocol, protocol0, protocol1,
+ &i0, &i1);
+ next0 = vec_elt(rt->next_by_protocol, i0);
+ next1 = vec_elt(rt->next_by_protocol, i1);
+
+ b0->error = node->errors[next0 == SPARSE_VEC_INVALID_INDEX ? GRE_ERROR_UNKNOWN_PROTOCOL : GRE_ERROR_NONE];
+ b1->error = node->errors[next1 == SPARSE_VEC_INVALID_INDEX ? GRE_ERROR_UNKNOWN_PROTOCOL : GRE_ERROR_NONE];
+
+ version0 = clib_net_to_host_u16 (h0->flags_and_version);
+ verr0 = version0 & GRE_VERSION_MASK;
+ version1 = clib_net_to_host_u16 (h1->flags_and_version);
+ verr1 = version1 & GRE_VERSION_MASK;
+
+ b0->error = verr0 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
+ : b0->error;
+ next0 = verr0 ? GRE_INPUT_NEXT_DROP : next0;
+ b1->error = verr1 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
+ : b1->error;
+ next1 = verr1 ? GRE_INPUT_NEXT_DROP : next1;
+
+ /* RPF check for ip4/ip6 input */
+ if (PREDICT_FALSE(next0 == GRE_INPUT_NEXT_IP4_INPUT
+ || next0 == GRE_INPUT_NEXT_IP6_INPUT))
+ {
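+              /* Editorial note: the RX key deliberately swaps the
+                 packet's (src, dst): the packet's dst is the local
+                 tunnel_src, so this matches the
+                 (tunnel_src << 32 | tunnel_dst) key built by
+                 gre_register_interface(). */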
+ u64 key = ((u64)(vnet_buffer(b0)->gre.dst) << 32) |
+ (u64)(vnet_buffer(b0)->gre.src);
+
+ if (cached_tunnel_key != key)
+ {
+ vnet_hw_interface_t * hi;
+ gre_tunnel_t * t;
+ uword * p;
+
+ ip4_main_t * ip4m = &ip4_main;
+ p = hash_get (gm->tunnel_by_key, key);
+ if (!p)
+ {
+ next0 = GRE_INPUT_NEXT_DROP;
+ b0->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
+ goto drop0;
+ }
+ t = pool_elt_at_index (gm->tunnels, p[0]);
+ hi = vnet_get_hw_interface (gm->vnet_main,
+ t->hw_if_index);
+ tunnel_sw_if_index = hi->sw_if_index;
+ tunnel_fib_index = vec_elt (ip4m->fib_index_by_sw_if_index,
+ tunnel_sw_if_index);
+
+                  /* remember the key, otherwise the cache never hits */
+                  cached_tunnel_key = key;
+                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
+                  cached_tunnel_fib_index = tunnel_fib_index;
+ }
+ else
+ {
+ tunnel_sw_if_index = cached_tunnel_sw_if_index;
+ tunnel_fib_index = cached_tunnel_fib_index;
+ }
+
+ u32 len = vlib_buffer_length_in_chain (vm, b0);
+ vnet_interface_main_t *im = &gm->vnet_main->interface_main;
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ tunnel_sw_if_index,
+ 1 /* packets */,
+ len /* bytes */);
+
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = tunnel_fib_index;
+ }
+
+drop0:
+ if (PREDICT_FALSE(next1 == GRE_INPUT_NEXT_IP4_INPUT
+ || next1 == GRE_INPUT_NEXT_IP6_INPUT))
+ {
+ u64 key = ((u64)(vnet_buffer(b1)->gre.dst) << 32) |
+ (u64)(vnet_buffer(b1)->gre.src);
+
+ if (cached_tunnel_key != key)
+ {
+ vnet_hw_interface_t * hi;
+ gre_tunnel_t * t;
+ uword * p;
+
+ ip4_main_t * ip4m = &ip4_main;
+ p = hash_get (gm->tunnel_by_key, key);
+ if (!p)
+ {
+ next1 = GRE_INPUT_NEXT_DROP;
+ b1->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
+ goto drop1;
+ }
+ t = pool_elt_at_index (gm->tunnels, p[0]);
+ hi = vnet_get_hw_interface (gm->vnet_main,
+ t->hw_if_index);
+ tunnel_sw_if_index = hi->sw_if_index;
+ tunnel_fib_index = vec_elt (ip4m->fib_index_by_sw_if_index,
+ tunnel_sw_if_index);
+
+                  /* remember the key, otherwise the cache never hits */
+                  cached_tunnel_key = key;
+                  cached_tunnel_sw_if_index = tunnel_sw_if_index;
+                  cached_tunnel_fib_index = tunnel_fib_index;
+ }
+ else
+ {
+ tunnel_sw_if_index = cached_tunnel_sw_if_index;
+ tunnel_fib_index = cached_tunnel_fib_index;
+ }
+
+ u32 len = vlib_buffer_length_in_chain (vm, b1);
+ vnet_interface_main_t *im = &gm->vnet_main->interface_main;
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ tunnel_sw_if_index,
+ 1 /* packets */,
+ len /* bytes */);
+
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = tunnel_fib_index;
+ }
+drop1:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ gre_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_id = ~0;
+ tr->length = ip0->length;
+ tr->src.as_u32 = ip0->src_address.as_u32;
+ tr->dst.as_u32 = ip0->dst_address.as_u32;
+ }
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ gre_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->tunnel_id = ~0;
+ tr->length = ip1->length;
+ tr->src.as_u32 = ip1->src_address.as_u32;
+ tr->dst.as_u32 = ip1->dst_address.as_u32;
+ }
+
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_buffer_advance (b1, sizeof (*h1));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ gre_header_t * h0;
+ ip4_header_t * ip0;
+ u16 version0;
+ int verr0;
+ u32 i0, next0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (b0);
+
+ vnet_buffer(b0)->gre.src = ip0->src_address.as_u32;
+ vnet_buffer(b0)->gre.dst = ip0->dst_address.as_u32;
+
+ vlib_buffer_advance (b0, sizeof (*ip0));
+
+ h0 = vlib_buffer_get_current (b0);
+
+ i0 = sparse_vec_index (rt->next_by_protocol, h0->protocol);
+ next0 = vec_elt(rt->next_by_protocol, i0);
+
+ b0->error =
+ node->errors[next0 == SPARSE_VEC_INVALID_INDEX
+ ? GRE_ERROR_UNKNOWN_PROTOCOL : GRE_ERROR_NONE];
+
+ version0 = clib_net_to_host_u16 (h0->flags_and_version);
+ verr0 = version0 & GRE_VERSION_MASK;
+ b0->error = verr0 ? node->errors[GRE_ERROR_UNSUPPORTED_VERSION]
+ : b0->error;
+ next0 = verr0 ? GRE_INPUT_NEXT_DROP : next0;
+
+ /* For IP payload we need to find source interface
+ so we can increase counters and help forward node to
+ pick right FIB */
+ if (PREDICT_FALSE(next0 == GRE_INPUT_NEXT_IP4_INPUT
+ || next0 == GRE_INPUT_NEXT_IP6_INPUT))
+ {
+ u64 key = ((u64)(vnet_buffer(b0)->gre.dst) << 32) |
+ (u64)(vnet_buffer(b0)->gre.src);
+
+ if (cached_tunnel_key != key)
+ {
+ vnet_hw_interface_t * hi;
+ gre_tunnel_t * t;
+ uword * p;
+
+ ip4_main_t * ip4m = &ip4_main;
+ p = hash_get (gm->tunnel_by_key, key);
+ if (!p)
+ {
+ next0 = GRE_INPUT_NEXT_DROP;
+ b0->error = node->errors[GRE_ERROR_NO_SUCH_TUNNEL];
+ goto drop;
+ }
+ t = pool_elt_at_index (gm->tunnels, p[0]);
+ hi = vnet_get_hw_interface (gm->vnet_main,
+ t->hw_if_index);
+ tunnel_sw_if_index = hi->sw_if_index;
+ tunnel_fib_index = vec_elt (ip4m->fib_index_by_sw_if_index,
+ tunnel_sw_if_index);
+
+ cached_tunnel_sw_if_index = tunnel_sw_if_index;
+ cached_tunnel_fib_index = tunnel_fib_index;
+ }
+ else
+ {
+ tunnel_sw_if_index = cached_tunnel_sw_if_index;
+ tunnel_fib_index = cached_tunnel_fib_index;
+ }
+
+ u32 len = vlib_buffer_length_in_chain (vm, b0);
+ vnet_interface_main_t *im = &gm->vnet_main->interface_main;
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ cpu_index,
+ tunnel_sw_if_index,
+ 1 /* packets */,
+ len /* bytes */);
+
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = tunnel_fib_index;
+ }
+
+drop:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ gre_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_id = ~0;
+ tr->length = ip0->length;
+ tr->src.as_u32 = ip0->src_address.as_u32;
+ tr->dst.as_u32 = ip0->dst_address.as_u32;
+ }
+
+ vlib_buffer_advance (b0, sizeof (*h0));
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, gre_input_node.index,
+ GRE_ERROR_PKTS_DECAP, from_frame->n_vectors);
+ return from_frame->n_vectors;
+}
+
+static char * gre_error_strings[] = {
+#define gre_error(n,s) s,
+#include "error.def"
+#undef gre_error
+};
+
+VLIB_REGISTER_NODE (gre_input_node) = {
+ .function = gre_input,
+ .name = "gre-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof (gre_input_runtime_t),
+
+ .n_errors = GRE_N_ERROR,
+ .error_strings = gre_error_strings,
+
+ .n_next_nodes = GRE_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [GRE_INPUT_NEXT_##s] = n,
+ foreach_gre_input_next
+#undef _
+ },
+
+ .format_buffer = format_gre_header_with_length,
+ .format_trace = format_gre_rx_trace,
+ .unformat_buffer = unformat_gre_header,
+};
+
+void
+gre_register_input_protocol (vlib_main_t * vm,
+ gre_protocol_t protocol,
+ u32 node_index)
+{
+ gre_main_t * em = &gre_main;
+ gre_protocol_info_t * pi;
+ gre_input_runtime_t * rt;
+ u16 * n;
+ u32 i;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, gre_input_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ pi = gre_get_protocol_info (em, protocol);
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ gre_input_node.index,
+ node_index);
+
+  /* Set up the gre protocol -> next index sparse vector mapping. */
+ rt = vlib_node_get_runtime_data (vm, gre_input_node.index);
+ n = sparse_vec_validate (rt->next_by_protocol,
+ clib_host_to_net_u16 (protocol));
+ n[0] = pi->next_index;
+
+ /* Rebuild next index -> sparse index inverse mapping when sparse vector
+ is updated. */
+ vec_validate (rt->sparse_index_by_next_index, pi->next_index);
+ for (i = 1; i < vec_len (rt->next_by_protocol); i++)
+ rt->sparse_index_by_next_index[rt->next_by_protocol[i]] = i;
+}
+
+static void
+gre_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_gre_header_with_length;
+ n->unformat_buffer = unformat_gre_header;
+ pn->unformat_edit = unformat_pg_gre_header;
+}
+
+static clib_error_t * gre_input_init (vlib_main_t * vm)
+{
+ gre_input_runtime_t * rt;
+ vlib_node_t *ip4_input, *ip6_input, *mpls_unicast_input;
+
+ {
+ clib_error_t * error;
+ error = vlib_call_init_function (vm, gre_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ gre_setup_node (vm, gre_input_node.index);
+
+ rt = vlib_node_get_runtime_data (vm, gre_input_node.index);
+
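+  /* The sparse vector maps the full 16-bit GRE protocol field, in
+     network byte order, to a next node index. */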
+ rt->next_by_protocol = sparse_vec_new
+ (/* elt bytes */ sizeof (rt->next_by_protocol[0]),
+ /* bits in index */ BITS (((gre_header_t *) 0)->protocol));
+
+ vec_validate (rt->sparse_index_by_next_index, GRE_INPUT_NEXT_DROP);
+ vec_validate (rt->sparse_index_by_next_index, GRE_INPUT_NEXT_PUNT);
+ rt->sparse_index_by_next_index[GRE_INPUT_NEXT_DROP]
+ = SPARSE_VEC_INVALID_INDEX;
+ rt->sparse_index_by_next_index[GRE_INPUT_NEXT_PUNT]
+ = SPARSE_VEC_INVALID_INDEX;
+
+  /* These could be moved to the supported protocols' input node definitions. */
+ ip4_input = vlib_get_node_by_name (vm, (u8 *)"ip4-input");
+ ASSERT(ip4_input);
+ ip6_input = vlib_get_node_by_name (vm, (u8 *)"ip6-input");
+ ASSERT(ip6_input);
+ mpls_unicast_input = vlib_get_node_by_name (vm, (u8 *)"mpls-gre-input");
+ ASSERT(mpls_unicast_input);
+
+ gre_register_input_protocol (vm, GRE_PROTOCOL_ip4,
+ ip4_input->index);
+
+ gre_register_input_protocol (vm, GRE_PROTOCOL_ip6,
+ ip6_input->index);
+
+ gre_register_input_protocol (vm, GRE_PROTOCOL_mpls_unicast,
+ mpls_unicast_input->index);
+
+ ip4_register_protocol (IP_PROTOCOL_GRE, gre_input_node.index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (gre_input_init);
diff --git a/vnet/vnet/gre/packet.h b/vnet/vnet/gre/packet.h
new file mode 100644
index 00000000000..573f2624bf8
--- /dev/null
+++ b/vnet/vnet/gre/packet.h
@@ -0,0 +1,54 @@
+#ifndef included_vnet_gre_packet_h
+#define included_vnet_gre_packet_h
+
+/*
+ * GRE packet format
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
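+/* GRE protocol type values are Ethertypes, as specified by RFC 2784. */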
+#define foreach_gre_protocol \
+_ (0x0800, ip4) \
+_ (0x86DD, ip6) \
+_ (0x0806, arp) \
+_ (0x8847, mpls_unicast) \
+_ (0x894F, nsh)
+
+typedef enum {
+#define _(n,f) GRE_PROTOCOL_##f = n,
+ foreach_gre_protocol
+#undef _
+} gre_protocol_t;
+
+typedef struct {
+ /* flags and version */
+ u16 flags_and_version;
+ /* unimplemented at the moment */
+#define GRE_FLAGS_CHECKSUM (1 << 15)
+
+ /* deprecated, according to rfc2784 */
+#define GRE_FLAGS_ROUTING (1 << 14)
+#define GRE_FLAGS_KEY (1 << 13)
+#define GRE_FLAGS_SEQUENCE (1 << 12)
+#define GRE_FLAGS_STRICT_SOURCE_ROUTE (1 << 11)
+
+ /* version 1 is PPTP which we don't support */
+#define GRE_SUPPORTED_VERSION 0
+#define GRE_VERSION_MASK 0x7
+
+ /* 0x800 for ip4, etc. */
+ u16 protocol;
+} gre_header_t;
+
+#endif /* included_vnet_gre_packet_h */
diff --git a/vnet/vnet/gre/pg.c b/vnet/vnet/gre/pg.c
new file mode 100644
index 00000000000..cc065d3b6b5
--- /dev/null
+++ b/vnet/vnet/gre/pg.c
@@ -0,0 +1,77 @@
+/*
+ * gre_pg.c: packet generator gre interface
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/gre/gre.h>
+
+typedef struct {
+ pg_edit_t flags_and_version;
+ pg_edit_t protocol;
+} pg_gre_header_t;
+
+static inline void
+pg_gre_header_init (pg_gre_header_t * e)
+{
+ pg_edit_init (&e->flags_and_version, gre_header_t, flags_and_version);
+ pg_edit_init (&e->protocol, gre_header_t, protocol);
+}
+
+uword
+unformat_pg_gre_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_gre_header_t * h;
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (gre_header_t),
+ &group_index);
+ pg_gre_header_init (h);
+
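+  /* Fix flags and version to 0: a bare RFC 2784 header with no
+     optional fields. */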
+ pg_edit_set_fixed (&h->flags_and_version, 0);
+
+ error = 1;
+ if (! unformat (input, "%U",
+ unformat_pg_edit,
+ unformat_gre_protocol_net_byte_order, &h->protocol))
+ goto done;
+
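+  /* If the protocol edit is a fixed value, hand the remaining input
+     to that protocol's own pg edit function so a nested header (e.g.
+     an ip4 header) can be specified inline. */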
+ {
+ gre_main_t * pm = &gre_main;
+ gre_protocol_info_t * pi = 0;
+ pg_node_t * pg_node = 0;
+
+ if (h->protocol.type == PG_EDIT_FIXED)
+ {
+ u16 t = *(u16 *) h->protocol.values[PG_EDIT_LO];
+ pi = gre_get_protocol_info (pm, clib_net_to_host_u16 (t));
+ if (pi && pi->node_index != ~0)
+ pg_node = pg_get_node (pi->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/hdlc/error.def b/vnet/vnet/hdlc/error.def
new file mode 100644
index 00000000000..16e001bbcfa
--- /dev/null
+++ b/vnet/vnet/hdlc/error.def
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * hdlc_error.def: hdlc errors
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+hdlc_error (NONE, "no error")
+hdlc_error (UNKNOWN_PROTOCOL, "unknown hdlc protocol")
+hdlc_error (UNKNOWN_ADDRESS_CONTROL, "address, control != 0x0f00")
diff --git a/vnet/vnet/hdlc/hdlc.c b/vnet/vnet/hdlc/hdlc.c
new file mode 100644
index 00000000000..bdbe89466b9
--- /dev/null
+++ b/vnet/vnet/hdlc/hdlc.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * hdlc.c: hdlc
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/hdlc/hdlc.h>
+
+/* Global main structure. */
+hdlc_main_t hdlc_main;
+
+u8 * format_hdlc_protocol (u8 * s, va_list * args)
+{
+ hdlc_protocol_t p = va_arg (*args, u32);
+ hdlc_main_t * pm = &hdlc_main;
+ hdlc_protocol_info_t * pi = hdlc_get_protocol_info (pm, p);
+
+ if (pi)
+ s = format (s, "%s", pi->name);
+ else
+ s = format (s, "0x%04x", p);
+
+ return s;
+}
+
+u8 * format_hdlc_header_with_length (u8 * s, va_list * args)
+{
+ hdlc_main_t * pm = &hdlc_main;
+ hdlc_header_t * h = va_arg (*args, hdlc_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ hdlc_protocol_t p = clib_net_to_host_u16 (h->protocol);
+ uword indent, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "hdlc header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "HDLC %U", format_hdlc_protocol, p);
+
+ if (h->address != 0xff)
+ s = format (s, ", address 0x%02x", h->address);
+ if (h->control != 0x03)
+ s = format (s, ", control 0x%02x", h->control);
+
+  if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ hdlc_protocol_info_t * pi = hdlc_get_protocol_info (pm, p);
+ vlib_node_t * node = vlib_get_node (pm->vlib_main, pi->node_index);
+ if (node->format_buffer)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ node->format_buffer, (void *) (h + 1),
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+u8 * format_hdlc_header (u8 * s, va_list * args)
+{
+ hdlc_header_t * h = va_arg (*args, hdlc_header_t *);
+ return format (s, "%U", format_hdlc_header_with_length, h, 0);
+}
+
+/* Returns hdlc protocol as an int in host byte order. */
+uword
+unformat_hdlc_protocol_host_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ hdlc_main_t * pm = &hdlc_main;
+ int p, i;
+
+ /* Numeric type. */
+ if (unformat (input, "0x%x", &p)
+ || unformat (input, "%d", &p))
+ {
+ if (p >= (1 << 16))
+ return 0;
+ *result = p;
+ return 1;
+ }
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ pm->protocol_info_by_name, &i))
+ {
+ hdlc_protocol_info_t * pi = vec_elt_at_index (pm->protocol_infos, i);
+ *result = pi->protocol;
+ return 1;
+ }
+
+ return 0;
+}
+
+uword
+unformat_hdlc_protocol_net_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ if (! unformat_user (input, unformat_hdlc_protocol_host_byte_order, result))
+ return 0;
+ *result = clib_host_to_net_u16 ((u16) *result);
+ return 1;
+}
+
+uword
+unformat_hdlc_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ hdlc_header_t _h, * h = &_h;
+ u16 p;
+
+ if (! unformat (input, "%U",
+ unformat_hdlc_protocol_host_byte_order, &p))
+ return 0;
+
+ h->address = 0xff;
+ h->control = 0x03;
+ h->protocol = clib_host_to_net_u16 (p);
+
+ /* Add header to result. */
+ {
+ void * p;
+ u32 n_bytes = sizeof (h[0]);
+
+ vec_add2 (*result, p, n_bytes);
+ memcpy (p, h, n_bytes);
+ }
+
+ return 1;
+}
+
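+/* Build the fixed 4-byte HDLC rewrite (pre-computed L2 header) for a
+   given L3 packet type.  Returns the rewrite length on success, 0 if
+   the type is unsupported or the buffer is too small. */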
+static uword hdlc_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ hdlc_header_t * h = rewrite;
+ hdlc_protocol_t protocol;
+
+ if (max_rewrite_bytes < sizeof (h[0]))
+ return 0;
+
+ switch (l3_type) {
+#define _(a,b) case VNET_L3_PACKET_TYPE_##a: protocol = HDLC_PROTOCOL_##b; break
+ _ (IP4, ip4);
+ _ (IP6, ip6);
+ _ (MPLS_UNICAST, mpls_unicast);
+ _ (MPLS_MULTICAST, mpls_multicast);
+#undef _
+ default:
+ return 0;
+ }
+
+ h->address = 0x0f;
+ h->control = 0x00;
+ h->protocol = clib_host_to_net_u16 (protocol);
+
+ return sizeof (h[0]);
+}
+
+VNET_HW_INTERFACE_CLASS (hdlc_hw_interface_class) = {
+ .name = "HDLC",
+ .format_header = format_hdlc_header_with_length,
+ .unformat_header = unformat_hdlc_header,
+ .set_rewrite = hdlc_set_rewrite,
+};
+
+static void add_protocol (hdlc_main_t * pm,
+ hdlc_protocol_t protocol,
+ char * protocol_name)
+{
+ hdlc_protocol_info_t * pi;
+ u32 i;
+
+ vec_add2 (pm->protocol_infos, pi, 1);
+ i = pi - pm->protocol_infos;
+
+ pi->name = protocol_name;
+ pi->protocol = protocol;
+ pi->next_index = pi->node_index = ~0;
+
+ hash_set (pm->protocol_info_by_protocol, protocol, i);
+ hash_set_mem (pm->protocol_info_by_name, pi->name, i);
+}
+
+static clib_error_t * hdlc_init (vlib_main_t * vm)
+{
+ hdlc_main_t * pm = &hdlc_main;
+
+ memset (pm, 0, sizeof (pm[0]));
+ pm->vlib_main = vm;
+
+ pm->protocol_info_by_name = hash_create_string (0, sizeof (uword));
+ pm->protocol_info_by_protocol = hash_create (0, sizeof (uword));
+
+#define _(n,s) add_protocol (pm, HDLC_PROTOCOL_##s, #s);
+ foreach_hdlc_protocol
+#undef _
+
+ return vlib_call_init_function (vm, hdlc_input_init);
+}
+
+VLIB_INIT_FUNCTION (hdlc_init);
+
+hdlc_main_t * hdlc_get_main (vlib_main_t * vm)
+{
+ vlib_call_init_function (vm, hdlc_init);
+ return &hdlc_main;
+}
+
diff --git a/vnet/vnet/hdlc/hdlc.h b/vnet/vnet/hdlc/hdlc.h
new file mode 100644
index 00000000000..e5cbe62d33d
--- /dev/null
+++ b/vnet/vnet/hdlc/hdlc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * hdlc.h: types/functions for hdlc.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_hdlc_h
+#define included_hdlc_h
+
+#include <vnet/vnet.h>
+#include <vnet/hdlc/packet.h>
+#include <vnet/pg/pg.h>
+
+vnet_hw_interface_class_t hdlc_hw_interface_class;
+
+typedef enum {
+#define hdlc_error(n,s) HDLC_ERROR_##n,
+#include <vnet/hdlc/error.def>
+#undef hdlc_error
+ HDLC_N_ERROR,
+} hdlc_error_t;
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+ /* HDLC protocol type in host byte order. */
+ hdlc_protocol_t protocol;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} hdlc_protocol_info_t;
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ hdlc_protocol_info_t * protocol_infos;
+
+ /* Hash tables mapping name/protocol to protocol info index. */
+ uword * protocol_info_by_name, * protocol_info_by_protocol;
+} hdlc_main_t;
+
+always_inline hdlc_protocol_info_t *
+hdlc_get_protocol_info (hdlc_main_t * em, hdlc_protocol_t protocol)
+{
+ uword * p = hash_get (em->protocol_info_by_protocol, protocol);
+ return p ? vec_elt_at_index (em->protocol_infos, p[0]) : 0;
+}
+
+extern hdlc_main_t hdlc_main;
+
+/* Register given node index to take input for given hdlc type. */
+void
+hdlc_register_input_type (vlib_main_t * vm,
+ hdlc_protocol_t protocol,
+ u32 node_index);
+
+void hdlc_set_adjacency (vnet_rewrite_header_t * rw,
+ uword max_data_bytes,
+ hdlc_protocol_t protocol);
+
+format_function_t format_hdlc_protocol;
+format_function_t format_hdlc_header;
+format_function_t format_hdlc_header_with_length;
+
+/* Parse an hdlc protocol given as 0xXXXX or as a protocol name,
+   in either host or network byte order. */
+unformat_function_t unformat_hdlc_protocol_host_byte_order;
+unformat_function_t unformat_hdlc_protocol_net_byte_order;
+
+/* Parse hdlc header. */
+unformat_function_t unformat_hdlc_header;
+unformat_function_t unformat_pg_hdlc_header;
+
+always_inline void
+hdlc_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_hdlc_header_with_length;
+ n->unformat_buffer = unformat_hdlc_header;
+ pn->unformat_edit = unformat_pg_hdlc_header;
+}
+
+void
+hdlc_register_input_protocol (vlib_main_t * vm,
+ hdlc_protocol_t protocol,
+ u32 node_index);
+
+#endif /* included_hdlc_h */
diff --git a/vnet/vnet/hdlc/node.c b/vnet/vnet/hdlc/node.c
new file mode 100644
index 00000000000..4fe0296aca1
--- /dev/null
+++ b/vnet/vnet/hdlc/node.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * hdlc_node.c: hdlc packet processing
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/hdlc/hdlc.h>
+#include <vppinfra/sparse_vec.h>
+
+#define foreach_hdlc_input_next \
+ _ (PUNT, "error-punt") \
+ _ (DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) HDLC_INPUT_NEXT_##s,
+ foreach_hdlc_input_next
+#undef _
+ HDLC_INPUT_N_NEXT,
+} hdlc_input_next_t;
+
+typedef struct {
+ u8 packet_data[32];
+} hdlc_input_trace_t;
+
+static u8 * format_hdlc_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ hdlc_input_trace_t * t = va_arg (*va, hdlc_input_trace_t *);
+
+ s = format (s, "%U", format_hdlc_header, t->packet_data);
+
+ return s;
+}
+
+typedef struct {
+ /* Sparse vector mapping hdlc protocol in network byte order
+ to next index. */
+ u16 * next_by_protocol;
+
+ u32 * sparse_index_by_next_index;
+} hdlc_input_runtime_t;
+
+static uword
+hdlc_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ hdlc_input_runtime_t * rt = (void *) node->runtime_data;
+ u32 n_left_from, next_index, i_next, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (hdlc_input_trace_t));
+
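+  /* Translate the cached next index into sparse-vector space so it
+     can be compared directly with per-packet protocol match results. */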
+ next_index = node->cached_next_index;
+ i_next = vec_elt (rt->sparse_index_by_next_index, next_index);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ hdlc_header_t * h0, * h1;
+ u32 i0, i1, len0, len1, protocol0, protocol1, enqueue_code;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, LOAD);
+ vlib_prefetch_buffer_header (b3, LOAD);
+
+ CLIB_PREFETCH (b2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (b3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = (void *) (b0->data + b0->current_data);
+ h1 = (void *) (b1->data + b1->current_data);
+
+ protocol0 = h0->protocol;
+ protocol1 = h1->protocol;
+
+          /* OSI frames carry one extra padding byte after the header. */
+ len0 = sizeof (h0[0]);
+ len1 = sizeof (h1[0]);
+
+ len0 += protocol0 == clib_host_to_net_u16 (HDLC_PROTOCOL_osi);
+ len1 += protocol1 == clib_host_to_net_u16 (HDLC_PROTOCOL_osi);
+
+ b0->current_data += len0;
+ b1->current_data += len1;
+
+ b0->current_length -= len0;
+ b1->current_length -= len1;
+
+ /* Index sparse array with network byte order. */
+ sparse_vec_index2 (rt->next_by_protocol, protocol0, protocol1, &i0, &i1);
+
+ b0->error = node->errors[i0 == SPARSE_VEC_INVALID_INDEX ? HDLC_ERROR_UNKNOWN_PROTOCOL : HDLC_ERROR_NONE];
+ b1->error = node->errors[i1 == SPARSE_VEC_INVALID_INDEX ? HDLC_ERROR_UNKNOWN_PROTOCOL : HDLC_ERROR_NONE];
+
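+          /* Two-bit dispatch code: bit 0 is set if b0's next differs
+             from the current frame's next, bit 1 likewise for b1. */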
+ enqueue_code = (i0 != i_next) + 2*(i1 != i_next);
+
+ if (PREDICT_FALSE (enqueue_code != 0))
+ {
+ switch (enqueue_code)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = bi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i0), bi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i1), bi1);
+ break;
+
+ case 3:
+ /* A B B or A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i0), bi0);
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i1), bi1);
+ if (i0 == i1)
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ i_next = i1;
+ next_index = vec_elt (rt->next_by_protocol, i_next);
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ hdlc_header_t * h0;
+ u32 i0, len0, protocol0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ protocol0 = h0->protocol;
+
+          /* OSI frames carry one extra padding byte after the header. */
+ len0 = sizeof (h0[0]);
+ len0 += protocol0 == clib_host_to_net_u16 (HDLC_PROTOCOL_osi);
+
+ b0->current_data += len0;
+ b0->current_length -= len0;
+
+ i0 = sparse_vec_index (rt->next_by_protocol, protocol0);
+
+ b0->error = node->errors[i0 == SPARSE_VEC_INVALID_INDEX ? HDLC_ERROR_UNKNOWN_PROTOCOL : HDLC_ERROR_NONE];
+
+          /* Was this packet enqueued to the wrong next frame? */
+ if (PREDICT_FALSE (i0 != i_next))
+ {
+ /* Return old frame; remove incorrectly enqueued packet. */
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);
+
+ /* Send to correct next. */
+ i_next = i0;
+ next_index = vec_elt (rt->next_by_protocol, i_next);
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static char * hdlc_error_strings[] = {
+#define hdlc_error(n,s) s,
+#include "error.def"
+#undef hdlc_error
+};
+
+VLIB_REGISTER_NODE (hdlc_input_node) = {
+ .function = hdlc_input,
+ .name = "hdlc-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof (hdlc_input_runtime_t),
+
+ .n_errors = HDLC_N_ERROR,
+ .error_strings = hdlc_error_strings,
+
+ .n_next_nodes = HDLC_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [HDLC_INPUT_NEXT_##s] = n,
+ foreach_hdlc_input_next
+#undef _
+ },
+
+ .format_buffer = format_hdlc_header_with_length,
+ .format_trace = format_hdlc_input_trace,
+ .unformat_buffer = unformat_hdlc_header,
+};
+
+static clib_error_t * hdlc_input_init (vlib_main_t * vm)
+{
+ hdlc_input_runtime_t * rt;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, hdlc_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ hdlc_setup_node (vm, hdlc_input_node.index);
+
+ rt = vlib_node_get_runtime_data (vm, hdlc_input_node.index);
+
+ rt->next_by_protocol = sparse_vec_new
+ (/* elt bytes */ sizeof (rt->next_by_protocol[0]),
+ /* bits in index */ BITS (((hdlc_header_t *) 0)->protocol));
+
+ vec_validate (rt->sparse_index_by_next_index, HDLC_INPUT_NEXT_DROP);
+ vec_validate (rt->sparse_index_by_next_index, HDLC_INPUT_NEXT_PUNT);
+ rt->sparse_index_by_next_index[HDLC_INPUT_NEXT_DROP]
+ = SPARSE_VEC_INVALID_INDEX;
+ rt->sparse_index_by_next_index[HDLC_INPUT_NEXT_PUNT]
+ = SPARSE_VEC_INVALID_INDEX;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (hdlc_input_init);
+
+void
+hdlc_register_input_protocol (vlib_main_t * vm,
+ hdlc_protocol_t protocol,
+ u32 node_index)
+{
+ hdlc_main_t * em = &hdlc_main;
+ hdlc_protocol_info_t * pi;
+ hdlc_input_runtime_t * rt;
+ u16 * n;
+ u32 i;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, hdlc_input_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ pi = hdlc_get_protocol_info (em, protocol);
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ hdlc_input_node.index,
+ node_index);
+
+  /* Set up the hdlc protocol -> next index sparse vector mapping. */
+ rt = vlib_node_get_runtime_data (vm, hdlc_input_node.index);
+ n = sparse_vec_validate (rt->next_by_protocol, clib_host_to_net_u16 (protocol));
+ n[0] = pi->next_index;
+
+ /* Rebuild next index -> sparse index inverse mapping when sparse vector
+ is updated. */
+ vec_validate (rt->sparse_index_by_next_index, pi->next_index);
+ for (i = 1; i < vec_len (rt->next_by_protocol); i++)
+ rt->sparse_index_by_next_index[rt->next_by_protocol[i]] = i;
+}
diff --git a/vnet/vnet/hdlc/packet.h b/vnet/vnet/hdlc/packet.h
new file mode 100644
index 00000000000..45e5496fbc8
--- /dev/null
+++ b/vnet/vnet/hdlc/packet.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_hdlc_packet_h
+#define included_vnet_hdlc_packet_h
+
+/*
+ * HDLC packet format
+ *
+ * Copyright (c) 2009 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define foreach_hdlc_protocol \
+ _ (0x0800, ip4) \
+ _ (0x2000, cdp) \
+ _ (0x8035, slarp) \
+ _ (0x8847, mpls_unicast) \
+ _ (0x8848, mpls_multicast) \
+ _ (0x86dd, ip6) \
+ _ (0xfefe, osi)
+
+typedef enum {
+#define _(n,f) HDLC_PROTOCOL_##f = n,
+ foreach_hdlc_protocol
+#undef _
+} hdlc_protocol_t;
+
+typedef struct {
+ /* Set to 0x0f for unicast; 0x8f for broadcast. */
+ u8 address;
+
+ /* Always zero. */
+ u8 control;
+
+ /* Layer 3 protocol for this packet. */
+ u16 protocol;
+
+ /* Layer 3 payload. */
+ u8 payload[0];
+} hdlc_header_t;
+
+#endif /* included_vnet_hdlc_packet_h */
diff --git a/vnet/vnet/hdlc/pg.c b/vnet/vnet/hdlc/pg.c
new file mode 100644
index 00000000000..b8e67022b08
--- /dev/null
+++ b/vnet/vnet/hdlc/pg.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * hdlc_pg.c: packet generator hdlc interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/hdlc/hdlc.h>
+
+typedef struct {
+ pg_edit_t address;
+ pg_edit_t control;
+ pg_edit_t protocol;
+} pg_hdlc_header_t;
+
+static inline void
+pg_hdlc_header_init (pg_hdlc_header_t * e)
+{
+ pg_edit_init (&e->address, hdlc_header_t, address);
+ pg_edit_init (&e->control, hdlc_header_t, control);
+ pg_edit_init (&e->protocol, hdlc_header_t, protocol);
+}
+
+uword
+unformat_pg_hdlc_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_hdlc_header_t * h;
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (hdlc_header_t),
+ &group_index);
+ pg_hdlc_header_init (h);
+
+ pg_edit_set_fixed (&h->address, 0x0f);
+ pg_edit_set_fixed (&h->control, 0x00);
+
+ error = 1;
+ if (! unformat (input, "%U",
+ unformat_pg_edit,
+ unformat_hdlc_protocol_net_byte_order, &h->protocol))
+ goto done;
+
+ {
+ hdlc_main_t * pm = &hdlc_main;
+ hdlc_protocol_info_t * pi = 0;
+ pg_node_t * pg_node = 0;
+
+ if (h->protocol.type == PG_EDIT_FIXED)
+ {
+ u16 t = *(u16 *) h->protocol.values[PG_EDIT_LO];
+ pi = hdlc_get_protocol_info (pm, clib_net_to_host_u16 (t));
+ if (pi && pi->node_index != ~0)
+ pg_node = pg_get_node (pi->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto done;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/interface.c b/vnet/vnet/interface.c
new file mode 100644
index 00000000000..ffe0b672acd
--- /dev/null
+++ b/vnet/vnet/interface.c
@@ -0,0 +1,1051 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * interface.c: VNET interfaces/sub-interfaces
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/plugin/plugin.h>
+
+#define VNET_INTERFACE_SET_FLAGS_HELPER_IS_CREATE (1 << 0)
+#define VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE (1 << 1)
+
+static clib_error_t *
+vnet_hw_interface_set_flags_helper (vnet_main_t * vnm, u32 hw_if_index, u32 flags,
+ u32 helper_flags);
+
+static clib_error_t *
+vnet_sw_interface_set_flags_helper (vnet_main_t * vnm, u32 sw_if_index, u32 flags,
+ u32 helper_flags);
+
+static clib_error_t *
+vnet_hw_interface_set_class_helper (vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index, u32 redistribute);
+
+typedef struct {
+ /* Either sw or hw interface index. */
+ u32 sw_hw_if_index;
+
+ /* Flags. */
+ u32 flags;
+} vnet_sw_hw_interface_state_t;
+
+static void serialize_vec_vnet_sw_hw_interface_state (serialize_main_t * m, va_list * va)
+{
+ vnet_sw_hw_interface_state_t * s = va_arg (*va, vnet_sw_hw_interface_state_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++) {
+ serialize_integer (m, s[i].sw_hw_if_index, sizeof (s[i].sw_hw_if_index));
+ serialize_integer (m, s[i].flags, sizeof (s[i].flags));
+ }
+}
+
+static void unserialize_vec_vnet_sw_hw_interface_state (serialize_main_t * m, va_list * va)
+{
+ vnet_sw_hw_interface_state_t * s = va_arg (*va, vnet_sw_hw_interface_state_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++) {
+ unserialize_integer (m, &s[i].sw_hw_if_index, sizeof (s[i].sw_hw_if_index));
+ unserialize_integer (m, &s[i].flags, sizeof (s[i].flags));
+ }
+}
+
+static void serialize_vnet_sw_hw_interface_set_flags (serialize_main_t * m, va_list * va)
+{
+ vnet_sw_hw_interface_state_t * s = va_arg (*va, vnet_sw_hw_interface_state_t *);
+ serialize (m, serialize_vec_vnet_sw_hw_interface_state, s, 1);
+}
+
+static void unserialize_vnet_sw_interface_set_flags (serialize_main_t * m, va_list * va)
+{
+ CLIB_UNUSED (mc_main_t * mc) = va_arg (*va, mc_main_t *);
+ vnet_sw_hw_interface_state_t s;
+
+ unserialize (m, unserialize_vec_vnet_sw_hw_interface_state, &s, 1);
+
+ vnet_sw_interface_set_flags_helper
+ (vnet_get_main(), s.sw_hw_if_index, s.flags,
+ /* helper_flags no redistribution */ 0);
+}
+
+static void unserialize_vnet_hw_interface_set_flags (serialize_main_t * m, va_list * va)
+{
+ CLIB_UNUSED (mc_main_t * mc) = va_arg (*va, mc_main_t *);
+ vnet_sw_hw_interface_state_t s;
+
+ unserialize (m, unserialize_vec_vnet_sw_hw_interface_state, &s, 1);
+
+ vnet_hw_interface_set_flags_helper
+ (vnet_get_main(), s.sw_hw_if_index, s.flags,
+ /* helper_flags no redistribution */ 0);
+}
+
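+/* When the mc (multicast sync) subsystem is active, interface flag
+   changes are replicated to peers via these messages; the unserialize
+   handlers apply the flags without redistributing them again. */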
+MC_SERIALIZE_MSG (vnet_sw_interface_set_flags_msg, static) = {
+ .name = "vnet_sw_interface_set_flags",
+ .serialize = serialize_vnet_sw_hw_interface_set_flags,
+ .unserialize = unserialize_vnet_sw_interface_set_flags,
+};
+
+MC_SERIALIZE_MSG (vnet_hw_interface_set_flags_msg, static) = {
+ .name = "vnet_hw_interface_set_flags",
+ .serialize = serialize_vnet_sw_hw_interface_set_flags,
+ .unserialize = unserialize_vnet_hw_interface_set_flags,
+};
+
+void serialize_vnet_interface_state (serialize_main_t * m, va_list * va)
+{
+ vnet_main_t * vnm = va_arg (*va, vnet_main_t *);
+ vnet_sw_hw_interface_state_t * sts = 0, * st;
+ vnet_sw_interface_t * sif;
+ vnet_hw_interface_t * hif;
+ vnet_interface_main_t * im = &vnm->interface_main;
+
+ /* Serialize hardware interface classes since they may have changed.
+ Must do this before sending up/down flags. */
+ pool_foreach (hif, im->hw_interfaces, ({
+ vnet_hw_interface_class_t * hw_class = vnet_get_hw_interface_class (vnm, hif->hw_class_index);
+ serialize_cstring (m, hw_class->name);
+ }));
+
+ /* Send sw/hw interface state when non-zero. */
+ pool_foreach (sif, im->sw_interfaces, ({
+ if (sif->flags != 0)
+ {
+ vec_add2 (sts, st, 1);
+ st->sw_hw_if_index = sif->sw_if_index;
+ st->flags = sif->flags;
+ }
+ }));
+
+ vec_serialize (m, sts, serialize_vec_vnet_sw_hw_interface_state);
+
+ if (sts)
+ _vec_len (sts) = 0;
+
+ pool_foreach (hif, im->hw_interfaces, ({
+ if (hif->flags != 0)
+ {
+ vec_add2 (sts, st, 1);
+ st->sw_hw_if_index = hif->hw_if_index;
+ st->flags = hif->flags;
+ }
+ }));
+
+ vec_serialize (m, sts, serialize_vec_vnet_sw_hw_interface_state);
+
+ vec_free (sts);
+}
+
+void unserialize_vnet_interface_state (serialize_main_t * m, va_list * va)
+{
+ vnet_main_t * vnm = va_arg (*va, vnet_main_t *);
+ vnet_sw_hw_interface_state_t * sts = 0, * st;
+
+ /* First set interface hardware class. */
+ {
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_hw_interface_t * hif;
+ char * class_name;
+ uword * p;
+ clib_error_t * error;
+
+ pool_foreach (hif, im->hw_interfaces, ({
+ unserialize_cstring (m, &class_name);
+ p = hash_get_mem (im->hw_interface_class_by_name, class_name);
+ ASSERT (p != 0);
+ error = vnet_hw_interface_set_class_helper (vnm, hif->hw_if_index, p[0], /* redistribute */ 0);
+ if (error)
+ clib_error_report (error);
+ vec_free (class_name);
+ }));
+ }
+
+ vec_unserialize (m, &sts, unserialize_vec_vnet_sw_hw_interface_state);
+ vec_foreach (st, sts)
+ vnet_sw_interface_set_flags_helper (vnm, st->sw_hw_if_index, st->flags,
+ /* no distribute */ 0);
+ vec_free (sts);
+
+ vec_unserialize (m, &sts, unserialize_vec_vnet_sw_hw_interface_state);
+ vec_foreach (st, sts)
+ vnet_hw_interface_set_flags_helper (vnm, st->sw_hw_if_index, st->flags,
+ /* no distribute */ 0);
+ vec_free (sts);
+}
+
+static clib_error_t *
+call_elf_section_interface_callbacks (vnet_main_t * vnm, u32 if_index,
+ u32 flags,
+ _vnet_interface_function_list_elt_t *elt)
+{
+ clib_error_t * error = 0;
+
+ while (elt)
+ {
+ error = elt->fp(vnm, if_index, flags);
+ if (error)
+ return error;
+ elt = elt->next_interface_function;
+ }
+ return error;
+}
+
+static clib_error_t *
+call_hw_interface_add_del_callbacks (vnet_main_t * vnm, u32 hw_if_index, u32 is_create)
+{
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ vnet_hw_interface_class_t * hw_class = vnet_get_hw_interface_class (vnm, hi->hw_class_index);
+ vnet_device_class_t * dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+ clib_error_t * error = 0;
+
+ if (hw_class->interface_add_del_function
+ && (error = hw_class->interface_add_del_function (vnm, hw_if_index, is_create)))
+ return error;
+
+ if (dev_class->interface_add_del_function
+ && (error = dev_class->interface_add_del_function (vnm, hw_if_index, is_create)))
+ return error;
+
+ error = call_elf_section_interface_callbacks
+ (vnm, hw_if_index, is_create, vnm->hw_interface_add_del_functions);
+
+ return error;
+}
+
+static clib_error_t *
+call_sw_interface_add_del_callbacks (vnet_main_t * vnm, u32 sw_if_index, u32 is_create)
+{
+ return call_elf_section_interface_callbacks
+ (vnm, sw_if_index, is_create, vnm->sw_interface_add_del_functions);
+}
+
+static clib_error_t *
+vnet_hw_interface_set_flags_helper (vnet_main_t * vnm, u32 hw_if_index, u32 flags,
+ u32 helper_flags)
+{
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ vnet_hw_interface_class_t * hw_class = vnet_get_hw_interface_class (vnm, hi->hw_class_index);
+ vnet_device_class_t * dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+ vlib_main_t * vm = vnm->vlib_main;
+ u32 mask;
+ clib_error_t * error = 0;
+ u32 is_create = (helper_flags & VNET_INTERFACE_SET_FLAGS_HELPER_IS_CREATE) != 0;
+
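+  /* Only link-state related bits (link up, duplex, speed) are managed
+     at the hardware layer; admin state lives on the software
+     interface. */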
+ mask = (VNET_HW_INTERFACE_FLAG_LINK_UP | VNET_HW_INTERFACE_FLAG_DUPLEX_MASK |
+ VNET_HW_INTERFACE_FLAG_SPEED_MASK);
+ flags &= mask;
+
+ /* Call hardware interface add/del callbacks. */
+ if (is_create)
+ call_hw_interface_add_del_callbacks (vnm, hw_if_index, is_create);
+
+ /* Already in the desired state? */
+ if (! is_create && (hi->flags & mask) == flags)
+ goto done;
+
+ /* Some interface classes do not redistribute (e.g. are local). */
+ if (! dev_class->redistribute)
+ helper_flags &= ~ VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE;
+
+ if (vm->mc_main
+ && (helper_flags & VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE))
+ {
+ vnet_sw_hw_interface_state_t s;
+ s.sw_hw_if_index = hw_if_index;
+ s.flags = flags;
+ mc_serialize (vm->mc_main, &vnet_hw_interface_set_flags_msg, &s);
+ }
+
+ if ((hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP) !=
+ (flags & VNET_HW_INTERFACE_FLAG_LINK_UP))
+ {
+ /* Do hardware class (e.g. ethernet). */
+ if (hw_class->link_up_down_function
+ && (error = hw_class->link_up_down_function (vnm, hw_if_index,
+ flags)))
+ goto done;
+
+ error = call_elf_section_interface_callbacks
+ (vnm, hw_if_index, is_create, vnm->hw_interface_link_up_down_functions);
+
+ if (error)
+ goto done;
+ }
+
+ hi->flags &= ~mask;
+ hi->flags |= flags;
+
+ done:
+ return error;
+}
+
+static clib_error_t *
+vnet_sw_interface_set_flags_helper (vnet_main_t * vnm, u32 sw_if_index, u32 flags,
+ u32 helper_flags)
+{
+ vnet_sw_interface_t * si = vnet_get_sw_interface (vnm, sw_if_index);
+ vlib_main_t * vm = vnm->vlib_main;
+ u32 mask;
+ clib_error_t * error = 0;
+ u32 is_create = (helper_flags & VNET_INTERFACE_SET_FLAGS_HELPER_IS_CREATE) != 0;
+
+ mask = VNET_SW_INTERFACE_FLAG_ADMIN_UP | VNET_SW_INTERFACE_FLAG_PUNT;
+ flags &= mask;
+
+ if (is_create)
+ {
+ error = call_sw_interface_add_del_callbacks (vnm, sw_if_index, is_create);
+ if (error)
+ goto done;
+
+ if (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+ {
+ /* Notify everyone when the interface is created as admin up */
+ error = call_elf_section_interface_callbacks (vnm, sw_if_index,
+ flags, vnm->sw_interface_admin_up_down_functions);
+ if (error)
+ goto done;
+ }
+ }
+ else
+ {
+ vnet_sw_interface_t * si_sup = si;
+
+ /* Check that super interface is in correct state. */
+ if (si->type == VNET_SW_INTERFACE_TYPE_SUB)
+ {
+ si_sup = vnet_get_sw_interface (vnm, si->sup_sw_if_index);
+
+ if (flags != (si_sup->flags & mask))
+ {
+ error = clib_error_return (0, "super-interface %U must be %U",
+ format_vnet_sw_interface_name, vnm, si_sup,
+ format_vnet_sw_interface_flags, flags);
+ goto done;
+ }
+ }
+
+ /* Already in the desired state? */
+ if ((si->flags & mask) == flags)
+ goto done;
+
+      /* Sub-interfaces of hardware interfaces that do not redistribute
+         do not redistribute themselves. */
+ if (si_sup->type == VNET_SW_INTERFACE_TYPE_HARDWARE)
+ {
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, si_sup->hw_if_index);
+ vnet_device_class_t * dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+ if (! dev_class->redistribute)
+ helper_flags &= ~ VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE;
+ }
+
+ if (vm->mc_main
+ && (helper_flags & VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE))
+ {
+ vnet_sw_hw_interface_state_t s;
+ s.sw_hw_if_index = sw_if_index;
+ s.flags = flags;
+ mc_serialize (vm->mc_main, &vnet_sw_interface_set_flags_msg, &s);
+ }
+
+ error = call_elf_section_interface_callbacks
+ (vnm, sw_if_index, flags, vnm->sw_interface_admin_up_down_functions);
+
+ if (error)
+ goto done;
+
+ if (si->type == VNET_SW_INTERFACE_TYPE_HARDWARE)
+ {
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, si->hw_if_index);
+ vnet_hw_interface_class_t * hw_class = vnet_get_hw_interface_class (vnm, hi->hw_class_index);
+ vnet_device_class_t * dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+
+ if (dev_class->admin_up_down_function
+ && (error = dev_class->admin_up_down_function (vnm, si->hw_if_index, flags)))
+ goto done;
+
+ if (hw_class->admin_up_down_function
+ && (error = hw_class->admin_up_down_function (vnm, si->hw_if_index, flags)))
+ goto done;
+
+ /* Admin down implies link down. */
+ if (! (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+ && (hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP))
+ vnet_hw_interface_set_flags_helper (vnm, si->hw_if_index,
+ hi->flags &~ VNET_HW_INTERFACE_FLAG_LINK_UP,
+ helper_flags);
+ }
+ }
+
+ si->flags &= ~mask;
+ si->flags |= flags;
+
+ done:
+ return error;
+}
+
+clib_error_t *
+vnet_hw_interface_set_flags (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ return vnet_hw_interface_set_flags_helper
+ (vnm, hw_if_index, flags,
+ VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE);
+}
+
+clib_error_t *
+vnet_sw_interface_set_flags (vnet_main_t * vnm, u32 sw_if_index, u32 flags)
+{
+ return vnet_sw_interface_set_flags_helper
+ (vnm, sw_if_index, flags,
+ VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE);
+}
+
+static u32
+vnet_create_sw_interface_no_callbacks (vnet_main_t * vnm, vnet_sw_interface_t * template)
+{
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * sw;
+ u32 sw_if_index;
+
+ pool_get (im->sw_interfaces, sw);
+ sw_if_index = sw - im->sw_interfaces;
+
+ sw[0] = template[0];
+
+ sw->flags = 0;
+ sw->sw_if_index = sw_if_index;
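+  /* A hardware interface is its own super-interface. */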
+ if (sw->type == VNET_SW_INTERFACE_TYPE_HARDWARE)
+ sw->sup_sw_if_index = sw->sw_if_index;
+
+ /* Allocate counters for this interface. */
+ {
+ u32 i;
+
+ vnet_interface_counter_lock(im);
+
+ for (i = 0; i < vec_len (im->sw_if_counters); i++)
+ {
+ vlib_validate_simple_counter (&im->sw_if_counters[i], sw_if_index);
+ vlib_zero_simple_counter (&im->sw_if_counters[i], sw_if_index);
+ }
+
+ for (i = 0; i < vec_len (im->combined_sw_if_counters); i++)
+ {
+ vlib_validate_combined_counter (&im->combined_sw_if_counters[i],
+ sw_if_index);
+ vlib_zero_combined_counter (&im->combined_sw_if_counters[i],
+ sw_if_index);
+ }
+
+ vnet_interface_counter_unlock(im);
+ }
+
+ return sw_if_index;
+}
+
+clib_error_t *
+vnet_create_sw_interface (vnet_main_t * vnm, vnet_sw_interface_t * template, u32 * sw_if_index)
+{
+ clib_error_t * error;
+ vnet_hw_interface_t * hi;
+ vnet_device_class_t * dev_class;
+
+ hi = vnet_get_sup_hw_interface (vnm, template->sup_sw_if_index);
+ dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+
+ if (template->type == VNET_SW_INTERFACE_TYPE_SUB &&
+ dev_class->subif_add_del_function) {
+ error = dev_class->subif_add_del_function (vnm, hi->hw_if_index,
+ (struct vnet_sw_interface_t *) template, 1);
+ if (error)
+ return error;
+ }
+
+ *sw_if_index = vnet_create_sw_interface_no_callbacks (vnm, template);
+ error = vnet_sw_interface_set_flags_helper
+ (vnm, *sw_if_index, template->flags,
+ VNET_INTERFACE_SET_FLAGS_HELPER_IS_CREATE);
+
+ if (error) {
+ // undo the work done by vnet_create_sw_interface_no_callbacks()
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * sw = pool_elt_at_index (im->sw_interfaces, *sw_if_index);
+ pool_put (im->sw_interfaces, sw);
+ }
+
+ return error;
+}
+
+void vnet_delete_sw_interface (vnet_main_t * vnm, u32 sw_if_index)
+{
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * sw = pool_elt_at_index (im->sw_interfaces, sw_if_index);
+
+ /* Bring down interface in case it is up. */
+ if (sw->flags != 0)
+ vnet_sw_interface_set_flags (vnm, sw_if_index, /* flags */ 0);
+
+ call_sw_interface_add_del_callbacks (vnm, sw_if_index, /* is_create */ 0);
+
+ pool_put (im->sw_interfaces, sw);
+}
+
+static void setup_tx_node (vlib_main_t * vm,
+ u32 node_index,
+ vnet_device_class_t * dev_class)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+
+ n->function = dev_class->tx_function;
+ n->format_trace = dev_class->format_tx_trace;
+ vlib_register_errors (vm, node_index,
+ dev_class->tx_function_n_errors,
+ dev_class->tx_function_error_strings);
+}
+
+static void setup_output_node (vlib_main_t * vm,
+ u32 node_index,
+ vnet_hw_interface_class_t * hw_class)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ n->format_buffer = hw_class->format_header;
+ n->unformat_buffer = hw_class->unformat_header;
+}
+
+/* Register an interface instance. */
+u32
+vnet_register_interface (vnet_main_t * vnm,
+ u32 dev_class_index,
+ u32 dev_instance,
+ u32 hw_class_index,
+ u32 hw_instance)
+{
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_hw_interface_t * hw;
+ vnet_device_class_t * dev_class = vnet_get_device_class (vnm, dev_class_index);
+ vnet_hw_interface_class_t * hw_class = vnet_get_hw_interface_class (vnm, hw_class_index);
+ vlib_main_t * vm = vnm->vlib_main;
+ u32 hw_index;
+ char * tx_node_name, * output_node_name;
+
+ pool_get (im->hw_interfaces, hw);
+
+ hw_index = hw - im->hw_interfaces;
+ hw->hw_if_index = hw_index;
+
+ if (dev_class->format_device_name)
+ hw->name = format (0, "%U",
+ dev_class->format_device_name, dev_instance);
+ else if (hw_class->format_interface_name)
+ hw->name = format (0, "%U", hw_class->format_interface_name,
+ dev_instance);
+ else
+ hw->name = format (0, "%s%x", hw_class->name, dev_instance);
+
+ if (! im->hw_interface_by_name)
+ im->hw_interface_by_name = hash_create_vec (/* size */ 0,
+ sizeof (hw->name[0]),
+ sizeof (uword));
+
+ hash_set_mem (im->hw_interface_by_name, hw->name, hw_index);
+
+ /* Make hardware interface point to software interface. */
+ {
+ vnet_sw_interface_t sw;
+
+ memset (&sw, 0, sizeof (sw));
+ sw.type = VNET_SW_INTERFACE_TYPE_HARDWARE;
+ sw.hw_if_index = hw_index;
+ hw->sw_if_index = vnet_create_sw_interface_no_callbacks (vnm, &sw);
+ }
+
+ hw->dev_class_index = dev_class_index;
+ hw->dev_instance = dev_instance;
+ hw->hw_class_index = hw_class_index;
+ hw->hw_instance = hw_instance;
+
+ hw->max_rate_bits_per_sec = 0;
+ hw->min_packet_bytes = 0;
+ hw->per_packet_overhead_bytes = 0;
+ hw->max_l3_packet_bytes[VLIB_RX] = ~0;
+ hw->max_l3_packet_bytes[VLIB_TX] = ~0;
+
+ tx_node_name = (char *) format (0, "%v-tx", hw->name);
+ output_node_name = (char *) format (0, "%v-output", hw->name);
+
+  /* If we have previously deleted interface nodes, re-use them:
+     vlib nodes are never freed, only renamed and recycled. */
+ if (vec_len (im->deleted_hw_interface_nodes) > 0)
+ {
+ vnet_hw_interface_nodes_t * hn;
+ vnet_interface_output_runtime_t * rt;
+
+ hn = vec_end (im->deleted_hw_interface_nodes) - 1;
+
+ hw->tx_node_index = hn->tx_node_index;
+ hw->output_node_index = hn->output_node_index;
+
+ vlib_node_rename (vm, hw->tx_node_index, "%v", tx_node_name);
+ vlib_node_rename (vm, hw->output_node_index, "%v", output_node_name);
+
+ rt = vlib_node_get_runtime_data (vm, hw->output_node_index);
+ ASSERT (rt->is_deleted == 1);
+ rt->is_deleted = 0;
+
+ _vec_len (im->deleted_hw_interface_nodes) -= 1;
+ }
+ else
+ {
+ vlib_node_registration_t r;
+ vnet_interface_output_runtime_t rt = {
+ .hw_if_index = hw_index,
+ .sw_if_index = hw->sw_if_index,
+ .dev_instance = hw->dev_instance,
+ .is_deleted = 0,
+ };
+
+ memset (&r, 0, sizeof (r));
+ r.type = VLIB_NODE_TYPE_INTERNAL;
+ r.runtime_data = &rt;
+ r.runtime_data_bytes = sizeof (rt);
+ r.scalar_size = 0;
+ r.vector_size = sizeof (u32);
+
+ r.flags = VLIB_NODE_FLAG_IS_OUTPUT;
+ r.name = tx_node_name;
+ r.function = dev_class->tx_function;
+
+ hw->tx_node_index = vlib_register_node (vm, &r);
+
+ vlib_node_add_named_next_with_slot (vm, hw->tx_node_index,
+ "error-drop",
+ VNET_INTERFACE_TX_NEXT_DROP);
+
+ r.flags = 0;
+ r.name = output_node_name;
+ r.function = dev_class->no_flatten_output_chains ?
+ vnet_interface_output_node_no_flatten :
+ vnet_interface_output_node;
+ r.format_trace = format_vnet_interface_output_trace;
+
+ {
+ static char * e[] = {
+ "interface is down",
+ "interface is deleted",
+ };
+
+ r.n_errors = ARRAY_LEN (e);
+ r.error_strings = e;
+ }
+
+ hw->output_node_index = vlib_register_node (vm, &r);
+
+#define _(sym,str) vlib_node_add_named_next_with_slot (vm, \
+ hw->output_node_index, str, \
+ VNET_INTERFACE_OUTPUT_NEXT_##sym);
+ foreach_intf_output_feat
+#undef _
+
+ vlib_node_add_named_next_with_slot (vm, hw->output_node_index,
+ "error-drop",
+ VNET_INTERFACE_OUTPUT_NEXT_DROP);
+ vlib_node_add_next_with_slot (vm, hw->output_node_index,
+ hw->tx_node_index,
+ VNET_INTERFACE_OUTPUT_NEXT_TX);
+ }
+
+ setup_output_node (vm, hw->output_node_index, hw_class);
+ setup_tx_node (vm, hw->tx_node_index, dev_class);
+
+ /* Call all up/down callbacks with zero flags when interface is created. */
+ vnet_sw_interface_set_flags_helper
+ (vnm, hw->sw_if_index, /* flags */ 0,
+ VNET_INTERFACE_SET_FLAGS_HELPER_IS_CREATE);
+ vnet_hw_interface_set_flags_helper
+ (vnm, hw_index, /* flags */ 0,
+ VNET_INTERFACE_SET_FLAGS_HELPER_IS_CREATE);
+
+ return hw_index;
+}
+
+void vnet_delete_hw_interface (vnet_main_t * vnm, u32 hw_if_index)
+{
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_hw_interface_t * hw = vnet_get_hw_interface (vnm, hw_if_index);
+ vlib_main_t * vm = vnm->vlib_main;
+
+ /* If it is up, mark it down. */
+ if (hw->flags != 0)
+ vnet_hw_interface_set_flags (vnm, hw_if_index, /* flags */ 0);
+
+ /* Call delete callbacks. */
+ call_hw_interface_add_del_callbacks (vnm, hw_if_index, /* is_create */ 0);
+
+ /* Delete software interface corresponding to hardware interface. */
+ vnet_delete_sw_interface (vnm, hw->sw_if_index);
+
+ /* Delete any sub-interfaces. */
+ {
+ u32 id, sw_if_index;
+ hash_foreach (id, sw_if_index, hw->sub_interface_sw_if_index_by_id, ({
+ vnet_delete_sw_interface (vnm, sw_if_index);
+ }));
+ }
+
+ {
+ vnet_hw_interface_nodes_t * dn;
+ vnet_interface_output_runtime_t * rt = vlib_node_get_runtime_data (vm, hw->output_node_index);
+
+ /* Mark node runtime as deleted so output node (if called) will drop packets. */
+ rt->is_deleted = 1;
+
+ vlib_node_rename (vm, hw->output_node_index, "interface-%d-output-deleted", hw_if_index);
+ vlib_node_rename (vm, hw->tx_node_index, "interface-%d-tx-deleted", hw_if_index);
+ vec_add2 (im->deleted_hw_interface_nodes, dn, 1);
+ dn->tx_node_index = hw->tx_node_index;
+ dn->output_node_index = hw->output_node_index;
+ }
+
+ hash_unset_mem (im->hw_interface_by_name, hw->name);
+ vec_free (hw->name);
+
+ pool_put (im->hw_interfaces, hw);
+}
+
+static void serialize_vnet_hw_interface_set_class (serialize_main_t * m, va_list * va)
+{
+ u32 hw_if_index = va_arg (*va, u32);
+ char * hw_class_name = va_arg (*va, char *);
+ serialize_integer (m, hw_if_index, sizeof (hw_if_index));
+ serialize_cstring (m, hw_class_name);
+}
+
+static void unserialize_vnet_hw_interface_set_class (serialize_main_t * m, va_list * va)
+{
+ CLIB_UNUSED (mc_main_t * mc) = va_arg (*va, mc_main_t *);
+ vnet_main_t * vnm = vnet_get_main();
+ u32 hw_if_index;
+ char * hw_class_name;
+ uword * p;
+ clib_error_t * error;
+
+ unserialize_integer (m, &hw_if_index, sizeof (hw_if_index));
+ unserialize_cstring (m, &hw_class_name);
+ p = hash_get (vnm->interface_main.hw_interface_class_by_name, hw_class_name);
+ ASSERT (p != 0);
+ error = vnet_hw_interface_set_class_helper (vnm, hw_if_index, p[0], /* redistribute */ 0);
+ if (error)
+ clib_error_report (error);
+}
+
+MC_SERIALIZE_MSG (vnet_hw_interface_set_class_msg, static) = {
+ .name = "vnet_hw_interface_set_class",
+ .serialize = serialize_vnet_hw_interface_set_class,
+ .unserialize = unserialize_vnet_hw_interface_set_class,
+};
+
+void vnet_hw_interface_init_for_class (vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index, u32 hw_instance)
+{
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ vnet_hw_interface_class_t * hc = vnet_get_hw_interface_class (vnm, hw_class_index);
+
+ hi->hw_class_index = hw_class_index;
+ hi->hw_instance = hw_instance;
+ setup_output_node (vnm->vlib_main, hi->output_node_index, hc);
+}
+
+static clib_error_t *
+vnet_hw_interface_set_class_helper (vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index, u32 redistribute)
+{
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ vnet_sw_interface_t * si = vnet_get_sw_interface (vnm, hi->sw_if_index);
+ vnet_hw_interface_class_t * old_class = vnet_get_hw_interface_class (vnm, hi->hw_class_index);
+ vnet_hw_interface_class_t * new_class = vnet_get_hw_interface_class (vnm, hw_class_index);
+ vnet_device_class_t * dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+ clib_error_t * error = 0;
+
+ /* New class equals old class? Nothing to do. */
+ if (hi->hw_class_index == hw_class_index)
+ return 0;
+
+  /* Skip error checking when receiving an unserialize message: it would
+     be incorrect, since the admin up flag may already be set. */
+ if (redistribute)
+ {
+ if (si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+ return clib_error_return (0, "%v must be admin down to change class from %s to %s",
+ hi->name, old_class->name, new_class->name);
+
+ /* Make sure interface supports given class. */
+ if ((new_class->is_valid_class_for_interface
+ && ! new_class->is_valid_class_for_interface (vnm, hw_if_index, hw_class_index))
+      || (dev_class->is_valid_class_for_interface
+ && ! dev_class->is_valid_class_for_interface (vnm, hw_if_index, hw_class_index)))
+ return clib_error_return (0, "%v class cannot be changed from %s to %s",
+ hi->name, old_class->name, new_class->name);
+
+ if (vnm->vlib_main->mc_main)
+ {
+ mc_serialize (vnm->vlib_main->mc_main, &vnet_hw_interface_set_class_msg, hw_if_index, new_class->name);
+ return 0;
+ }
+ }
+
+ if (old_class->hw_class_change)
+ old_class->hw_class_change (vnm, hw_if_index, old_class->index, new_class->index);
+
+ vnet_hw_interface_init_for_class (vnm, hw_if_index, new_class->index, /* instance */ ~0);
+
+ if (new_class->hw_class_change)
+ new_class->hw_class_change (vnm, hw_if_index, old_class->index, new_class->index);
+
+ if (dev_class->hw_class_change)
+ dev_class->hw_class_change (vnm, hw_if_index, new_class->index);
+
+ return error;
+}
+
+clib_error_t *
+vnet_hw_interface_set_class (vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index)
+{ return vnet_hw_interface_set_class_helper (vnm, hw_if_index, hw_class_index, /* redistribute */ 1); }
+
+static int
+vnet_hw_interface_rx_redirect_to_node_helper (vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 node_index,
+ u32 redistribute)
+{
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ vnet_device_class_t * dev_class = vnet_get_device_class
+ (vnm, hi->dev_class_index);
+
+ if (redistribute)
+ {
+ /* $$$$ fixme someday maybe */
+ ASSERT(vnm->vlib_main->mc_main == 0);
+ }
+ if (dev_class->rx_redirect_to_node)
+ {
+ dev_class->rx_redirect_to_node (vnm, hw_if_index, node_index);
+ return 0;
+ }
+
+ return VNET_API_ERROR_UNIMPLEMENTED;
+}
+
+int vnet_hw_interface_rx_redirect_to_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index)
+{ return vnet_hw_interface_rx_redirect_to_node_helper (vnm, hw_if_index,
+ node_index,
+ 1 /* redistribute */); }
+
+word
+vnet_sw_interface_compare (vnet_main_t * vnm,
+ uword sw_if_index0, uword sw_if_index1)
+{
+ vnet_sw_interface_t * sup0 = vnet_get_sup_sw_interface (vnm, sw_if_index0);
+ vnet_sw_interface_t * sup1 = vnet_get_sup_sw_interface (vnm, sw_if_index1);
+ vnet_hw_interface_t * h0 = vnet_get_hw_interface (vnm, sup0->hw_if_index);
+ vnet_hw_interface_t * h1 = vnet_get_hw_interface (vnm, sup1->hw_if_index);
+
+ if (h0 != h1)
+ return vec_cmp (h0->name, h1->name);
+ return (word) h0->hw_instance - (word) h1->hw_instance;
+}
+
+word
+vnet_hw_interface_compare (vnet_main_t * vnm,
+ uword hw_if_index0, uword hw_if_index1)
+{
+ vnet_hw_interface_t * h0 = vnet_get_hw_interface (vnm, hw_if_index0);
+ vnet_hw_interface_t * h1 = vnet_get_hw_interface (vnm, hw_if_index1);
+
+ if (h0 != h1)
+ return vec_cmp (h0->name, h1->name);
+ return (word) h0->hw_instance - (word) h1->hw_instance;
+}
+
+clib_error_t *
+vnet_interface_init (vlib_main_t * vm)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vlib_buffer_t * b = 0;
+ vnet_buffer_opaque_t * o = 0;
+
+ /*
+ * Keep people from shooting themselves in the foot.
+ */
+ if (sizeof(b->opaque) != sizeof (vnet_buffer_opaque_t))
+ {
+#define _(a) if (sizeof(o->a) > sizeof (o->unused)) \
+ clib_warning \
+ ("FATAL: size of opaque union subtype %s is %d (max %d)", \
+ #a, sizeof(o->a), sizeof (o->unused));
+ foreach_buffer_opaque_union_subtype;
+#undef _
+
+ return clib_error_return
+ (0, "FATAL: size of vlib buffer opaque %d, size of vnet opaque %d",
+ sizeof(b->opaque), sizeof (vnet_buffer_opaque_t));
+ }
+
+ im->sw_if_counter_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES,
+ CLIB_CACHE_LINE_BYTES);
+  im->sw_if_counter_lock[0] = 1; /* hold the lock during init; it should be uncontended */
+
+ vec_validate (im->sw_if_counters,
+ VNET_N_SIMPLE_INTERFACE_COUNTER - 1);
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_DROP].name = "drops";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_PUNT].name = "punts";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_IP4].name = "ip4";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_IP6].name = "ip6";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_RX_NO_BUF].name = "rx-no-buf";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_RX_MISS].name = "rx-miss";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_RX_ERROR].name = "rx-error";
+ im->sw_if_counters[VNET_INTERFACE_COUNTER_TX_ERROR].name = "tx-error";
+
+ vec_validate (im->combined_sw_if_counters,
+ VNET_N_COMBINED_INTERFACE_COUNTER - 1);
+ im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_RX].name = "rx";
+ im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_TX].name = "tx";
+
+ im->sw_if_counter_lock[0] = 0;
+
+ im->device_class_by_name = hash_create_string (/* size */ 0,
+ sizeof (uword));
+ {
+ vnet_device_class_t * c;
+
+ c = vnm->device_class_registrations;
+
+ while (c)
+ {
+ c->index = vec_len (im->device_classes);
+ hash_set_mem (im->device_class_by_name, c->name, c->index);
+ vec_add1 (im->device_classes, c[0]);
+ c = c->next_class_registration;
+ }
+ }
+
+ im->hw_interface_class_by_name = hash_create_string (/* size */ 0,
+ sizeof (uword));
+
+ im->sw_if_index_by_sup_and_sub = hash_create_mem (0, sizeof(u64),
+ sizeof (uword));
+ {
+ vnet_hw_interface_class_t * c;
+
+ c = vnm->hw_interface_class_registrations;
+
+ while (c)
+ {
+ c->index = vec_len (im->hw_interface_classes);
+ hash_set_mem (im->hw_interface_class_by_name, c->name, c->index);
+ vec_add1 (im->hw_interface_classes, c[0]);
+ c = c->next_class_registration;
+ }
+ }
+
+  return vlib_call_init_function (vm, vnet_interface_cli_init);
+}
+
+VLIB_INIT_FUNCTION (vnet_interface_init);
+
+/* Kludge to renumber interface names [only!] */
+int vnet_interface_name_renumber (u32 sw_if_index, u32 new_show_dev_instance)
+{
+ int rv;
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_hw_interface_t * hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ vnet_device_class_t * dev_class = vnet_get_device_class
+ (vnm, hi->dev_class_index);
+
+ if (dev_class->name_renumber == 0 || dev_class->format_device_name == 0)
+ return VNET_API_ERROR_UNIMPLEMENTED;
+
+ rv = dev_class->name_renumber (hi, new_show_dev_instance);
+
+ if (rv)
+ return rv;
+
+ hash_unset_mem (im->hw_interface_by_name, hi->name);
+ vec_free (hi->name);
+ /* Use the mapping we set up to call it Ishmael */
+ hi->name = format (0, "%U", dev_class->format_device_name,
+ hi->dev_instance);
+
+ hash_set_mem (im->hw_interface_by_name, hi->name, hi->hw_if_index);
+ return rv;
+}
+
+int vnet_interface_add_del_feature(vnet_main_t * vnm,
+ vlib_main_t *vm,
+ u32 sw_if_index,
+ intf_output_feat_t feature,
+ int is_add)
+{
+ vnet_sw_interface_t * sw;
+
+ sw = vnet_get_sw_interface(vnm, sw_if_index);
+
+  if (is_add) {
+    sw->output_feature_bitmap |= (1 << feature);
+    sw->output_feature_bitmap |= (1 << INTF_OUTPUT_FEAT_DONE);
+  } else { /* delete */
+    sw->output_feature_bitmap &= ~(1 << feature);
+    if (sw->output_feature_bitmap == (1 << INTF_OUTPUT_FEAT_DONE))
+      sw->output_feature_bitmap = 0;
+  }
+ return 0;
+}
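+
+/* Usage sketch (sw_if_index is assumed to be a valid software interface
+ * index): enable the IPSEC output feature on an interface.
+ *
+ *   // is_add = 1 to enable, 0 to disable
+ *   vnet_interface_add_del_feature (vnm, vm, sw_if_index,
+ *                                   INTF_OUTPUT_FEAT_IPSEC, 1);
+ */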
+
diff --git a/vnet/vnet/interface.h b/vnet/vnet/interface.h
new file mode 100644
index 00000000000..f441f9e1027
--- /dev/null
+++ b/vnet/vnet/interface.h
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * interface.h: VNET interfaces/sub-interfaces
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_interface_h
+#define included_vnet_interface_h
+
+#include <vnet/unix/pcap.h>
+
+struct vnet_main_t;
+struct vnet_hw_interface_t;
+struct vnet_sw_interface_t;
+
+/* Interface up/down callback. */
+typedef clib_error_t * (vnet_interface_function_t)
+ (struct vnet_main_t * vnm, u32 if_index, u32 flags);
+
+/* Sub-interface add/del callback. */
+typedef clib_error_t * (vnet_subif_add_del_function_t)
+ (struct vnet_main_t * vnm, u32 if_index,
+ struct vnet_sw_interface_t * template,
+ int is_add);
+
+typedef struct _vnet_interface_function_list_elt {
+ struct _vnet_interface_function_list_elt * next_interface_function;
+ clib_error_t * (*fp) (struct vnet_main_t * vnm, u32 if_index, u32 flags);
+} _vnet_interface_function_list_elt_t;
+
+#define _VNET_INTERFACE_FUNCTION_DECL(f,tag) \
+ \
+static void __vnet_interface_function_init_##tag##_##f (void) \
+ __attribute__((__constructor__)) ; \
+ \
+static void __vnet_interface_function_init_##tag##_##f (void) \
+{ \
+ vnet_main_t * vnm = vnet_get_main(); \
+ static _vnet_interface_function_list_elt_t init_function; \
+ init_function.next_interface_function = vnm->tag##_functions; \
+ vnm->tag##_functions = &init_function; \
+ init_function.fp = (void *) &f; \
+}
+
+#define VNET_HW_INTERFACE_ADD_DEL_FUNCTION(f) \
+ _VNET_INTERFACE_FUNCTION_DECL(f,hw_interface_add_del)
+#define VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION(f) \
+ _VNET_INTERFACE_FUNCTION_DECL(f,hw_interface_link_up_down)
+#define VNET_SW_INTERFACE_ADD_DEL_FUNCTION(f) \
+ _VNET_INTERFACE_FUNCTION_DECL(f,sw_interface_add_del)
+#define VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION(f) \
+ _VNET_INTERFACE_FUNCTION_DECL(f,sw_interface_admin_up_down)
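+
+/* Usage sketch (my_admin_up_down is a hypothetical callback; any static
+ * function with the vnet_interface_function_t signature can be registered
+ * this way):
+ *
+ *   static clib_error_t *
+ *   my_admin_up_down (struct vnet_main_t * vnm, u32 sw_if_index, u32 flags)
+ *   { return 0; }
+ *
+ *   VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (my_admin_up_down);
+ */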
+
+/* A class of hardware interface devices. */
+typedef struct _vnet_device_class {
+ /* Index into main vector. */
+ u32 index;
+
+ /* Device name (e.g. "FOOBAR 1234a"). */
+ char * name;
+
+ /* Function to call when hardware interface is added/deleted. */
+ vnet_interface_function_t * interface_add_del_function;
+
+ /* Function to bring device administratively up/down. */
+ vnet_interface_function_t * admin_up_down_function;
+
+ /* Function to call when sub-interface is added/deleted */
+ vnet_subif_add_del_function_t * subif_add_del_function;
+
+  /* Redistribute flag changes and existence of interfaces of this class. */
+ u32 redistribute;
+
+ /* Transmit function. */
+ vlib_node_function_t * tx_function;
+
+ /* Error strings indexed by error code for this node. */
+ char ** tx_function_error_strings;
+
+ /* Number of error codes used by this node. */
+ u32 tx_function_n_errors;
+
+ /* Renumber device name [only!] support, a control-plane kludge */
+ int (*name_renumber) (struct vnet_hw_interface_t * hi, u32 new_dev_instance);
+
+ /* Format device instance as name. */
+ format_function_t * format_device_name;
+
+ /* Parse function for device name. */
+ unformat_function_t * unformat_device_name;
+
+ /* Format device verbosely for this class. */
+ format_function_t * format_device;
+
+ /* Trace buffer format for TX function. */
+ format_function_t * format_tx_trace;
+
+ /* Function to clear hardware counters for device. */
+ void (* clear_counters) (u32 dev_class_instance);
+
+ uword (* is_valid_class_for_interface) (struct vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index);
+
+ /* Called when hardware class of an interface changes. */
+ void ( * hw_class_change) (struct vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 new_hw_class_index);
+
+ /* Called to redirect traffic from a specific interface instance */
+ void (* rx_redirect_to_node) (struct vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 node_index);
+
+  /* Linked list of all device classes, set up by the constructors created below */
+ struct _vnet_device_class * next_class_registration;
+
+ /* Do not splice vnet_interface_output_node into TX path */
+ u8 no_flatten_output_chains;
+
+} vnet_device_class_t;
+
+#define VNET_DEVICE_CLASS(x,...) \
+ __VA_ARGS__ vnet_device_class_t x; \
+static void __vnet_add_device_class_registration_##x (void) \
+ __attribute__((__constructor__)) ; \
+static void __vnet_add_device_class_registration_##x (void) \
+{ \
+ vnet_main_t * vnm = vnet_get_main(); \
+ x.next_class_registration = vnm->device_class_registrations; \
+ vnm->device_class_registrations = &x; \
+} \
+__VA_ARGS__ vnet_device_class_t x
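+
+/* Registration sketch (my_device_class is hypothetical; the tx/format
+ * functions are placeholders, not real symbols):
+ *
+ *   VNET_DEVICE_CLASS (my_device_class) = {
+ *     .name = "my-device",
+ *     .tx_function = my_tx_node_fn,
+ *     .format_device_name = format_my_device_name,
+ *   };
+ */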
+
+/* Layer-2 (e.g. Ethernet) interface class. */
+typedef struct _vnet_hw_interface_class {
+ /* Index into main vector. */
+ u32 index;
+
+ /* Class name (e.g. "Ethernet"). */
+ char * name;
+
+ /* Function to call when hardware interface is added/deleted. */
+ vnet_interface_function_t * interface_add_del_function;
+
+ /* Function to bring interface administratively up/down. */
+ vnet_interface_function_t * admin_up_down_function;
+
+ /* Function to call when link state changes. */
+ vnet_interface_function_t * link_up_down_function;
+
+ /* Format function to display interface name. */
+ format_function_t * format_interface_name;
+
+ /* Format function to display interface address. */
+ format_function_t * format_address;
+
+ /* Format packet header for this interface class. */
+ format_function_t * format_header;
+
+ /* Format device verbosely for this class. */
+ format_function_t * format_device;
+
+ /* Parser for hardware (e.g. ethernet) address. */
+ unformat_function_t * unformat_hw_address;
+
+ /* Parser for packet header for e.g. rewrite string. */
+ unformat_function_t * unformat_header;
+
+ /* Forms adjacency for given l3 packet type and destination address.
+ Returns number of bytes in adjacency. */
+ uword (* set_rewrite) (struct vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_packet_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes);
+
+ uword (* is_valid_class_for_interface) (struct vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index);
+
+ /* Called when hw interface class is changed and old hardware instance
+ may want to be deleted. */
+ void (* hw_class_change) (struct vnet_main_t * vnm, u32 hw_if_index, u32 old_class_index, u32 new_class_index);
+
+ /* List of hw interface classes, built by constructors */
+ struct _vnet_hw_interface_class * next_class_registration;
+
+} vnet_hw_interface_class_t;
+
+#define VNET_HW_INTERFACE_CLASS(x,...) \
+ __VA_ARGS__ vnet_hw_interface_class_t x; \
+static void __vnet_add_hw_interface_class_registration_##x (void) \
+ __attribute__((__constructor__)) ; \
+static void __vnet_add_hw_interface_class_registration_##x (void) \
+{ \
+ vnet_main_t * vnm = vnet_get_main(); \
+ x.next_class_registration = vnm->hw_interface_class_registrations; \
+ vnm->hw_interface_class_registrations = &x; \
+} \
+__VA_ARGS__ vnet_hw_interface_class_t x
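+
+/* Same registration pattern as VNET_DEVICE_CLASS, e.g. (hypothetical):
+ *
+ *   VNET_HW_INTERFACE_CLASS (my_hw_class) = {
+ *     .name = "my-hw-class",
+ *     .format_header = format_my_header,
+ *   };
+ */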
+
+/* Hardware-interface. This corresponds to a physical wire
+ that packets flow over. */
+typedef struct vnet_hw_interface_t {
+ /* Interface name. */
+ u8 * name;
+
+ u32 flags;
+ /* Hardware link state is up. */
+#define VNET_HW_INTERFACE_FLAG_LINK_UP (1 << 0)
+ /* Hardware duplex state */
+#define VNET_HW_INTERFACE_FLAG_DUPLEX_SHIFT 1
+#define VNET_HW_INTERFACE_FLAG_HALF_DUPLEX (1 << 1)
+#define VNET_HW_INTERFACE_FLAG_FULL_DUPLEX (1 << 2)
+#define VNET_HW_INTERFACE_FLAG_DUPLEX_MASK \
+ (VNET_HW_INTERFACE_FLAG_HALF_DUPLEX | \
+ VNET_HW_INTERFACE_FLAG_FULL_DUPLEX)
+
+ /* Hardware link speed */
+#define VNET_HW_INTERFACE_FLAG_SPEED_SHIFT 3
+#define VNET_HW_INTERFACE_FLAG_SPEED_10M (1 << 3)
+#define VNET_HW_INTERFACE_FLAG_SPEED_100M (1 << 4)
+#define VNET_HW_INTERFACE_FLAG_SPEED_1G (1 << 5)
+#define VNET_HW_INTERFACE_FLAG_SPEED_10G (1 << 6)
+#define VNET_HW_INTERFACE_FLAG_SPEED_40G (1 << 7)
+#define VNET_HW_INTERFACE_FLAG_SPEED_100G (1 << 8)
+#define VNET_HW_INTERFACE_FLAG_SPEED_MASK \
+ (VNET_HW_INTERFACE_FLAG_SPEED_10M | \
+ VNET_HW_INTERFACE_FLAG_SPEED_100M | \
+ VNET_HW_INTERFACE_FLAG_SPEED_1G | \
+ VNET_HW_INTERFACE_FLAG_SPEED_10G | \
+ VNET_HW_INTERFACE_FLAG_SPEED_40G | \
+ VNET_HW_INTERFACE_FLAG_SPEED_100G)
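+
+  /* Example: a full-duplex 1 Gbps link satisfies
+     (flags & VNET_HW_INTERFACE_FLAG_SPEED_MASK) == VNET_HW_INTERFACE_FLAG_SPEED_1G
+     and (flags & VNET_HW_INTERFACE_FLAG_DUPLEX_MASK) == VNET_HW_INTERFACE_FLAG_FULL_DUPLEX. */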
+
+  /* Hardware address as vector. Zero (i.e. a zero-length vector) if this
+     class has no address (e.g. PPP). */
+ u8 * hw_address;
+
+  /* NAME.{output,tx} nodes for this interface. */
+ u32 output_node_index, tx_node_index;
+
+ /* (dev_class, dev_instance) uniquely identifies hw interface. */
+ u32 dev_class_index;
+ u32 dev_instance;
+
+ /* (hw_class, hw_instance) uniquely identifies hw interface. */
+ u32 hw_class_index;
+ u32 hw_instance;
+
+ /* Hardware index for this hardware interface. */
+ u32 hw_if_index;
+
+ /* Software index for this hardware interface. */
+ u32 sw_if_index;
+
+ /* Maximum transmit rate for this interface in bits/sec. */
+ f64 max_rate_bits_per_sec;
+
+ /* Smallest packet size for this interface. */
+ u32 min_packet_bytes;
+
+ /* Largest packet size for this interface. */
+ u32 max_packet_bytes;
+
+ /* Number of extra bytes that go on the wire.
+ Packet length on wire
+ = max (length + per_packet_overhead_bytes, min_packet_bytes). */
+ u32 per_packet_overhead_bytes;
+
+ /* Receive and transmit layer 3 packet size limits (MRU/MTU). */
+ u32 max_l3_packet_bytes[VLIB_N_RX_TX];
+
+ /* Hash table mapping sub interface id to sw_if_index. */
+ uword * sub_interface_sw_if_index_by_id;
+
+  /* Number of L2 subinterfaces */
+ u32 l2_if_count;
+} vnet_hw_interface_t;
+
+typedef enum {
+ /* A hw interface. */
+ VNET_SW_INTERFACE_TYPE_HARDWARE,
+
+ /* A sub-interface. */
+ VNET_SW_INTERFACE_TYPE_SUB,
+} vnet_sw_interface_type_t;
+
+typedef struct {
+  // Subinterface ID: a number 0-N that uniquely identifies this
+  // subinterface under its main (parent) interface.
+ u32 id;
+
+ // Classification data. Used to associate packet header with subinterface.
+ struct {
+ u16 outer_vlan_id;
+ u16 inner_vlan_id;
+ union {
+ u16 raw_flags;
+ struct {
+ u16 no_tags:1;
+ u16 one_tag:1;
+ u16 two_tags:1;
+ u16 dot1ad:1; // 0 = dot1q, 1=dot1ad
+ u16 exact_match:1;
+ u16 default_sub:1;
+ u16 outer_vlan_id_any:1;
+ u16 inner_vlan_id_any:1;
+ } flags;
+ };
+ } eth;
+} vnet_sub_interface_t;
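+
+/* Example: the CLI config "dot1q 100 exact-match" yields one_tag = 1,
+   outer_vlan_id = 100, exact_match = 1 (see parse_vlan_sub_interfaces
+   in interface_cli.c). */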
+
+/* Software interface. This corresponds to an Ethernet VLAN, an ATM VC,
+   a tunnel, etc. Configuration (e.g. an IP address) gets attached to
+   the software interface. */
+typedef struct {
+ vnet_sw_interface_type_t type : 16;
+
+ u16 flags;
+ /* Interface is "up" meaning adminstratively up.
+ Up in the sense of link state being up is maintained by hardware interface. */
+#define VNET_SW_INTERFACE_FLAG_ADMIN_UP (1 << 0)
+
+ /* Interface is disabled for forwarding: punt all traffic to slow-path. */
+#define VNET_SW_INTERFACE_FLAG_PUNT (1 << 1)
+
+#define VNET_SW_INTERFACE_FLAG_PROXY_ARP (1 << 2)
+
+#define VNET_SW_INTERFACE_FLAG_UNNUMBERED (1 << 3)
+
+ /* Index for this interface. */
+ u32 sw_if_index;
+
+ /* Software interface index of super-interface;
+ equal to sw_if_index if this interface is not a
+ sub-interface. */
+ u32 sup_sw_if_index;
+
+  /* If this interface is unnumbered, use the addresses of unnumbered_sw_if_index. */
+ u32 unnumbered_sw_if_index;
+
+ u32 link_speed;
+
+ u32 output_feature_bitmap;
+
+ union {
+ /* VNET_SW_INTERFACE_TYPE_HARDWARE. */
+ u32 hw_if_index;
+
+ /* VNET_SW_INTERFACE_TYPE_SUB. */
+ vnet_sub_interface_t sub;
+
+ /* SW interfaces are sorted by type and key. */
+ // u32 sort_key;
+ };
+} vnet_sw_interface_t;
+
+typedef enum {
+ /* Simple counters. */
+ VNET_INTERFACE_COUNTER_DROP = 0,
+ VNET_INTERFACE_COUNTER_PUNT = 1,
+ VNET_INTERFACE_COUNTER_IP4 = 2,
+ VNET_INTERFACE_COUNTER_IP6 = 3,
+ VNET_INTERFACE_COUNTER_RX_NO_BUF = 4,
+ VNET_INTERFACE_COUNTER_RX_MISS = 5,
+ VNET_INTERFACE_COUNTER_RX_ERROR = 6,
+ VNET_INTERFACE_COUNTER_TX_ERROR = 7,
+ VNET_N_SIMPLE_INTERFACE_COUNTER = 8,
+  /* Combined counters; these index a separate vector, hence the restart at 0. */
+ VNET_INTERFACE_COUNTER_RX = 0,
+ VNET_INTERFACE_COUNTER_TX = 1,
+ VNET_N_COMBINED_INTERFACE_COUNTER = 2,
+} vnet_interface_counter_type_t;
+
+typedef struct {
+ u32 output_node_index;
+ u32 tx_node_index;
+} vnet_hw_interface_nodes_t;
+
+typedef struct {
+ /* Hardware interfaces. */
+ vnet_hw_interface_t * hw_interfaces;
+
+ /* Hash table mapping HW interface name to index. */
+ uword * hw_interface_by_name;
+
+  /* Vectors of hardware interface classes and device classes. */
+ vnet_hw_interface_class_t * hw_interface_classes;
+ vnet_device_class_t * device_classes;
+
+ /* Hash table mapping name to hw interface/device class. */
+ uword * hw_interface_class_by_name;
+ uword * device_class_by_name;
+
+ /* Software interfaces. */
+ vnet_sw_interface_t * sw_interfaces;
+
+ /* Hash table mapping sub intfc sw_if_index by sup sw_if_index and sub id */
+ uword * sw_if_index_by_sup_and_sub;
+
+  /* Software interface counters: both simple counters and combined
+     packet/byte counters. */
+ volatile u32 *sw_if_counter_lock;
+ vlib_simple_counter_main_t * sw_if_counters;
+ vlib_combined_counter_main_t * combined_sw_if_counters;
+
+ vnet_hw_interface_nodes_t * deleted_hw_interface_nodes;
+
+ /* pcap drop tracing */
+ int drop_pcap_enable;
+ pcap_main_t pcap_main;
+ u8 * pcap_filename;
+ u32 pcap_sw_if_index;
+ u32 pcap_pkts_to_capture;
+ uword * pcap_drop_filter_hash;
+
+} vnet_interface_main_t;
+
+static inline void vnet_interface_counter_lock (vnet_interface_main_t *im)
+{
+ if (im->sw_if_counter_lock)
+ while (__sync_lock_test_and_set (im->sw_if_counter_lock, 1))
+ /* zzzz */ ;
+}
+static inline void vnet_interface_counter_unlock (vnet_interface_main_t *im)
+{
+ if (im->sw_if_counter_lock)
+ *im->sw_if_counter_lock = 0;
+}
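+
+/* Typical pattern (as used when creating an interface in interface.c):
+ *
+ *   vnet_interface_counter_lock (im);
+ *   vlib_validate_simple_counter
+ *     (&im->sw_if_counters[VNET_INTERFACE_COUNTER_DROP], sw_if_index);
+ *   vnet_interface_counter_unlock (im);
+ */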
+
+void vnet_pcap_drop_trace_filter_add_del (u32 error_index, int is_add);
+
+int vnet_interface_name_renumber (u32 sw_if_index, u32 new_show_dev_instance);
+
+
+/*
+ * Output features
+ */
+
+#define foreach_intf_output_feat \
+ _(IPSEC, "ipsec-output")
+
+// Feature bitmap positions
+typedef enum {
+#define _(sym,str) INTF_OUTPUT_FEAT_##sym,
+ foreach_intf_output_feat
+#undef _
+ INTF_OUTPUT_N_FEAT,
+} intf_output_feat_t;
+
+/* flag that we are done with feature path */
+#define INTF_OUTPUT_FEAT_DONE INTF_OUTPUT_N_FEAT
+
+int vnet_interface_add_del_feature(struct vnet_main_t * vnm, vlib_main_t * vm,
+ u32 sw_if_index,
+ intf_output_feat_t feature, int is_add);
+
+#endif /* included_vnet_interface_h */
diff --git a/vnet/vnet/interface_cli.c b/vnet/vnet/interface_cli.c
new file mode 100644
index 00000000000..8d986527fc6
--- /dev/null
+++ b/vnet/vnet/interface_cli.c
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * interface_cli.c: interface CLI
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+
+static int compare_interface_names (void *a1, void *a2)
+{
+ u32 * hi1 = a1;
+ u32 * hi2 = a2;
+
+ return vnet_hw_interface_compare (vnet_get_main(), *hi1, *hi2);
+}
+
+static clib_error_t *
+show_or_clear_hw_interfaces (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_hw_interface_t * hi;
+ u32 hw_if_index, * hw_if_indices = 0;
+ int i, verbose = 1, is_show;
+
+ is_show = strstr (cmd->path, "show") != 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* See if user wants to show a specific interface. */
+ if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index))
+ {
+ vec_add1 (hw_if_indices, hw_if_index);
+ /* Implies verbose. */
+ verbose = 1;
+ }
+
+ else if (unformat (input, "verbose"))
+ verbose = 1;
+
+ else if (unformat (input, "detail"))
+ verbose = 2;
+
+ else if (unformat (input, "brief"))
+ verbose = 0;
+
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ /* Gather interfaces. */
+ if (vec_len (hw_if_indices) == 0)
+ pool_foreach (hi, im->hw_interfaces,
+ vec_add1 (hw_if_indices, hi - im->hw_interfaces));
+
+ if (is_show)
+ {
+ /* Sort by name. */
+ vec_sort_with_function (hw_if_indices, compare_interface_names);
+
+ vlib_cli_output (vm, "%U\n", format_vnet_hw_interface, vnm, 0, verbose);
+ for (i = 0; i < vec_len (hw_if_indices); i++)
+ {
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ vlib_cli_output (vm, "%U\n", format_vnet_hw_interface, vnm, hi, verbose);
+ }
+ }
+ else
+ {
+ for (i = 0; i < vec_len (hw_if_indices); i++)
+ {
+ vnet_device_class_t * dc;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_indices[i]);
+ dc = vec_elt_at_index (im->device_classes, hi->dev_class_index);
+
+ if (dc->clear_counters)
+ dc->clear_counters (hi->dev_instance);
+ }
+ }
+
+ done:
+ vec_free (hw_if_indices);
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_hw_interfaces_command, static) = {
+ .path = "show hardware-interfaces",
+ .short_help = "show hardware-interfaces [verbose|brief] [<if-name1> <if-name2> ...]",
+ .function = show_or_clear_hw_interfaces,
+};
+
+VLIB_CLI_COMMAND (clear_hw_interface_counters_command, static) = {
+ .path = "clear hardware-interfaces",
+ .short_help = "Clear hardware interfaces statistics",
+ .function = show_or_clear_hw_interfaces,
+};
+
+static int sw_interface_name_compare (void *a1, void *a2)
+{
+ vnet_sw_interface_t *si1 = a1;
+ vnet_sw_interface_t *si2 = a2;
+
+ return vnet_sw_interface_compare (vnet_get_main(),
+ si1->sw_if_index, si2->sw_if_index);
+}
+
+static clib_error_t *
+show_sw_interfaces (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * si, * sorted_sis = 0;
+ u8 show_addresses = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ u32 sw_if_index;
+
+ /* See if user wants to show specific interface */
+ if (unformat (input, "%U", unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ si = pool_elt_at_index (im->sw_interfaces, sw_if_index);
+ vec_add1 (sorted_sis, si[0]);
+ }
+
+ else if (unformat (input, "address") || unformat (input, "addr"))
+ show_addresses = 1;
+
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ if (!show_addresses)
+ vlib_cli_output (vm, "%U\n", format_vnet_sw_interface, vnm, 0);
+
+ if (vec_len (sorted_sis) == 0) /* Get all interfaces */
+ {
+ /* Gather interfaces. */
+ sorted_sis = vec_new (vnet_sw_interface_t, pool_elts (im->sw_interfaces));
+ _vec_len (sorted_sis) = 0;
+ pool_foreach (si, im->sw_interfaces, ({ vec_add1 (sorted_sis, si[0]); }));
+
+ /* Sort by name. */
+ vec_sort_with_function (sorted_sis, sw_interface_name_compare);
+ }
+
+ if (show_addresses)
+ {
+ vec_foreach (si, sorted_sis)
+ {
+ l2input_main_t * l2m = &l2input_main;
+ ip4_main_t * im4 = &ip4_main;
+ ip6_main_t * im6 = &ip6_main;
+ ip_lookup_main_t * lm4 = &im4->lookup_main;
+ ip_lookup_main_t * lm6 = &im6->lookup_main;
+ ip_interface_address_t * ia = 0;
+ ip4_address_t * r4;
+ ip6_address_t * r6;
+ u32 fib_index4 = 0, fib_index6 = 0;
+ ip4_fib_t * fib4;
+ ip6_fib_t * fib6;
+ l2_input_config_t * config;
+
+ if (vec_len (im4->fib_index_by_sw_if_index) > si->sw_if_index)
+ fib_index4 = vec_elt (im4->fib_index_by_sw_if_index,
+ si->sw_if_index);
+
+ if (vec_len (im6->fib_index_by_sw_if_index) > si->sw_if_index)
+ fib_index6 = vec_elt (im6->fib_index_by_sw_if_index,
+ si->sw_if_index);
+
+ fib4 = vec_elt_at_index (im4->fibs, fib_index4);
+ fib6 = vec_elt_at_index (im6->fibs, fib_index6);
+
+ if (si->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED)
+ vlib_cli_output
+ (vm, "%U (%s): \n unnumbered, use %U",
+ format_vnet_sw_if_index_name,
+ vnm, si->sw_if_index,
+ (si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? "up" : "dn",
+ format_vnet_sw_if_index_name,
+ vnm, si->unnumbered_sw_if_index);
+
+ else
+ {
+ vlib_cli_output (vm, "%U (%s):",
+ format_vnet_sw_if_index_name,
+ vnm, si->sw_if_index,
+ (si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)
+ ? "up" : "dn");
+ }
+
+ /* Display any L2 addressing info */
+ vec_validate(l2m->configs, si->sw_if_index);
+ config = vec_elt_at_index(l2m->configs, si->sw_if_index);
+ if (config->bridge)
+ {
+ u32 bd_id = l2input_main.bd_configs[config->bd_index].bd_id;
+ vlib_cli_output (vm, " l2 bridge bd_id %d%s%d", bd_id,
+ config->bvi ? " bvi shg " : " shg ", config->shg);
+ }
+ else if (config->xconnect)
+ {
+ vlib_cli_output (vm, " l2 xconnect %U",
+ format_vnet_sw_if_index_name,
+ vnm, config->output_sw_if_index);
+ }
+
+ /* Display any IP4 addressing info */
+ foreach_ip_interface_address (lm4, ia, si->sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ r4 = ip_interface_address_get_address (lm4, ia);
+ if (fib4->table_id)
+ {
+ vlib_cli_output (vm, " %U/%d table %d",
+ format_ip4_address, r4,
+ ia->address_length,
+ fib4->table_id);
+ }
+ else
+ {
+ vlib_cli_output (vm, " %U/%d",
+ format_ip4_address, r4,
+ ia->address_length);
+ }
+ }));
+
+ /* Display any IP6 addressing info */
+ foreach_ip_interface_address (lm6, ia, si->sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ r6 = ip_interface_address_get_address (lm6, ia);
+ if (fib6->table_id)
+ {
+ vlib_cli_output (vm, " %U/%d table %d",
+ format_ip6_address, r6,
+ ia->address_length,
+ fib6->table_id);
+ }
+ else
+ {
+ vlib_cli_output (vm, " %U/%d",
+ format_ip6_address, r6,
+ ia->address_length);
+ }
+ }));
+ }
+ }
+ else
+ {
+ vec_foreach (si, sorted_sis)
+ {
+ vlib_cli_output (vm, "%U\n", format_vnet_sw_interface, vnm, si);
+ }
+ }
+
+ done:
+ vec_free (sorted_sis);
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_sw_interfaces_command, static) = {
+ .path = "show interfaces",
+ .short_help = "show interfaces [address|addr] [<if-name1> <if-name2> ...]",
+ .function = show_sw_interfaces,
+};
+
+/* Root of all interface commands. */
+VLIB_CLI_COMMAND (vnet_cli_interface_command, static) = {
+ .path = "interface",
+ .short_help = "Interface commands",
+};
+
+VLIB_CLI_COMMAND (vnet_cli_set_interface_command, static) = {
+ .path = "set interface",
+ .short_help = "Interface commands",
+};
+
+static clib_error_t *
+clear_interface_counters (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vlib_simple_counter_main_t * sm;
+ vlib_combined_counter_main_t * cm;
+ static vnet_main_t ** my_vnet_mains;
+ int i, j, n_counters;
+
+ vec_reset_length (my_vnet_mains);
+
+ for (i = 0; i < vec_len (vnet_mains); i++)
+ {
+ if (vnet_mains[i])
+ vec_add1 (my_vnet_mains, vnet_mains[i]);
+ }
+
+  if (vec_len (my_vnet_mains) == 0)
+ vec_add1 (my_vnet_mains, vnm);
+
+ n_counters = vec_len (im->combined_sw_if_counters);
+
+ for (j = 0; j < n_counters; j++)
+ {
+ for (i = 0; i < vec_len(my_vnet_mains); i++)
+ {
+ im = &my_vnet_mains[i]->interface_main;
+ cm = im->combined_sw_if_counters + j;
+ vlib_clear_combined_counters (cm);
+ }
+ }
+
+ n_counters = vec_len (im->sw_if_counters);
+
+ for (j = 0; j < n_counters; j++)
+ {
+ for (i = 0; i < vec_len(my_vnet_mains); i++)
+ {
+ im = &my_vnet_mains[i]->interface_main;
+ sm = im->sw_if_counters + j;
+ vlib_clear_simple_counters (sm);
+ }
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (clear_interface_counters_command, static) = {
+ .path = "clear interfaces",
+ .short_help = "Clear interfaces statistics",
+ .function = clear_interface_counters,
+};
+
+// The following subinterface syntax is supported. The first two forms are
+// for backward compatibility:
+//
+// <intf-name> <id>
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a single dot1q vlan with vlan id <id> and exact-match semantics.
+//
+// <intf-name> <min_id>-<max_id>
+// - a set of the above subinterfaces, repeating for each id
+// in the range <min_id> to <max_id>
+//
+// In the following, exact-match semantics (i.e. the number of vlan tags on the
+// packet must match the number of tags in the configuration) are used only if
+// the keyword exact-match is present. Non-exact match is the default.
+//
+// <intf-name> <id> dot1q <outer_id> [exact-match]
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a single dot1q vlan with vlan id <outer_id>.
+//
+// <intf-name> <id> dot1q any [exact-match]
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a single dot1q vlan with any vlan id.
+//
+// <intf-name> <id> dot1q <outer_id> inner-dot1q <inner_id> [exact-match]
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a double dot1q vlan with outer vlan id <outer_id> and inner vlan id
+// <inner_id>.
+//
+// <intf-name> <id> dot1q <outer_id> inner-dot1q any [exact-match]
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a double dot1q vlan with outer vlan id <id> and any inner vlan id.
+//
+// <intf-name> <id> dot1q any inner-dot1q any [exact-match]
+//
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a double dot1q vlan with any outer vlan id and any inner vlan id.
+//
+// For each of the above CLI, there is a duplicate that uses the keyword
+// "dot1ad" in place of the first "dot1q". These interfaces use ethertype
+// 0x88a8 in place of 0x8100 for the outer ethertype. Note that for double-
+// tagged packets the inner ethertype is always 0x8100. Also note that
+// the dot1q and dot1ad naming spaces are independent, so it is legal to
+// have both "Gig3/0/0.1 dot1q 100" and "Gig3/0/0.2 dot1ad 100". For example:
+//
+// <intf-name> <id> dot1ad <outer_id> inner-dot1q <inner_id> [exact-match]
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// is a double dot1ad vlan with outer vlan id <outer_id> and inner vlan
+// id <inner_id>.
+//
+// <intf-name> <id> untagged
+// - a subinterface with the name <intf-name>.<id>. The subinterface
+// has no vlan tags. Only one can be specified per interface.
+//
+// <intf-name> <id> default
+// - a subinterface with the name <intf-name>.<id>. This is associated
+// with a packet that did not match any other configured subinterface
+// on this interface. Only one can be specified per interface.
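+//
+// Examples (interface names are illustrative):
+//
+// create sub-interface GigabitEthernet2/0/0 100 dot1q 200 exact-match
+// - creates GigabitEthernet2/0/0.100, matching packets carrying exactly
+// one dot1q tag with vlan id 200.
+//
+// create sub-interface GigabitEthernet2/0/0 200 dot1ad 300 inner-dot1q any
+// - creates GigabitEthernet2/0/0.200, matching dot1ad packets with outer
+// vlan id 300 and any inner dot1q vlan id.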
+
+
+static clib_error_t *
+parse_vlan_sub_interfaces (unformat_input_t * input,
+ vnet_sw_interface_t * template)
+{
+ clib_error_t * error = 0;
+ u32 inner_vlan, outer_vlan;
+
+ if (unformat (input, "any inner-dot1q any")) {
+ template->sub.eth.flags.two_tags = 1;
+ template->sub.eth.flags.outer_vlan_id_any = 1;
+ template->sub.eth.flags.inner_vlan_id_any = 1;
+ } else if (unformat (input, "any")) {
+ template->sub.eth.flags.one_tag = 1;
+ template->sub.eth.flags.outer_vlan_id_any = 1;
+ } else if (unformat (input, "%d inner-dot1q any", &outer_vlan)) {
+ template->sub.eth.flags.two_tags = 1;
+ template->sub.eth.flags.inner_vlan_id_any = 1;
+ template->sub.eth.outer_vlan_id = outer_vlan;
+ } else if (unformat (input, "%d inner-dot1q %d", &outer_vlan, &inner_vlan)) {
+ template->sub.eth.flags.two_tags = 1;
+ template->sub.eth.outer_vlan_id = outer_vlan;
+ template->sub.eth.inner_vlan_id = inner_vlan;
+ } else if (unformat (input, "%d", &outer_vlan)) {
+ template->sub.eth.flags.one_tag = 1;
+ template->sub.eth.outer_vlan_id = outer_vlan;
+ } else {
+ error = clib_error_return (0, "expected dot1q config, got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "exact-match")) {
+ template->sub.eth.flags.exact_match = 1;
+ }
+ }
+
+ done:
+ return error;
+}
+
+static clib_error_t *
+create_sub_interfaces (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 hw_if_index, sw_if_index;
+ vnet_hw_interface_t * hi;
+ u32 id, id_min, id_max;
+ vnet_sw_interface_t template;
+
+ hw_if_index = ~0;
+ if (! unformat_user (input, unformat_vnet_hw_interface, vnm, &hw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ memset (&template, 0, sizeof (template));
+ template.sub.eth.raw_flags = 0;
+
+ if (unformat (input, "%d default", &id_min)) {
+ id_max = id_min;
+ template.sub.eth.flags.default_sub = 1;
+ } else if (unformat (input, "%d untagged", &id_min)) {
+ id_max = id_min;
+ template.sub.eth.flags.no_tags = 1;
+ template.sub.eth.flags.exact_match = 1;
+ } else if (unformat (input, "%d dot1q", &id_min)) {
+ // parse dot1q config
+ id_max = id_min;
+ error = parse_vlan_sub_interfaces(input, &template);
+ if (error) goto done;
+ } else if (unformat (input, "%d dot1ad", &id_min)) {
+ // parse dot1ad config
+ id_max = id_min;
+ template.sub.eth.flags.dot1ad = 1;
+ error = parse_vlan_sub_interfaces(input, &template);
+ if (error) goto done;
+ } else if (unformat (input, "%d-%d", &id_min, &id_max)) {
+ template.sub.eth.flags.one_tag = 1;
+ template.sub.eth.outer_vlan_id = id_min;
+ template.sub.eth.flags.exact_match = 1;
+ if (id_min > id_max)
+ goto id_error;
+ } else if (unformat (input, "%d", &id_min)) {
+ id_max = id_min;
+ template.sub.eth.flags.one_tag = 1;
+ template.sub.eth.outer_vlan_id = id_min;
+ template.sub.eth.flags.exact_match = 1;
+ } else {
+ id_error:
+ error = clib_error_return (0, "expected ID or ID MIN-MAX, got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ /*
+ if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ error = clib_error_return (0, "unexpected text `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ */
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ for (id = id_min; id <= id_max; id++)
+ {
+ uword * p;
+ vnet_interface_main_t * im = &vnm->interface_main;
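+      /* Key packs the sup sw_if_index into the high 32 bits and the sub id
+         into the low 32, matching the sw_if_index_by_sup_and_sub hash. */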
+ u64 sup_and_sub_key = ((u64)(hi->sw_if_index) << 32) |
+ (u64) id;
+ u64 * kp;
+
+ p = hash_get_mem (im->sw_if_index_by_sup_and_sub, &sup_and_sub_key);
+ if (p)
+ {
+ if (CLIB_DEBUG > 0)
+ clib_warning ("sup sw_if_index %d, sub id %d already exists\n",
+ hi->sw_if_index, id);
+ continue;
+ }
+
+ kp = clib_mem_alloc (sizeof (*kp));
+ *kp = sup_and_sub_key;
+
+ template.type = VNET_SW_INTERFACE_TYPE_SUB;
+ template.sup_sw_if_index = hi->sw_if_index;
+ template.sub.id = id;
+ error = vnet_create_sw_interface (vnm, &template, &sw_if_index);
+ if (error) goto done;
+ hash_set (hi->sub_interface_sw_if_index_by_id, id, sw_if_index);
+ hash_set_mem (im->sw_if_index_by_sup_and_sub, kp, sw_if_index);
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (create_sub_interfaces_command, static) = {
+ .path = "create sub-interface",
+ .short_help = "create sub-interfaces <nn>[-<nn>] [dot1q|dot1ad|default|untagged]",
+ .function = create_sub_interfaces,
+};
+
+static clib_error_t *
+set_state (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error;
+ u32 sw_if_index, flags;
+
+ sw_if_index = ~0;
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (! unformat (input, "%U", unformat_vnet_sw_interface_flags, &flags))
+ {
+ error = clib_error_return (0, "unknown flags `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ error = vnet_sw_interface_set_flags (vnm, sw_if_index, flags);
+ if (error)
+ goto done;
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_state_command, static) = {
+ .path = "set interface state",
+ .short_help = "Set interface state",
+ .function = set_state,
+};
+
+static clib_error_t *
+set_unnumbered (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 unnumbered_sw_if_index;
+ u32 inherit_from_sw_if_index;
+ vnet_sw_interface_t * si;
+ int is_set = 0;
+ int is_del = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+
+ if (unformat (input, "%U use %U",
+ unformat_vnet_sw_interface, vnm, &unnumbered_sw_if_index,
+ unformat_vnet_sw_interface, vnm, &inherit_from_sw_if_index))
+ is_set = 1;
+ else if (unformat (input, "del %U",
+ unformat_vnet_sw_interface,
+ vnm, &unnumbered_sw_if_index))
+ is_del = 1;
+ else
+ {
+ if (is_set || is_del)
+ break;
+ else
+ return clib_error_return
+ (0, "parse error '%U'", format_unformat_error, input);
+ }
+ }
+
+ si = vnet_get_sw_interface (vnm, unnumbered_sw_if_index);
+ if (is_del) {
+ si->flags &= ~(VNET_SW_INTERFACE_FLAG_UNNUMBERED);
+ si->unnumbered_sw_if_index = (u32)~0;
+ } else {
+ si->flags |= VNET_SW_INTERFACE_FLAG_UNNUMBERED;
+ si->unnumbered_sw_if_index = inherit_from_sw_if_index;
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_unnumbered_command, static) = {
+ .path = "set interface unnumbered",
+ .short_help = "set interface unnumbered [<intfc> use <intfc>][del <intfc>]",
+ .function = set_unnumbered,
+};
+
+
+
+static clib_error_t *
+set_hw_class (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ clib_error_t * error;
+ u32 hw_if_index, hw_class_index;
+
+ hw_if_index = ~0;
+ if (! unformat_user (input, unformat_vnet_hw_interface, vnm, &hw_if_index))
+ {
+ error = clib_error_return (0, "unknown hardware interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (! unformat_user (input, unformat_hash_string,
+ im->hw_interface_class_by_name, &hw_class_index))
+ {
+ error = clib_error_return (0, "unknown hardware class `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ error = vnet_hw_interface_set_class (vnm, hw_if_index, hw_class_index);
+ if (error)
+ goto done;
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_hw_class_command, static) = {
+ .path = "set interface hw-class",
+ .short_help = "Set interface hardware class",
+ .function = set_hw_class,
+};
+
+static clib_error_t * vnet_interface_cli_init (vlib_main_t * vm)
+{ return 0; }
+
+VLIB_INIT_FUNCTION (vnet_interface_cli_init);
+
+static clib_error_t *
+renumber_interface_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 hw_if_index;
+ u32 new_dev_instance;
+ vnet_main_t * vnm = vnet_get_main();
+ int rv;
+
+ if (! unformat_user (input, unformat_vnet_hw_interface, vnm, &hw_if_index))
+ return clib_error_return (0, "unknown hardware interface `%U'",
+ format_unformat_error, input);
+
+ if (! unformat (input, "%d", &new_dev_instance))
+ return clib_error_return (0, "new dev instance missing");
+
+ rv = vnet_interface_name_renumber (hw_if_index, new_dev_instance);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ default:
+ return clib_error_return (0, "vnet_interface_name_renumber returned %d",
+ rv);
+
+ }
+
+ return 0;
+}
+
+
+VLIB_CLI_COMMAND (renumber_interface_command, static) = {
+ .path = "renumber interface",
+ .short_help = "renumber interface <if-name> <new-dev-instance>",
+ .function = renumber_interface_command_fn,
+};
+
diff --git a/vnet/vnet/interface_format.c b/vnet/vnet/interface_format.c
new file mode 100644
index 00000000000..56faea27217
--- /dev/null
+++ b/vnet/vnet/interface_format.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * interface_format.c: interface formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+
+u8 * format_vnet_sw_interface_flags (u8 * s, va_list * args)
+{
+ u32 flags = va_arg (*args, u32);
+
+ s = format (s, "%s", (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? "up" : "down");
+ if (flags & VNET_SW_INTERFACE_FLAG_PUNT)
+ s = format (s, "/punt");
+
+ return s;
+}
+
+u8 * format_vnet_hw_interface (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ vnet_hw_interface_t * hi = va_arg (*args, vnet_hw_interface_t *);
+ vnet_hw_interface_class_t * hw_class;
+ vnet_device_class_t * dev_class;
+ int verbose = va_arg (*args, int);
+ uword indent;
+
+ if (! hi)
+ return format (s, "%=32s%=6s%=8s%s",
+ "Name", "Idx", "Link", "Hardware");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "%-32v%=6d%=8s",
+ hi->name, hi->hw_if_index,
+ hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP ? "up" : "down");
+
+ hw_class = vnet_get_hw_interface_class (vnm, hi->hw_class_index);
+ dev_class = vnet_get_device_class (vnm, hi->dev_class_index);
+
+ if (dev_class->format_device_name)
+ s = format (s, "%U", dev_class->format_device_name, hi->dev_instance);
+ else
+ s = format (s, "%s%d", dev_class->name, hi->dev_instance);
+
+ if (verbose)
+ {
+ if (hw_class->format_device)
+ s = format (s, "\n%U%U",
+ format_white_space, indent + 2,
+ hw_class->format_device, hi->hw_if_index, verbose);
+ else
+ {
+ s = format (s, "\n%U%s",
+ format_white_space, indent + 2,
+ hw_class->name);
+ if (hw_class->format_address && vec_len (hi->hw_address) > 0)
+ s = format (s, " address %U", hw_class->format_address, hi->hw_address);
+ }
+
+ if (dev_class->format_device)
+ s = format (s, "\n%U%U",
+ format_white_space, indent + 2,
+ dev_class->format_device, hi->dev_instance, verbose);
+ }
+
+ return s;
+}
+
+u8 * format_vnet_sw_interface_name (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ vnet_sw_interface_t * si = va_arg (*args, vnet_sw_interface_t *);
+ vnet_sw_interface_t * si_sup = vnet_get_sup_sw_interface (vnm, si->sw_if_index);
+ vnet_hw_interface_t * hi_sup;
+
+ ASSERT (si_sup->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ hi_sup = vnet_get_hw_interface (vnm, si_sup->hw_if_index);
+
+ s = format (s, "%v", hi_sup->name);
+
+ if (si->type != VNET_SW_INTERFACE_TYPE_HARDWARE)
+ s = format (s, ".%d", si->sub.id);
+
+ return s;
+}
+
+u8 * format_vnet_sw_if_index_name (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ u32 sw_if_index = va_arg (*args, u32);
+ return format (s, "%U",
+ format_vnet_sw_interface_name, vnm,
+ vnet_get_sw_interface (vnm, sw_if_index));
+}
+
+u8 * format_vnet_sw_interface (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ vnet_sw_interface_t * si = va_arg (*args, vnet_sw_interface_t *);
+ vnet_interface_main_t * im = &vnm->interface_main;
+ uword indent, n_printed;
+ int i, j, n_counters;
+ static vnet_main_t ** my_vnet_mains;
+
+ if (! si)
+ return format (s, "%=32s%=5s%=16s%=16s%=16s",
+ "Name", "Idx", "State", "Counter", "Count");
+
+ s = format (s, "%-32U%=5d%=16U",
+ format_vnet_sw_interface_name, vnm, si, si->sw_if_index,
+ format_vnet_sw_interface_flags, si->flags);
+
+ vec_reset_length (my_vnet_mains);
+
+ indent = format_get_indent (s);
+ n_printed = 0;
+
+ {
+ vlib_combined_counter_main_t * cm;
+ vlib_counter_t v, vtotal;
+ u8 * n = 0;
+
+ for (i = 0; i < vec_len (vnet_mains); i++)
+ {
+ if (vnet_mains[i])
+ vec_add1 (my_vnet_mains, vnet_mains[i]);
+ }
+
+ if (vec_len(my_vnet_mains) == 0)
+ vec_add1 (my_vnet_mains, &vnet_main);
+
+ /* Each vnet_main_t has its own copy of the interface counters */
+ n_counters = vec_len (im->combined_sw_if_counters);
+
+ /* rx, tx counters... */
+ for (j = 0; j < n_counters; j++)
+ {
+ vtotal.packets = 0;
+ vtotal.bytes = 0;
+
+ for (i = 0; i < vec_len(my_vnet_mains); i++)
+ {
+ im = &my_vnet_mains[i]->interface_main;
+ cm = im->combined_sw_if_counters + j;
+ vlib_get_combined_counter (cm, si->sw_if_index, &v);
+ vtotal.packets += v.packets;
+ vtotal.bytes += v.bytes;
+ }
+
+ /* Only display non-zero counters. */
+ if (vtotal.packets == 0)
+ continue;
+
+ if (n_printed > 0)
+ s = format (s, "\n%U", format_white_space, indent);
+ n_printed += 2;
+
+ if (n)
+ _vec_len (n) = 0;
+ n = format (n, "%s packets", cm->name);
+ s = format (s, "%-16v%16Ld", n, vtotal.packets);
+
+ _vec_len (n) = 0;
+ n = format (n, "%s bytes", cm->name);
+ s = format (s, "\n%U%-16v%16Ld",
+ format_white_space, indent,
+ n, vtotal.bytes);
+ }
+ vec_free (n);
+ }
+
+ {
+ vlib_simple_counter_main_t * cm;
+ u64 v, vtotal ;
+
+ n_counters = vec_len (im->sw_if_counters);
+
+ for (j = 0; j < n_counters; j++)
+ {
+ vtotal = 0;
+
+ for (i = 0; i < vec_len(my_vnet_mains); i++)
+ {
+ im = &my_vnet_mains[i]->interface_main;
+ cm = im->sw_if_counters + j;
+
+ v = vlib_get_simple_counter (cm, si->sw_if_index);
+ vtotal += v;
+ }
+
+ /* Only display non-zero counters. */
+ if (vtotal == 0)
+ continue;
+
+ if (n_printed > 0)
+ s = format (s, "\n%U", format_white_space, indent);
+ n_printed += 1;
+
+ s = format (s, "%-16s%16Ld", cm->name, vtotal);
+ }
+ }
+
+ return s;
+}
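+
+/*
+ * Note on the loops above: every registered vnet_main_t keeps its own
+ * copy of the interface counters, so both the combined (packets/bytes)
+ * and the simple counters are summed across all of them before
+ * printing, and only non-zero totals are shown.
+ */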
+
+uword unformat_vnet_hw_interface (unformat_input_t * input, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ u32 * hw_if_index = va_arg (*args, u32 *);
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_device_class_t * c;
+
+ /* Try per device class functions first. */
+ vec_foreach (c, im->device_classes)
+ {
+ if (c->unformat_device_name
+ && unformat_user (input, c->unformat_device_name, hw_if_index))
+ return 1;
+ }
+
+ return unformat_user (input, unformat_hash_vec_string,
+ im->hw_interface_by_name, hw_if_index);
+}
+
+uword unformat_vnet_sw_interface (unformat_input_t * input, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ u32 * result = va_arg (*args, u32 *);
+ vnet_hw_interface_t * hi;
+ u32 hw_if_index, id, id_specified;
+ u8 * if_name = 0;
+ uword * p, error = 0;
+
+ id = ~0;
+ if (unformat (input, "%_%v.%d%_", &if_name, &id)
+ && ((p = hash_get (vnm->interface_main.hw_interface_by_name, if_name))))
+ {
+ hw_if_index = p[0];
+ id_specified = 1;
+ }
+ else if (unformat (input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index))
+ id_specified = 0;
+ else
+ goto done;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ if (! id_specified)
+ {
+ *result = hi->sw_if_index;
+ }
+ else
+ {
+ if (! (p = hash_get (hi->sub_interface_sw_if_index_by_id, id)))
+ return 0;
+ *result = p[0];
+ }
+ error = 1;
+ done:
+ vec_free (if_name);
+ return error;
+}
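+
+/*
+ * Illustrative usage (a sketch, not part of this change): CLI and API
+ * handlers typically parse interface names through "%U":
+ *
+ *   u32 sw_if_index;
+ *   if (unformat (input, "%U", unformat_vnet_sw_interface, vnm,
+ *                 &sw_if_index))
+ *     ... sw_if_index names either the hardware interface itself or,
+ *         for "<name>.<id>" input, the matching sub-interface ...
+ */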
+
+uword unformat_vnet_sw_interface_flags (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 flags = 0;
+
+ if (unformat (input, "up"))
+ flags |= VNET_SW_INTERFACE_FLAG_ADMIN_UP;
+ else if (unformat (input, "down"))
+ flags &= ~VNET_SW_INTERFACE_FLAG_ADMIN_UP;
+ else if (unformat (input, "punt"))
+ flags |= VNET_SW_INTERFACE_FLAG_PUNT;
+ else if (unformat (input, "enable"))
+ flags &= ~VNET_SW_INTERFACE_FLAG_PUNT;
+ else
+ return 0;
+
+ *result = flags;
+ return 1;
+}
+
+uword unformat_vnet_hw_interface_flags (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 flags = 0;
+
+ if (unformat (input, "up"))
+ flags |= VNET_HW_INTERFACE_FLAG_LINK_UP;
+ else if (unformat (input, "down"))
+ flags &= ~VNET_HW_INTERFACE_FLAG_LINK_UP;
+ else
+ return 0;
+
+ *result = flags;
+ return 1;
+}
+
diff --git a/vnet/vnet/interface_funcs.h b/vnet/vnet/interface_funcs.h
new file mode 100644
index 00000000000..f3d3247fd5d
--- /dev/null
+++ b/vnet/vnet/interface_funcs.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * interface_funcs.h: VNET interfaces/sub-interfaces exported functions
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_interface_funcs_h
+#define included_vnet_interface_funcs_h
+
+always_inline vnet_hw_interface_t *
+vnet_get_hw_interface (vnet_main_t * vnm, u32 hw_if_index)
+{ return pool_elt_at_index (vnm->interface_main.hw_interfaces, hw_if_index); }
+
+always_inline vnet_sw_interface_t *
+vnet_get_sw_interface (vnet_main_t * vnm, u32 sw_if_index)
+{ return pool_elt_at_index (vnm->interface_main.sw_interfaces, sw_if_index); }
+
+always_inline vnet_sw_interface_t *
+vnet_get_hw_sw_interface (vnet_main_t * vnm, u32 hw_if_index)
+{
+ vnet_hw_interface_t * hw = vnet_get_hw_interface (vnm, hw_if_index);
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, hw->sw_if_index);
+ ASSERT (sw->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ return sw;
+}
+
+always_inline vnet_sw_interface_t *
+vnet_get_sup_sw_interface (vnet_main_t * vnm, u32 sw_if_index)
+{
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, sw_if_index);
+ if (sw->type == VNET_SW_INTERFACE_TYPE_SUB)
+ sw = vnet_get_sw_interface (vnm, sw->sup_sw_if_index);
+ return sw;
+}
+
+always_inline vnet_hw_interface_t *
+vnet_get_sup_hw_interface (vnet_main_t * vnm, u32 sw_if_index)
+{
+ vnet_sw_interface_t * sw = vnet_get_sup_sw_interface (vnm, sw_if_index);
+ ASSERT (sw->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ return vnet_get_hw_interface (vnm, sw->hw_if_index);
+}
+
+always_inline vnet_hw_interface_class_t *
+vnet_get_hw_interface_class (vnet_main_t * vnm, u32 hw_class_index)
+{ return vec_elt_at_index (vnm->interface_main.hw_interface_classes, hw_class_index); }
+
+always_inline vnet_device_class_t *
+vnet_get_device_class (vnet_main_t * vnm, u32 dev_class_index)
+{ return vec_elt_at_index (vnm->interface_main.device_classes, dev_class_index); }
+
+/* Register a hardware interface instance. */
+u32 vnet_register_interface (vnet_main_t * vnm,
+ u32 dev_class_index,
+ u32 dev_instance,
+ u32 hw_class_index,
+ u32 hw_instance);
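+
+/*
+ * Illustrative usage (a sketch; the class names are hypothetical): a
+ * device driver registers each instance it discovers and receives the
+ * new hw_if_index in return:
+ *
+ *   u32 hw_if_index = vnet_register_interface
+ *     (vnm, my_device_class.index, dev_instance,
+ *      my_hw_class.index, hw_instance);
+ */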
+
+/* Create a software interface from the given template. */
+clib_error_t *
+vnet_create_sw_interface (vnet_main_t * vnm, vnet_sw_interface_t * template, u32 * sw_if_index);
+
+void vnet_delete_hw_interface (vnet_main_t * vnm, u32 hw_if_index);
+void vnet_delete_sw_interface (vnet_main_t * vnm, u32 sw_if_index);
+
+always_inline uword
+vnet_sw_interface_get_flags (vnet_main_t * vnm, u32 sw_if_index)
+{
+ vnet_sw_interface_t * sw = vnet_get_sw_interface (vnm, sw_if_index);
+ return sw->flags;
+}
+
+always_inline uword
+vnet_sw_interface_is_admin_up (vnet_main_t * vnm, u32 sw_if_index)
+{ return (vnet_sw_interface_get_flags (vnm, sw_if_index) & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; }
+
+always_inline uword
+vnet_hw_interface_get_flags (vnet_main_t * vnm, u32 hw_if_index)
+{
+ vnet_hw_interface_t * hw = vnet_get_hw_interface (vnm, hw_if_index);
+ return hw->flags;
+}
+
+always_inline uword
+vnet_hw_interface_is_link_up (vnet_main_t * vnm, u32 hw_if_index)
+{ return (vnet_hw_interface_get_flags (vnm, hw_if_index) & VNET_HW_INTERFACE_FLAG_LINK_UP) != 0; }
+
+always_inline vlib_frame_t *
+vnet_get_frame_to_sw_interface (vnet_main_t * vnm, u32 sw_if_index)
+{
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ return vlib_get_frame_to_node (vnm->vlib_main, hw->output_node_index);
+}
+
+always_inline void
+vnet_put_frame_to_sw_interface (vnet_main_t * vnm, u32 sw_if_index, vlib_frame_t * f)
+{
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ return vlib_put_frame_to_node (vnm->vlib_main, hw->output_node_index, f);
+}
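+
+/*
+ * Illustrative usage (a sketch, not part of this change): the two
+ * helpers above pair up; get the interface's output frame, fill in
+ * buffer indices, then hand the frame back for dispatch:
+ *
+ *   vlib_frame_t *f = vnet_get_frame_to_sw_interface (vnm, sw_if_index);
+ *   u32 *to_next = vlib_frame_vector_args (f);
+ *   to_next[0] = bi;
+ *   f->n_vectors = 1;
+ *   vnet_put_frame_to_sw_interface (vnm, sw_if_index, f);
+ */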
+
+/* Change hardware interface flags (e.g. link up/down). */
+clib_error_t *
+vnet_hw_interface_set_flags (vnet_main_t * vnm, u32 hw_if_index, u32 flags);
+
+/* Change software interface flags (e.g. admin up/down, punt). */
+clib_error_t *
+vnet_sw_interface_set_flags (vnet_main_t * vnm, u32 sw_if_index, u32 flags);
+
+/* Change interface class. */
+clib_error_t *
+vnet_hw_interface_set_class (vnet_main_t * vnm, u32 hw_if_index, u32 new_hw_class_index);
+
+/* Redirect rx pkts to node */
+int vnet_hw_interface_rx_redirect_to_node (vnet_main_t * vnm, u32 hw_if_index,
+ u32 node_index);
+
+void vnet_hw_interface_init_for_class (vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index, u32 hw_instance);
+
+/* Formats sw/hw interface. */
+format_function_t format_vnet_hw_interface;
+format_function_t format_vnet_sw_interface;
+format_function_t format_vnet_sw_interface_name;
+format_function_t format_vnet_sw_if_index_name;
+format_function_t format_vnet_sw_interface_flags;
+
+/* Parses sw/hw interface name -> index. */
+unformat_function_t unformat_vnet_sw_interface;
+unformat_function_t unformat_vnet_hw_interface;
+
+/* Parses interface flags (up, down, enable, disable, etc.) */
+unformat_function_t unformat_vnet_hw_interface_flags;
+unformat_function_t unformat_vnet_sw_interface_flags;
+
+/* Node runtime for interface output function. */
+typedef struct {
+ u32 hw_if_index;
+ u32 sw_if_index;
+ u32 dev_instance;
+ u32 is_deleted;
+} vnet_interface_output_runtime_t;
+
+/* Interface output functions. */
+uword
+vnet_interface_output_node (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame);
+uword
+vnet_interface_output_node_no_flatten (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame);
+
+word vnet_sw_interface_compare (vnet_main_t * vnm, uword sw_if_index0, uword sw_if_index1);
+word vnet_hw_interface_compare (vnet_main_t * vnm, uword hw_if_index0, uword hw_if_index1);
+
+typedef enum {
+#define _(sym,str) VNET_INTERFACE_OUTPUT_NEXT_##sym,
+ foreach_intf_output_feat
+#undef _
+ VNET_INTERFACE_OUTPUT_NEXT_DROP,
+ VNET_INTERFACE_OUTPUT_NEXT_TX,
+} vnet_interface_output_next_t;
+
+typedef enum {
+ VNET_INTERFACE_TX_NEXT_DROP,
+ VNET_INTERFACE_TX_N_NEXT,
+} vnet_interface_tx_next_t;
+
+typedef enum {
+ VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DOWN,
+ VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DELETED,
+} vnet_interface_output_error_t;
+
+/* Format for interface output traces. */
+u8 * format_vnet_interface_output_trace (u8 * s, va_list * va);
+
+serialize_function_t serialize_vnet_interface_state, unserialize_vnet_interface_state;
+
+#endif /* included_vnet_interface_funcs_h */
diff --git a/vnet/vnet/interface_output.c b/vnet/vnet/interface_output.c
new file mode 100644
index 00000000000..84dc0392a51
--- /dev/null
+++ b/vnet/vnet/interface_output.c
@@ -0,0 +1,1311 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * interface_output.c: interface output node
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+
+typedef struct {
+ u32 sw_if_index;
+ u8 data[64 - sizeof (u32)];
+} interface_output_trace_t;
+
+u8 * format_vnet_interface_output_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ vlib_node_t * node = va_arg (*va, vlib_node_t *);
+ interface_output_trace_t * t = va_arg (*va, interface_output_trace_t *);
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_sw_interface_t * si;
+ uword indent;
+
+ if (t->sw_if_index != (u32)~0)
+ {
+ si = vnet_get_sw_interface (vnm, t->sw_if_index);
+ indent = format_get_indent (s);
+
+ s = format (s, "%U\n%U%U",
+ format_vnet_sw_interface_name, vnm, si,
+ format_white_space, indent,
+ node->format_buffer ? node->format_buffer : format_hex_bytes,
+ t->data, sizeof (t->data));
+ }
+ return s;
+}
+
+static void
+vnet_interface_output_trace (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ uword n_buffers)
+{
+ u32 n_left, * from;
+
+ n_left = n_buffers;
+ from = vlib_frame_args (frame);
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ interface_output_trace_t * t0, * t1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+ memcpy (t0->data, vlib_buffer_get_current (b0),
+ sizeof (t0->data));
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+ t1->sw_if_index = vnet_buffer (b1)->sw_if_index[VLIB_TX];
+ memcpy (t1->data, vlib_buffer_get_current (b1),
+ sizeof (t1->data));
+ }
+ from += 2;
+ n_left -= 2;
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ interface_output_trace_t * t0;
+
+ bi0 = from[0];
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_TX];
+ memcpy (t0->data, vlib_buffer_get_current (b0),
+ sizeof (t0->data));
+ }
+ from += 1;
+ n_left -= 1;
+ }
+}
+
+static never_inline u32
+slow_path (vlib_main_t * vm,
+ u32 bi,
+ vlib_buffer_t * b,
+ u32 n_left_to_tx,
+ u32 * to_tx,
+ u32 * n_slow_bytes_result)
+{
+  /* Enqueue the whole buffer chain, counting buffers and bytes. */
+ u32 n_buffers = 0;
+ u32 n_slow_bytes = 0;
+
+ while (n_left_to_tx > 0)
+ {
+ to_tx[0] = bi;
+ to_tx += 1;
+ n_left_to_tx -= 1;
+ n_buffers += 1;
+ n_slow_bytes += vlib_buffer_length_in_chain (vm, b);
+
+ /* Be grumpy about zero length buffers for benefit of
+ driver tx function. */
+ ASSERT (b->current_length > 0);
+
+ if (! (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ break;
+
+ bi = b->next_buffer;
+ b = vlib_get_buffer (vm, bi);
+ }
+
+ /* Ran out of space in next frame trying to enqueue buffers? */
+ if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ return 0;
+
+ *n_slow_bytes_result = n_slow_bytes;
+ return n_buffers;
+}
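+
+/*
+ * Note: slow_path() copies every buffer index of a chained packet into
+ * the tx vector.  A return of 0 means the remaining frame space cannot
+ * hold the whole chain; the caller then puts the current frame and
+ * retries the packet in a fresh one.
+ */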
+
+/*
+ * Increment TX stats. Roll up consecutive increments to the same sw_if_index
+ * into one increment.
+ */
+static_always_inline
+void incr_output_stats (vnet_main_t * vnm,
+ u32 cpu_index,
+ u32 length,
+ u32 sw_if_index,
+ u32 * last_sw_if_index,
+ u32 * n_packets,
+ u32 * n_bytes) {
+ vnet_interface_main_t * im;
+
+ if (PREDICT_TRUE (sw_if_index == *last_sw_if_index)) {
+ *n_packets += 1;
+ *n_bytes += length;
+ } else {
+ if (PREDICT_TRUE (*last_sw_if_index != ~0)) {
+ im = &vnm->interface_main;
+
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_TX,
+ cpu_index,
+ *last_sw_if_index,
+ *n_packets,
+ *n_bytes);
+ }
+ *last_sw_if_index = sw_if_index;
+ *n_packets = 1;
+ *n_bytes = length;
+ }
+}
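+
+/*
+ * Worked example (illustrative): for a burst destined to interfaces
+ * [if1, if1, if2], the two if1 packets accumulate in *n_packets /
+ * *n_bytes and are committed with a single
+ * vlib_increment_combined_counter() call when if2 shows up; the
+ * trailing if2 tally is flushed by a final call with sw_if_index = ~0.
+ */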
+
+
+/* Interface output functions. */
+uword
+vnet_interface_output_node (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_output_runtime_t * rt = (void *) node->runtime_data;
+ vnet_sw_interface_t * si;
+ vnet_hw_interface_t * hi;
+ u32 n_left_to_tx, * from, * from_end, * to_tx;
+ u32 n_bytes, n_buffers, n_packets;
+ u32 last_sw_if_index;
+ u32 cpu_index = vm->cpu_index;
+
+ n_buffers = frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vnet_interface_output_trace (vm, node, frame, n_buffers);
+
+ from = vlib_frame_args (frame);
+
+ if (rt->is_deleted)
+ return vlib_error_drop_buffers (vm, node,
+ from,
+ /* buffer stride */ 1,
+ n_buffers,
+ VNET_INTERFACE_OUTPUT_NEXT_DROP,
+ node->node_index,
+ VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DELETED);
+
+ si = vnet_get_sw_interface (vnm, rt->sw_if_index);
+ hi = vnet_get_sup_hw_interface (vnm, rt->sw_if_index);
+ if (! (si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ||
+ ! (hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP))
+ {
+ vlib_simple_counter_main_t * cm;
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+ vlib_increment_simple_counter (cm, cpu_index,
+ rt->sw_if_index, n_buffers);
+ return vlib_error_drop_buffers (vm, node,
+ from,
+ /* buffer stride */ 1,
+ n_buffers,
+ VNET_INTERFACE_OUTPUT_NEXT_DROP,
+ node->node_index,
+ VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DOWN);
+ }
+
+ from_end = from + n_buffers;
+
+ /* Total byte count of all buffers. */
+ n_bytes = 0;
+ n_packets = 0;
+ last_sw_if_index = ~0;
+
+ while (from < from_end)
+ {
+      /* Get a new next frame since the previous incomplete frame may have
+	 fewer than VNET_FRAME_SIZE vectors in it. */
+ vlib_get_new_next_frame (vm, node, VNET_INTERFACE_OUTPUT_NEXT_TX,
+ to_tx, n_left_to_tx);
+
+ while (from + 4 <= from_end && n_left_to_tx >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_tx[0] = bi0;
+ to_tx[1] = bi1;
+ from += 2;
+ to_tx += 2;
+ n_left_to_tx -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* Be grumpy about zero length buffers for benefit of
+ driver tx function. */
+ ASSERT (b0->current_length > 0);
+ ASSERT (b1->current_length > 0);
+
+ if (PREDICT_FALSE ((b0->flags | b1->flags) & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ u32 n_buffers, n_slow_bytes, i;
+
+ /* Undo. */
+ from -= 2;
+ to_tx -= 2;
+ n_left_to_tx += 2;
+
+ /* Do slow path two times. */
+ for (i = 0; i < 2; i++)
+ {
+ u32 bi = i ? bi1 : bi0;
+ vlib_buffer_t * b = i ? b1 : b0;
+
+ n_buffers = slow_path (vm, bi, b,
+ n_left_to_tx, to_tx, &n_slow_bytes);
+
+ /* Not enough room for single packet? */
+ if (n_buffers == 0)
+ goto put;
+
+ from += 1;
+ to_tx += n_buffers;
+ n_left_to_tx -= n_buffers;
+ incr_output_stats (vnm, cpu_index, n_slow_bytes,
+ vnet_buffer(b)->sw_if_index[VLIB_TX],
+ &last_sw_if_index, &n_packets, &n_bytes);
+ }
+ } else {
+ incr_output_stats (vnm, cpu_index,
+ vlib_buffer_length_in_chain (vm, b0),
+ vnet_buffer(b0)->sw_if_index[VLIB_TX],
+ &last_sw_if_index, &n_packets, &n_bytes);
+	      incr_output_stats (vnm, cpu_index,
+				 vlib_buffer_length_in_chain (vm, b1),
+				 vnet_buffer(b1)->sw_if_index[VLIB_TX],
+				 &last_sw_if_index, &n_packets, &n_bytes);
+ }
+ }
+
+ while (from + 1 <= from_end && n_left_to_tx >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+
+ bi0 = from[0];
+ to_tx[0] = bi0;
+ from += 1;
+ to_tx += 1;
+ n_left_to_tx -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* Be grumpy about zero length buffers for benefit of
+ driver tx function. */
+ ASSERT (b0->current_length > 0);
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ u32 n_buffers, n_slow_bytes;
+
+ /* Undo. */
+ from -= 1;
+ to_tx -= 1;
+ n_left_to_tx += 1;
+
+ n_buffers = slow_path (vm, bi0, b0,
+ n_left_to_tx, to_tx, &n_slow_bytes);
+
+ /* Not enough room for single packet? */
+ if (n_buffers == 0)
+ goto put;
+
+ from += 1;
+ to_tx += n_buffers;
+ n_left_to_tx -= n_buffers;
+ }
+ incr_output_stats (vnm, cpu_index,
+ vlib_buffer_length_in_chain (vm, b0),
+ vnet_buffer(b0)->sw_if_index[VLIB_TX],
+ &last_sw_if_index, &n_packets, &n_bytes);
+ }
+
+ put:
+ vlib_put_next_frame (vm, node, VNET_INTERFACE_OUTPUT_NEXT_TX, n_left_to_tx);
+ }
+
+ /* Final update of interface stats. */
+ incr_output_stats (vnm, cpu_index, 0, ~0, /* ~0 will flush stats */
+ &last_sw_if_index, &n_packets, &n_bytes);
+
+ return n_buffers;
+}
+
+
+uword
+vnet_interface_output_node_no_flatten (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_output_runtime_t * rt = (void *) node->runtime_data;
+ vnet_sw_interface_t * si;
+ vnet_hw_interface_t * hi;
+ u32 n_left_to_tx, * from, * from_end, * to_tx;
+ u32 n_bytes, n_buffers, n_packets;
+ u32 n_bytes_b0, n_bytes_b1;
+ u32 cpu_index = vm->cpu_index;
+ vnet_interface_main_t * im = &vnm->interface_main;
+ u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX;
+
+ n_buffers = frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vnet_interface_output_trace (vm, node, frame, n_buffers);
+
+ from = vlib_frame_args (frame);
+
+ if (rt->is_deleted)
+ return vlib_error_drop_buffers (vm, node,
+ from,
+ /* buffer stride */ 1,
+ n_buffers,
+ VNET_INTERFACE_OUTPUT_NEXT_DROP,
+ node->node_index,
+ VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DELETED);
+
+ si = vnet_get_sw_interface (vnm, rt->sw_if_index);
+ hi = vnet_get_sup_hw_interface (vnm, rt->sw_if_index);
+ if (! (si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ||
+ ! (hi->flags & VNET_HW_INTERFACE_FLAG_LINK_UP))
+ {
+ vlib_simple_counter_main_t * cm;
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_TX_ERROR);
+ vlib_increment_simple_counter (cm, cpu_index,
+ rt->sw_if_index, n_buffers);
+
+ return vlib_error_drop_buffers (vm, node,
+ from,
+ /* buffer stride */ 1,
+ n_buffers,
+ VNET_INTERFACE_OUTPUT_NEXT_DROP,
+ node->node_index,
+ VNET_INTERFACE_OUTPUT_ERROR_INTERFACE_DOWN);
+ }
+
+ from_end = from + n_buffers;
+
+ /* Total byte count of all buffers. */
+ n_bytes = 0;
+ n_packets = 0;
+
+ while (from < from_end)
+ {
+      /* Get a new next frame since the previous incomplete frame may have
+	 fewer than VNET_FRAME_SIZE vectors in it. */
+ vlib_get_new_next_frame (vm, node, next_index,
+ to_tx, n_left_to_tx);
+
+ while (from + 4 <= from_end && n_left_to_tx >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 tx_swif0, tx_swif1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_tx[0] = bi0;
+ to_tx[1] = bi1;
+ from += 2;
+ to_tx += 2;
+ n_left_to_tx -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* Be grumpy about zero length buffers for benefit of
+ driver tx function. */
+ ASSERT (b0->current_length > 0);
+ ASSERT (b1->current_length > 0);
+
+ n_bytes_b0 = vlib_buffer_length_in_chain (vm, b0);
+ n_bytes_b1 = vlib_buffer_length_in_chain (vm, b1);
+ tx_swif0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ tx_swif1 = vnet_buffer(b1)->sw_if_index[VLIB_TX];
+
+ n_bytes += n_bytes_b0 + n_bytes_b1;
+ n_packets += 2;
+
+ if (PREDICT_FALSE(si->output_feature_bitmap &&
+ vnet_buffer(b0)->output_features.bitmap != (1 << INTF_OUTPUT_FEAT_DONE)))
+ {
+ u32 next0;
+ vnet_buffer(b0)->output_features.bitmap = si->output_feature_bitmap;
+ count_trailing_zeros(next0, vnet_buffer(b0)->output_features.bitmap);
+ vnet_buffer(b0)->output_features.bitmap &= ~(1 << next0);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_tx,
+ n_left_to_tx, bi0, next0);
+ }
+ else
+ {
+ vnet_buffer(b0)->output_features.bitmap = 0;
+
+ if (PREDICT_FALSE(tx_swif0 != rt->sw_if_index))
+ {
+ /* update vlan subif tx counts, if required */
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_TX,
+ cpu_index,
+ tx_swif0,
+ 1,
+ n_bytes_b0);
+ }
+ }
+
+ if (PREDICT_FALSE(si->output_feature_bitmap &&
+ vnet_buffer(b1)->output_features.bitmap != (1 << INTF_OUTPUT_FEAT_DONE)))
+ {
+ u32 next1;
+ vnet_buffer(b1)->output_features.bitmap = si->output_feature_bitmap;
+ count_trailing_zeros(next1, vnet_buffer(b1)->output_features.bitmap);
+ vnet_buffer(b1)->output_features.bitmap &= ~(1 << next1);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_tx,
+ n_left_to_tx, bi1, next1);
+ }
+ else
+ {
+ vnet_buffer(b1)->output_features.bitmap = 0;
+
+ /* update vlan subif tx counts, if required */
+ if (PREDICT_FALSE(tx_swif1 != rt->sw_if_index))
+ {
+
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_TX,
+ cpu_index,
+ tx_swif1,
+ 1,
+ n_bytes_b1);
+ }
+ }
+
+ }
+
+ while (from + 1 <= from_end && n_left_to_tx >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 tx_swif0;
+
+ bi0 = from[0];
+ to_tx[0] = bi0;
+ from += 1;
+ to_tx += 1;
+ n_left_to_tx -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* Be grumpy about zero length buffers for benefit of
+ driver tx function. */
+ ASSERT (b0->current_length > 0);
+
+ n_bytes_b0 = vlib_buffer_length_in_chain (vm, b0);
+ tx_swif0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ n_bytes += n_bytes_b0;
+ n_packets += 1;
+
+ if (PREDICT_FALSE(si->output_feature_bitmap &&
+ vnet_buffer(b0)->output_features.bitmap != (1 << INTF_OUTPUT_FEAT_DONE)))
+ {
+ u32 next0;
+ vnet_buffer(b0)->output_features.bitmap = si->output_feature_bitmap;
+ count_trailing_zeros(next0, vnet_buffer(b0)->output_features.bitmap);
+ vnet_buffer(b0)->output_features.bitmap &= ~(1 << next0);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_tx,
+ n_left_to_tx, bi0, next0);
+ }
+ else
+ {
+ vnet_buffer(b0)->output_features.bitmap = 0;
+
+ if (PREDICT_FALSE(tx_swif0 != rt->sw_if_index))
+ {
+
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_TX,
+ cpu_index,
+ tx_swif0,
+ 1,
+ n_bytes_b0);
+ }
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_tx);
+ }
+
+ /* Update main interface stats. */
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_TX,
+ cpu_index,
+ rt->sw_if_index,
+ n_packets,
+ n_bytes);
+ return n_buffers;
+}
+
+
+/* Use the buffer's sw_if_index[VLIB_TX] to choose the output interface. */
+static uword
+vnet_per_buffer_interface_output (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 n_left_to_next, * from, * to_next;
+ u32 n_left_from, next_index;
+
+ n_left_from = frame->n_vectors;
+
+ from = vlib_frame_args (frame);
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1, next0, next1;
+ vlib_buffer_t * b0, * b1;
+ vnet_hw_interface_t * hi0, * hi1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ hi0 = vnet_get_sup_hw_interface (vnm, vnet_buffer (b0)->sw_if_index[VLIB_TX]);
+ hi1 = vnet_get_sup_hw_interface (vnm, vnet_buffer (b1)->sw_if_index[VLIB_TX]);
+
+ next0 = hi0->hw_if_index;
+ next1 = hi1->hw_if_index;
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t * b0;
+ vnet_hw_interface_t * hi0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ hi0 = vnet_get_sup_hw_interface (vnm, vnet_buffer (b0)->sw_if_index[VLIB_TX]);
+
+ next0 = hi0->hw_if_index;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+always_inline u32
+counter_index (vlib_main_t * vm, vlib_error_t e)
+{
+ vlib_node_t * n;
+ u32 ci, ni;
+
+ ni = vlib_error_get_node (e);
+ n = vlib_get_node (vm, ni);
+
+ ci = vlib_error_get_code (e);
+ ASSERT (ci < n->n_errors);
+
+ ci += n->error_heap_index;
+
+ return ci;
+}
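+
+/*
+ * Note (illustrative): a vlib_error_t packs the originating node index
+ * together with a per-node error code; counter_index() maps that to a
+ * global counter slot as code + node->error_heap_index.  E.g. if a
+ * node's errors start at heap index 40 and the code is 3, the counter
+ * lives at em->counters[43].
+ */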
+
+static u8 * format_vnet_error_trace (u8 * s, va_list * va)
+{
+ vlib_main_t * vm = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ vlib_error_t * e = va_arg (*va, vlib_error_t *);
+ vlib_node_t * error_node;
+ vlib_error_main_t * em = &vm->error_main;
+ u32 i;
+
+ error_node = vlib_get_node (vm, vlib_error_get_node (e[0]));
+ i = counter_index (vm, e[0]);
+ s = format (s, "%v: %s", error_node->name, em->error_strings_heap[i]);
+
+ return s;
+}
+
+static void
+trace_errors_with_buffers (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left, * buffers;
+
+ buffers = vlib_frame_vector_args (frame);
+ n_left = frame->n_vectors;
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ vlib_error_t * t0, * t1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD);
+
+ bi0 = buffers[0];
+ bi1 = buffers[1];
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0[0] = b0->error;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+ t1[0] = b1->error;
+ }
+ buffers += 2;
+ n_left -= 2;
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ vlib_error_t * t0;
+
+ bi0 = buffers[0];
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0[0] = b0->error;
+ }
+ buffers += 1;
+ n_left -= 1;
+ }
+}
+
+static u8 *
+validate_error (vlib_main_t * vm, vlib_error_t * e, u32 index)
+{
+ uword node_index = vlib_error_get_node (e[0]);
+ uword code = vlib_error_get_code (e[0]);
+ vlib_node_t * n;
+
+ if (node_index >= vec_len (vm->node_main.nodes))
+ return format (0, "[%d], node index out of range 0x%x, error 0x%x",
+ index, node_index, e[0]);
+
+ n = vlib_get_node (vm, node_index);
+ if (code >= n->n_errors)
+ return format (0, "[%d], code %d out of range for node %v",
+ index, code, n->name);
+
+ return 0;
+}
+
+static u8 *
+validate_error_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ u32 * buffers = vlib_frame_args (f);
+ vlib_buffer_t * b;
+ u8 * msg = 0;
+ uword i;
+
+ for (i = 0; i < f->n_vectors; i++)
+ {
+ b = vlib_get_buffer (vm, buffers[i]);
+ msg = validate_error (vm, &b->error, i);
+ if (msg)
+ return msg;
+ }
+
+ return msg;
+}
+
+typedef enum {
+ VNET_ERROR_DISPOSITION_DROP,
+ VNET_ERROR_DISPOSITION_PUNT,
+ VNET_ERROR_N_DISPOSITION,
+} vnet_error_disposition_t;
+
+always_inline void
+do_packet (vlib_main_t * vm, vlib_error_t a)
+{
+ vlib_error_main_t * em = &vm->error_main;
+ u32 i = counter_index (vm, a);
+ em->counters[i] += 1;
+ vlib_error_elog_count (vm, i, 1);
+}
+
+static_always_inline uword
+process_drop_punt (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ vnet_error_disposition_t disposition)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_error_main_t * em = &vm->error_main;
+ u32 * buffers, * first_buffer;
+ vlib_error_t current_error;
+ u32 current_counter_index, n_errors_left;
+ u32 current_sw_if_index, n_errors_current_sw_if_index;
+ u64 current_counter;
+ vlib_simple_counter_main_t * cm;
+ u32 cpu_index = vm->cpu_index;
+
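+  /* Last error seen for each disposition, carried across frames so the
+     counter cache below usually starts out hot. */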
+ static vlib_error_t memory[VNET_ERROR_N_DISPOSITION];
+ static char memory_init[VNET_ERROR_N_DISPOSITION];
+
+ buffers = vlib_frame_args (frame);
+ first_buffer = buffers;
+
+ {
+ vlib_buffer_t * b = vlib_get_buffer (vm, first_buffer[0]);
+
+ if (! memory_init[disposition])
+ {
+ memory_init[disposition] = 1;
+ memory[disposition] = b->error;
+ }
+
+ current_sw_if_index = vnet_buffer (b)->sw_if_index[VLIB_RX];
+ n_errors_current_sw_if_index = 0;
+ }
+
+ current_error = memory[disposition];
+ current_counter_index = counter_index (vm, memory[disposition]);
+ current_counter = em->counters[current_counter_index];
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ trace_errors_with_buffers (vm, node, frame);
+
+ n_errors_left = frame->n_vectors;
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ (disposition == VNET_ERROR_DISPOSITION_PUNT
+ ? VNET_INTERFACE_COUNTER_PUNT
+ : VNET_INTERFACE_COUNTER_DROP));
+
+ while (n_errors_left >= 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ vnet_sw_interface_t * sw_if0, * sw_if1;
+ vlib_error_t e0, e1;
+ u32 bi0, bi1;
+ u32 sw_if_index0, sw_if_index1;
+
+ bi0 = buffers[0];
+ bi1 = buffers[1];
+
+ buffers += 2;
+ n_errors_left -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ e0 = b0->error;
+ e1 = b1->error;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+
+      /* Speculate that both packets come from current_sw_if_index. */
+      n_errors_current_sw_if_index += 2;
+
+      /* Speculatively assume both (node, code) pairs are equal
+	 to the current (node, code). */
+ current_counter += 2;
+
+ if (PREDICT_FALSE (e0 != current_error
+ || e1 != current_error
+ || sw_if_index0 != current_sw_if_index
+ || sw_if_index1 != current_sw_if_index))
+ {
+ current_counter -= 2;
+ n_errors_current_sw_if_index -= 2;
+
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+
+ /* Increment super-interface drop/punt counters for
+ sub-interfaces. */
+ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0);
+ vlib_increment_simple_counter
+ (cm, cpu_index, sw_if0->sup_sw_if_index,
+ sw_if0->sup_sw_if_index != sw_if_index0);
+
+ sw_if1 = vnet_get_sw_interface (vnm, sw_if_index1);
+ vlib_increment_simple_counter
+ (cm, cpu_index, sw_if1->sup_sw_if_index,
+ sw_if1->sup_sw_if_index != sw_if_index1);
+
+ em->counters[current_counter_index] = current_counter;
+ do_packet (vm, e0);
+ do_packet (vm, e1);
+
+ /* For 2 repeated errors, change current error. */
+ if (e0 == e1 && e1 != current_error)
+ {
+ current_error = e0;
+ current_counter_index = counter_index (vm, e0);
+ }
+ current_counter = em->counters[current_counter_index];
+ }
+ }
+
+ while (n_errors_left >= 1)
+ {
+ vlib_buffer_t * b0;
+ vnet_sw_interface_t * sw_if0;
+ vlib_error_t e0;
+ u32 bi0, sw_if_index0;
+
+ bi0 = buffers[0];
+
+ buffers += 1;
+ n_errors_left -= 1;
+ current_counter += 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ e0 = b0->error;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ /* Increment drop/punt counters. */
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+
+ /* Increment super-interface drop/punt counters for sub-interfaces. */
+ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0);
+ vlib_increment_simple_counter (cm, cpu_index, sw_if0->sup_sw_if_index,
+ sw_if0->sup_sw_if_index != sw_if_index0);
+
+ if (PREDICT_FALSE (e0 != current_error))
+ {
+ current_counter -= 1;
+
+ vlib_error_elog_count (vm, current_counter_index,
+ (current_counter
+ - em->counters[current_counter_index]));
+
+ em->counters[current_counter_index] = current_counter;
+
+ do_packet (vm, e0);
+ current_error = e0;
+ current_counter_index = counter_index (vm, e0);
+ current_counter = em->counters[current_counter_index];
+ }
+ }
+
+ if (n_errors_current_sw_if_index > 0)
+ {
+ vnet_sw_interface_t * si;
+
+ vlib_increment_simple_counter (cm, cpu_index, current_sw_if_index,
+ n_errors_current_sw_if_index);
+
+ si = vnet_get_sw_interface (vnm, current_sw_if_index);
+ if (si->sup_sw_if_index != current_sw_if_index)
+ vlib_increment_simple_counter (cm, cpu_index, si->sup_sw_if_index,
+ n_errors_current_sw_if_index);
+ }
+
+ vlib_error_elog_count (vm, current_counter_index,
+ (current_counter
+ - em->counters[current_counter_index]));
+
+ /* Return cached counter. */
+ em->counters[current_counter_index] = current_counter;
+
+ /* Save memory for next iteration. */
+ memory[disposition] = current_error;
+
+ if (disposition == VNET_ERROR_DISPOSITION_DROP
+ || ! vm->os_punt_frame)
+ {
+ vlib_buffer_free
+ (vm,
+ first_buffer,
+ frame->n_vectors);
+
+ /* If there is no punt function, free the frame as well. */
+ if (disposition == VNET_ERROR_DISPOSITION_PUNT && ! vm->os_punt_frame)
+ vlib_frame_free (vm, node, frame);
+ }
+ else
+ vm->os_punt_frame (vm, node, frame);
+
+ return frame->n_vectors;
+}
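+
+/*
+ * Note: the loops above batch counter updates by caching the current
+ * (error, sw_if_index) pair and speculatively bumping a local copy of
+ * its counter; em->counters[] and the per-interface drop/punt counters
+ * are only written back when the error stream changes, which keeps the
+ * common all-same-error frame cheap.
+ */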
+
+static inline void
+pcap_drop_trace (vlib_main_t * vm,
+ vnet_interface_main_t * im,
+ vlib_frame_t * f)
+{
+ u32 * from;
+ u32 n_left = f->n_vectors;
+ vlib_buffer_t * b0, * p1;
+ u32 bi0;
+ i16 save_current_data;
+ u16 save_current_length;
+
+ from = vlib_frame_vector_args (f);
+
+ while (n_left > 0)
+ {
+ if (PREDICT_TRUE (n_left > 1))
+ {
+ p1 = vlib_get_buffer (vm, from[1]);
+ vlib_prefetch_buffer_header (p1, LOAD);
+ }
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ from++;
+ n_left--;
+
+ /* See if we're pointedly ignoring this specific error */
+ if (im->pcap_drop_filter_hash
+ && hash_get (im->pcap_drop_filter_hash, b0->error))
+ continue;
+
+ /* Trace all drops, or drops received on a specific interface */
+ if (im->pcap_sw_if_index == 0 ||
+ im->pcap_sw_if_index == vnet_buffer(b0)->sw_if_index [VLIB_RX])
+ {
+ save_current_data = b0->current_data;
+ save_current_length = b0->current_length;
+
+ /*
+ * Typically, we'll need to rewind the buffer
+ */
+ if (b0->current_data > 0)
+ vlib_buffer_advance (b0, (word) -b0->current_data);
+
+ pcap_add_buffer (&im->pcap_main, vm, bi0, 512);
+
+ b0->current_data = save_current_data;
+ b0->current_length = save_current_length;
+ }
+ }
+}
+
+void vnet_pcap_drop_trace_filter_add_del (u32 error_index, int is_add)
+{
+ vnet_interface_main_t * im = &vnet_get_main()->interface_main;
+
+ if (im->pcap_drop_filter_hash == 0)
+ im->pcap_drop_filter_hash = hash_create (0, sizeof (uword));
+
+ if (is_add)
+ hash_set (im->pcap_drop_filter_hash, error_index, 1);
+ else
+ hash_unset (im->pcap_drop_filter_hash, error_index);
+}
+
+static uword
+process_drop (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_interface_main_t * im = &vnet_get_main()->interface_main;
+
+ if (PREDICT_FALSE (im->drop_pcap_enable))
+ pcap_drop_trace (vm, im, frame);
+
+ return process_drop_punt (vm, node, frame, VNET_ERROR_DISPOSITION_DROP);
+}
+
+static uword
+process_punt (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return process_drop_punt (vm, node, frame, VNET_ERROR_DISPOSITION_PUNT);
+}
+
+VLIB_REGISTER_NODE (drop_buffers,static) = {
+ .function = process_drop,
+ .name = "error-drop",
+ .flags = VLIB_NODE_FLAG_IS_DROP,
+ .vector_size = sizeof (u32),
+ .format_trace = format_vnet_error_trace,
+ .validate_frame = validate_error_frame,
+};
+
+VLIB_REGISTER_NODE (punt_buffers,static) = {
+ .function = process_punt,
+ .flags = (VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH
+ | VLIB_NODE_FLAG_IS_PUNT),
+ .name = "error-punt",
+ .vector_size = sizeof (u32),
+ .format_trace = format_vnet_error_trace,
+ .validate_frame = validate_error_frame,
+};
+
+static clib_error_t *
+vnet_per_buffer_interface_output_hw_interface_add_del (vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 is_create);
+
+VLIB_REGISTER_NODE (vnet_per_buffer_interface_output_node,static) = {
+ .function = vnet_per_buffer_interface_output,
+ .name = "interface-output",
+ .vector_size = sizeof (u32),
+};
+
+clib_error_t *
+vnet_per_buffer_interface_output_hw_interface_add_del (vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 is_create)
+{
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ u32 next_index;
+
+ next_index = vlib_node_add_next_with_slot
+ (vnm->vlib_main, vnet_per_buffer_interface_output_node.index,
+ hi->output_node_index,
+ /* next_index */ hw_if_index);
+
+ ASSERT (next_index == hw_if_index);
+
+ return 0;
+}
+
+VNET_HW_INTERFACE_ADD_DEL_FUNCTION
+(vnet_per_buffer_interface_output_hw_interface_add_del);
+
+static clib_error_t *
+pcap_drop_trace_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ u8 * filename;
+ u32 max;
+ int matched = 0;
+ clib_error_t * error = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "on"))
+ {
+ if (im->drop_pcap_enable == 0)
+ {
+ if (im->pcap_filename == 0)
+ im->pcap_filename = format (0, "/tmp/drop.pcap%c", 0);
+
+ memset (&im->pcap_main, 0, sizeof (im->pcap_main));
+ im->pcap_main.file_name = (char *) im->pcap_filename;
+ im->pcap_main.n_packets_to_capture = 100;
+ if (im->pcap_pkts_to_capture)
+ im->pcap_main.n_packets_to_capture = im->pcap_pkts_to_capture;
+
+ im->pcap_main.packet_type = PCAP_PACKET_TYPE_ethernet;
+ im->drop_pcap_enable = 1;
+ vlib_cli_output (vm, "pcap drop capture on...");
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap drop capture already on...");
+ }
+ matched = 1;
+ }
+ else if (unformat (input, "off"))
+ {
+ matched = 1;
+
+ if (im->drop_pcap_enable)
+ {
+ vlib_cli_output (vm, "captured %d pkts...",
+ im->pcap_main.n_packets_captured);
+ if (im->pcap_main.n_packets_captured)
+ {
+ im->pcap_main.n_packets_to_capture =
+ im->pcap_main.n_packets_captured;
+ error = pcap_write (&im->pcap_main);
+ if (error)
+ clib_error_report (error);
+ else
+ vlib_cli_output (vm, "saved to %s...", im->pcap_filename);
+ }
+ }
+ else
+ {
+ vlib_cli_output (vm, "pcap drop capture already off...");
+ }
+
+ im->drop_pcap_enable = 0;
+ }
+ else if (unformat (input, "max %d", &max))
+ {
+ im->pcap_pkts_to_capture = max;
+ matched = 1;
+ }
+
+ else if (unformat (input, "intfc %U",
+ unformat_vnet_sw_interface, vnm,
+ &im->pcap_sw_if_index))
+ matched = 1;
+ else if (unformat (input, "intfc any"))
+ {
+ im->pcap_sw_if_index = 0;
+ matched = 1;
+ }
+ else if (unformat (input, "file %s", &filename))
+ {
+ u8 * chroot_filename;
+ /* Brain-police user path input */
+ if (strstr((char *)filename, "..") || index((char *)filename, '/'))
+ {
+ vlib_cli_output (vm, "illegal characters in filename '%s'",
+ filename);
+ continue;
+ }
+
+ chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+ vec_free (filename);
+
+ if (im->pcap_filename)
+ vec_free (im->pcap_filename);
+ im->pcap_filename = chroot_filename;
+ matched = 1;
+ }
+ else if (unformat (input, "status"))
+	  if (im->drop_pcap_enable == 0)
+	    {
+	      vlib_cli_output (vm, "pcap drop capture is off...");
+	      matched = 1;
+	      continue;
+	    }
+ }
+
+ vlib_cli_output (vm, "pcap drop capture: %d of %d pkts...",
+ im->pcap_main.n_packets_captured,
+ im->pcap_main.n_packets_to_capture);
+ matched = 1;
+ }
+
+ else
+ break;
+ }
+
+ if (matched == 0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (pcap_trace_command, static) = {
+ .path = "pcap drop trace",
+ .short_help =
+ "pcap drop trace on off max <nn> intfc <intfc> file <name> status",
+ .function = pcap_drop_trace_command_fn,
+};
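+
+/*
+ * Illustrative CLI session (a sketch, not part of this change),
+ * assuming the defaults above:
+ *
+ *   pcap drop trace on max 1000 intfc any file drop.pcap
+ *   pcap drop trace status
+ *   pcap drop trace off
+ *
+ * The capture is written under /tmp/ since user-supplied filenames are
+ * chroot'd there.
+ */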
diff --git a/vnet/vnet/ip/format.c b/vnet/vnet/ip/format.c
new file mode 100644
index 00000000000..9dda4c5e10b
--- /dev/null
+++ b/vnet/vnet/ip/format.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip_format.c: ip generic (4 or 6) formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+/* Format IP protocol. */
+u8 * format_ip_protocol (u8 * s, va_list * args)
+{
+ ip_protocol_t protocol = va_arg (*args, ip_protocol_t);
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi = ip_get_protocol_info (im, protocol);
+
+ if (pi)
+ return format (s, "%s", pi->name);
+ else
+ return format (s, "unknown %d", protocol);
+}
+
+uword unformat_ip_protocol (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+ int i;
+
+ if (! unformat_user (input, unformat_vlib_number_by_name,
+ im->protocol_info_by_name, &i))
+ return 0;
+
+ pi = vec_elt_at_index (im->protocol_infos, i);
+ *result = pi->protocol;
+ return 1;
+}
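+
+/*
+ * Illustrative usage (a sketch, not part of this change):
+ *
+ *   u8 proto;
+ *   if (unformat (input, "%U", unformat_ip_protocol, &proto))
+ *     ... proto holds the IP protocol number, e.g. 6 for "tcp" ...
+ */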
+
+u8 * format_tcp_udp_port (u8 * s, va_list * args)
+{
+ int port = va_arg (*args, int);
+ ip_main_t * im = &ip_main;
+ tcp_udp_port_info_t * pi;
+
+ pi = ip_get_tcp_udp_port_info (im, port);
+ if (pi)
+ s = format (s, "%s", pi->name);
+ else
+ s = format (s, "%d", clib_net_to_host_u16 (port));
+
+ return s;
+}
+
+uword unformat_tcp_udp_port (unformat_input_t * input, va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ ip_main_t * im = &ip_main;
+ tcp_udp_port_info_t * pi;
+ u32 i, port;
+
+
+ im->port_info_by_name, &i))
+ {
+ pi = vec_elt_at_index (im->port_infos, i);
+ port = pi->port;
+ }
+ else if (unformat_user (input, unformat_vlib_number, &port)
+ && port < (1 << 16))
+ port = clib_host_to_net_u16 (port);
+
+ else
+ return 0;
+
+ *result = port;
+ return 1;
+}
+
+uword unformat_ip46_address (unformat_input_t * input, va_list * args)
+{
+ ip46_address_t * a = va_arg (*args, ip46_address_t *);
+ u32 is_ip6 = va_arg (*args, u32);
+ if (is_ip6)
+ return unformat_user (input, unformat_ip6_address, &a->ip6);
+ else
+ return unformat_user (input, unformat_ip4_address, &a->ip4);
+}
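+
+/*
+ * Illustrative usage (a sketch, not part of this change):
+ *
+ *   ip46_address_t a;
+ *   u32 is_ip6 = 1;
+ *   if (unformat (input, "%U", unformat_ip46_address, &a, is_ip6))
+ *     ... a.ip6 holds the parsed IPv6 address ...
+ */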
diff --git a/vnet/vnet/ip/format.h b/vnet/vnet/ip/format.h
new file mode 100644
index 00000000000..511a9346bf6
--- /dev/null
+++ b/vnet/vnet/ip/format.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/format.h: ip 4 and/or 6 formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_format_h
+#define included_ip_format_h
+
+/* IP4 or IP6. */
+
+format_function_t format_ip_protocol;
+unformat_function_t unformat_ip_protocol;
+
+format_function_t format_tcp_udp_port;
+unformat_function_t unformat_tcp_udp_port;
+
+format_function_t format_ip_adjacency;
+format_function_t format_ip_adjacency_packet_data;
+
+unformat_function_t unformat_ip46_address;
+
+/* IP4 */
+
+/* Parse an IP4 address %d.%d.%d.%d. */
+unformat_function_t unformat_ip4_address;
+
+/* Format an IP4 address. */
+format_function_t format_ip4_address;
+format_function_t format_ip4_address_and_length;
+
+/* Parse an IP4 header. */
+unformat_function_t unformat_ip4_header;
+
+/* Format an IP4 header. */
+format_function_t format_ip4_header;
+
+/* Parse an IP packet matching pattern. */
+unformat_function_t unformat_ip4_match;
+
+unformat_function_t unformat_pg_ip4_header;
+
+/* IP6 */
+unformat_function_t unformat_ip6_address;
+format_function_t format_ip6_address;
+format_function_t format_ip6_address_and_length;
+unformat_function_t unformat_ip6_header;
+format_function_t format_ip6_header;
+unformat_function_t unformat_pg_ip6_header;
+
+/* Format TCP/UDP headers. */
+format_function_t format_tcp_header, format_udp_header;
+
+unformat_function_t unformat_pg_tcp_header, unformat_pg_udp_header;
+
+#endif /* included_ip_format_h */
diff --git a/vnet/vnet/ip/icmp4.c b/vnet/vnet/ip/icmp4.c
new file mode 100644
index 00000000000..e21f3bf047b
--- /dev/null
+++ b/vnet/vnet/ip/icmp4.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/icmp4.c: ipv4 icmp
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+
+static u8 * format_ip4_icmp_type_and_code (u8 * s, va_list * args)
+{
+ icmp4_type_t type = va_arg (*args, int);
+ u8 code = va_arg (*args, int);
+ char * t = 0;
+
+#define _(n,f) case n: t = #f; break;
+
+ switch (type)
+ {
+ foreach_icmp4_type;
+
+ default:
+ break;
+ }
+
+#undef _
+
+ if (! t)
+ return format (s, "unknown 0x%x", type);
+
+ s = format (s, "%s", t);
+
+ t = 0;
+ switch ((type << 8) | code)
+ {
+#define _(a,n,f) case (ICMP4_##a << 8) | (n): t = #f; break;
+
+ foreach_icmp4_code;
+
+#undef _
+ }
+
+ if (t)
+ s = format (s, " %s", t);
+
+ return s;
+}
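+
+/* Editorial note: with the type/code tables from icmp46_packet.h,
+   this yields strings such as "echo_request" or
+   "time_exceeded ttl_exceeded_in_transit"; unrecognized types print
+   as "unknown 0x<type>". */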
+
+static u8 * format_ip4_icmp_header (u8 * s, va_list * args)
+{
+ icmp46_header_t * icmp = va_arg (*args, icmp46_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+
+  /* Not enough data to decode the header. */
+ if (max_header_bytes < sizeof (icmp[0]))
+ return format (s, "ICMP header truncated");
+
+ s = format (s, "ICMP %U checksum 0x%x",
+ format_ip4_icmp_type_and_code, icmp->type, icmp->code,
+ clib_net_to_host_u16 (icmp->checksum));
+
+ return s;
+}
+
+typedef struct {
+ u8 packet_data[64];
+} icmp_input_trace_t;
+
+static u8 * format_icmp_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ icmp_input_trace_t * t = va_arg (*va, icmp_input_trace_t *);
+
+ s = format (s, "%U",
+ format_ip4_header,
+ t->packet_data, sizeof (t->packet_data));
+
+ return s;
+}
+
+typedef enum {
+ ICMP4_ERROR_UNKNOWN_TYPE,
+ ICMP4_ERROR_ECHO_REPLIES_SENT,
+ ICMP4_ERROR_TTL_EXPIRE_RESP_SENT,
+ ICMP4_ERROR_TTL_EXPIRE_RESP_DROP,
+} icmp_error_t;
+
+static char * icmp_error_strings[] = {
+ [ICMP4_ERROR_UNKNOWN_TYPE] = "unknown type",
+ [ICMP4_ERROR_ECHO_REPLIES_SENT] = "echo replies sent",
+ [ICMP4_ERROR_TTL_EXPIRE_RESP_SENT] = "TTL time exceeded response sent",
+ [ICMP4_ERROR_TTL_EXPIRE_RESP_DROP] = "TTL time exceeded response dropped",
+};
+
+typedef enum {
+ ICMP_INPUT_NEXT_ERROR,
+ ICMP_INPUT_N_NEXT,
+} icmp_input_next_t;
+
+typedef struct {
+ uword * type_and_code_by_name;
+
+ uword * type_by_name;
+
+ /* Vector dispatch table indexed by [icmp type]. */
+ u8 ip4_input_next_index_by_type[256];
+} icmp4_main_t;
+
+icmp4_main_t icmp4_main;
+
+static uword
+ip4_icmp_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ icmp4_main_t * im = &icmp4_main;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ icmp46_header_t * icmp0;
+ icmp4_type_t type0;
+ u32 bi0, next0;
+
+ if (PREDICT_TRUE (n_left_from > 2))
+ {
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ p0 = vlib_get_buffer (vm, from[1]);
+ ip0 = vlib_buffer_get_current (p0);
+ CLIB_PREFETCH(ip0, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (p0);
+ icmp0 = ip4_next_header (ip0);
+ type0 = icmp0->type;
+ next0 = im->ip4_input_next_index_by_type[type0];
+
+ p0->error = node->errors[ICMP4_ERROR_UNKNOWN_TYPE];
+ if (PREDICT_FALSE (next0 != next))
+ {
+ vlib_put_next_frame (vm, node, next, n_left_to_next + 1);
+ next = next0;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip4_icmp_input_node,static) = {
+ .function = ip4_icmp_input,
+ .name = "ip4-icmp-input",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp_input_trace,
+
+ .n_errors = ARRAY_LEN (icmp_error_strings),
+ .error_strings = icmp_error_strings,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [ICMP_INPUT_NEXT_ERROR] = "error-punt",
+ },
+};
+
+static uword
+ip4_icmp_echo_request (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next;
+ ip4_main_t * i4m = &ip4_main;
+ u16 * fragment_ids, * fid;
+ u8 host_config_ttl = i4m->host_config.ttl;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp_input_trace_t));
+
+ /* Get random fragment IDs for replies. */
+ fid = fragment_ids = clib_random_buffer_get_data (&vm->random_buffer,
+ n_packets * sizeof (fragment_ids[0]));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_left_from > 2 && n_left_to_next > 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ icmp46_header_t * icmp0, * icmp1;
+ u32 bi0, src0, dst0;
+ u32 bi1, src1, dst1;
+ ip_csum_t sum0, sum1;
+
+ bi0 = to_next[0] = from[0];
+ bi1 = to_next[1] = from[1];
+
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ p1 = vlib_get_buffer (vm, bi1);
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+ icmp0 = ip4_next_header (ip0);
+ icmp1 = ip4_next_header (ip1);
+
+ vnet_buffer (p0)->sw_if_index[VLIB_RX] = vnet_main.local_interface_sw_if_index;
+ vnet_buffer (p1)->sw_if_index[VLIB_RX] = vnet_main.local_interface_sw_if_index;
+
+ /* Update ICMP checksum. */
+ sum0 = icmp0->checksum;
+ sum1 = icmp1->checksum;
+
+ ASSERT (icmp0->type == ICMP4_echo_request);
+ ASSERT (icmp1->type == ICMP4_echo_request);
+ sum0 = ip_csum_update (sum0, ICMP4_echo_request, ICMP4_echo_reply,
+ icmp46_header_t, type);
+ sum1 = ip_csum_update (sum1, ICMP4_echo_request, ICMP4_echo_reply,
+ icmp46_header_t, type);
+ icmp0->type = ICMP4_echo_reply;
+ icmp1->type = ICMP4_echo_reply;
+
+ icmp0->checksum = ip_csum_fold (sum0);
+ icmp1->checksum = ip_csum_fold (sum1);
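+
+          /* Editorial note: ip_csum_update applies the RFC 1624
+           * incremental rule (HC' = ~(~HC + ~m + m')), folding only
+           * the changed 16-bit word containing the type field into
+           * the existing checksum; the payload is never re-summed. */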
+
+ src0 = ip0->src_address.data_u32;
+ src1 = ip1->src_address.data_u32;
+ dst0 = ip0->dst_address.data_u32;
+ dst1 = ip1->dst_address.data_u32;
+
+ /* Swap source and destination address.
+ Does not change checksum. */
+ ip0->src_address.data_u32 = dst0;
+ ip1->src_address.data_u32 = dst1;
+ ip0->dst_address.data_u32 = src0;
+ ip1->dst_address.data_u32 = src1;
+
+ /* Update IP checksum. */
+ sum0 = ip0->checksum;
+ sum1 = ip1->checksum;
+
+ sum0 = ip_csum_update (sum0, ip0->ttl, host_config_ttl,
+ ip4_header_t, ttl);
+ sum1 = ip_csum_update (sum1, ip1->ttl, host_config_ttl,
+ ip4_header_t, ttl);
+ ip0->ttl = host_config_ttl;
+ ip1->ttl = host_config_ttl;
+
+ /* New fragment id. */
+ sum0 = ip_csum_update (sum0, ip0->fragment_id, fid[0],
+ ip4_header_t, fragment_id);
+ sum1 = ip_csum_update (sum1, ip1->fragment_id, fid[1],
+ ip4_header_t, fragment_id);
+ ip0->fragment_id = fid[0];
+ ip1->fragment_id = fid[1];
+ fid += 2;
+
+ ip0->checksum = ip_csum_fold (sum0);
+ ip1->checksum = ip_csum_fold (sum1);
+
+ ASSERT (ip0->checksum == ip4_header_checksum (ip0));
+ ASSERT (ip1->checksum == ip4_header_checksum (ip1));
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ icmp46_header_t * icmp0;
+ u32 bi0, src0, dst0;
+ ip_csum_t sum0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (p0);
+ icmp0 = ip4_next_header (ip0);
+
+ vnet_buffer (p0)->sw_if_index[VLIB_RX] = vnet_main.local_interface_sw_if_index;
+
+ /* Update ICMP checksum. */
+ sum0 = icmp0->checksum;
+
+ ASSERT (icmp0->type == ICMP4_echo_request);
+ sum0 = ip_csum_update (sum0, ICMP4_echo_request, ICMP4_echo_reply,
+ icmp46_header_t, type);
+ icmp0->type = ICMP4_echo_reply;
+ icmp0->checksum = ip_csum_fold (sum0);
+
+ src0 = ip0->src_address.data_u32;
+ dst0 = ip0->dst_address.data_u32;
+ ip0->src_address.data_u32 = dst0;
+ ip0->dst_address.data_u32 = src0;
+
+ /* Update IP checksum. */
+ sum0 = ip0->checksum;
+
+ sum0 = ip_csum_update (sum0, ip0->ttl, host_config_ttl,
+ ip4_header_t, ttl);
+ ip0->ttl = host_config_ttl;
+
+ sum0 = ip_csum_update (sum0, ip0->fragment_id, fid[0],
+ ip4_header_t, fragment_id);
+ ip0->fragment_id = fid[0];
+ fid += 1;
+
+ ip0->checksum = ip_csum_fold (sum0);
+
+ ASSERT (ip0->checksum == ip4_header_checksum (ip0));
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ vlib_error_count (vm, ip4_icmp_input_node.index,
+ ICMP4_ERROR_ECHO_REPLIES_SENT,
+ frame->n_vectors);
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip4_icmp_echo_request_node,static) = {
+ .function = ip4_icmp_echo_request,
+ .name = "ip4-icmp-echo-request",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp_input_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "ip4-rewrite-local",
+ },
+};
+
+typedef enum {
+ ICMP4_TTL_EXPIRE_NEXT_DROP,
+ ICMP4_TTL_EXPIRE_NEXT_LOOKUP,
+ ICMP4_TTL_EXPIRE_N_NEXT,
+} icmp_ttl_expire_next_t;
+
+static uword
+ip4_icmp_ttl_expire (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from, * to_next;
+ uword n_left_from, n_left_to_next;
+ icmp_ttl_expire_next_t next_index;
+ ip4_main_t *im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1, sizeof (icmp_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 pi0 = from[0];
+ u32 next0 = ICMP4_TTL_EXPIRE_NEXT_LOOKUP;
+ u8 error0 = ICMP4_ERROR_TTL_EXPIRE_RESP_SENT;
+ u32 len0, new_len0;
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0, * out_ip0;
+ icmp46_header_t * icmp0;
+ ip_csum_t sum;
+ u32 sw_if_index0, if_add_index0;
+
+ /* Speculatively enqueue p0 to the current next frame */
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip0 = vlib_buffer_get_current(p0);
+ len0 = vlib_buffer_length_in_chain (vm, p0);
+ sw_if_index0 = vnet_buffer(p0)->sw_if_index[VLIB_RX];
+
+ /* Cut payload to just IP header plus first 8 bytes */
+          new_len0 = (ip0->ip_version_and_header_length & 0xf) * 4 + 8;
+ if (len0 > new_len0)
+ {
+ p0->current_length = new_len0; /* should fit in 1st buffer */
+ if (PREDICT_FALSE(p0->total_length_not_including_first_buffer))
+ { /* clear current_length of all other buffers in chain */
+ vlib_buffer_t *b = p0;
+ p0->total_length_not_including_first_buffer = 0;
+ while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ b = vlib_get_buffer (vm, b->next_buffer);
+ b->current_length = 0;
+ }
+ }
+ }
+
+ /* Add IP header and ICMP header including a 4 byte unused field */
+ vlib_buffer_advance(p0,
+ -sizeof(ip4_header_t)-sizeof(icmp46_header_t)-4);
+ out_ip0 = vlib_buffer_get_current(p0);
+ icmp0 = (icmp46_header_t *) &out_ip0[1];
+
+ /* Fill ip header fields */
+ out_ip0->ip_version_and_header_length = 0x45;
+ out_ip0->tos = 0;
+ out_ip0->length = clib_host_to_net_u16(p0->current_length);
+ out_ip0->fragment_id = 0;
+ out_ip0->ttl = 0xff;
+ out_ip0->protocol = IP_PROTOCOL_ICMP;
+ out_ip0->dst_address = ip0->src_address;
+ if_add_index0 =
+ lm->if_address_pool_index_by_sw_if_index[sw_if_index0];
+ if (PREDICT_TRUE(if_add_index0 != ~0))
+ {
+ ip_interface_address_t *if_add =
+ pool_elt_at_index(lm->if_address_pool, if_add_index0);
+ ip4_address_t *if_ip =
+ ip_interface_address_get_address(lm, if_add);
+ out_ip0->src_address = *if_ip;
+ vlib_error_count (vm, node->node_index, error0, 1);
+ }
+ else /* interface has no IP4 address - should not happen */
+ {
+ next0 = ICMP4_TTL_EXPIRE_NEXT_DROP;
+ error0 = ICMP4_ERROR_TTL_EXPIRE_RESP_DROP;
+ }
+ out_ip0->checksum = ip4_header_checksum(out_ip0);
+
+ /* Fill icmp header fields */
+ icmp0->type = ICMP4_time_exceeded;
+ icmp0->code = ICMP4_time_exceeded_ttl_exceeded_in_transit;
+ icmp0->checksum = 0;
+ sum = ip_incremental_checksum(
+ 0, icmp0, p0->current_length - sizeof(ip4_header_t));
+ icmp0->checksum = ~ip_csum_fold(sum);
+
+ /* Update error status */
+ p0->error = node->errors[error0];
+
+ /* Verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip4_icmp_ttl_expire_node) = {
+ .function = ip4_icmp_ttl_expire,
+ .name = "ip4-icmp-ttl-expire",
+ .vector_size = sizeof (u32),
+
+ .n_errors = ARRAY_LEN (icmp_error_strings),
+ .error_strings = icmp_error_strings,
+
+ .n_next_nodes = ICMP4_TTL_EXPIRE_N_NEXT,
+ .next_nodes = {
+ [ICMP4_TTL_EXPIRE_NEXT_DROP] = "error-drop",
+ [ICMP4_TTL_EXPIRE_NEXT_LOOKUP] = "ip4-lookup",
+ },
+
+ .format_trace = format_icmp_input_trace,
+};
+
+
+static uword unformat_icmp_type_and_code (unformat_input_t * input, va_list * args)
+{
+ icmp46_header_t * h = va_arg (*args, icmp46_header_t *);
+ icmp4_main_t * cm = &icmp4_main;
+ u32 i;
+
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ cm->type_and_code_by_name, &i))
+ {
+ h->type = (i >> 8) & 0xff;
+ h->code = (i >> 0) & 0xff;
+ }
+ else if (unformat_user (input, unformat_vlib_number_by_name,
+ cm->type_by_name, &i))
+ {
+ h->type = i;
+ h->code = 0;
+ }
+ else
+ return 0;
+
+ return 1;
+}
+
+static void
+icmp4_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 ip_offset, icmp_offset;
+
+ icmp_offset = g->start_byte_offset;
+ ip_offset = (g-1)->start_byte_offset;
+
+ while (n_packets >= 1)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ icmp46_header_t * icmp0;
+ u32 len0;
+
+ p0 = vlib_get_buffer (vm, packets[0]);
+ n_packets -= 1;
+ packets += 1;
+
+ ASSERT (p0->current_data == 0);
+ ip0 = (void *) (p0->data + ip_offset);
+ icmp0 = (void *) (p0->data + icmp_offset);
+ len0 = clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0);
+ icmp0->checksum = ~ ip_csum_fold (ip_incremental_checksum (0, icmp0, len0));
+ }
+}
+
+typedef struct {
+ pg_edit_t type, code;
+ pg_edit_t checksum;
+} pg_icmp46_header_t;
+
+always_inline void
+pg_icmp_header_init (pg_icmp46_header_t * p)
+{
+  /* Initialize fields that are not bit fields in the ICMP header. */
+#define _(f) pg_edit_init (&p->f, icmp46_header_t, f);
+ _ (type);
+ _ (code);
+ _ (checksum);
+#undef _
+}
+
+static uword
+unformat_pg_icmp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_icmp46_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (icmp46_header_t),
+ &group_index);
+ pg_icmp_header_init (p);
+
+ p->checksum.type = PG_EDIT_UNSPECIFIED;
+
+ {
+ icmp46_header_t tmp;
+
+ if (! unformat (input, "ICMP %U", unformat_icmp_type_and_code, &tmp))
+ goto error;
+
+ pg_edit_set_fixed (&p->type, tmp.type);
+ pg_edit_set_fixed (&p->code, tmp.code);
+ }
+
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "checksum %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->checksum))
+ ;
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ if (! unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ g->edit_function = icmp4_pg_edit_function;
+ g->edit_function_opaque = 0;
+ }
+
+ return 1;
+
+ error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
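+
+/* Illustrative pg usage (assumed CLI fragment, not part of this
+   change): a packet-generator stream spec such as "ICMP echo_request"
+   parses here; when no "checksum <n>" edit is supplied,
+   icmp4_pg_edit_function computes the checksum per packet at
+   generation time. */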
+
+void ip4_icmp_register_type (vlib_main_t * vm, icmp4_type_t type,
+ u32 node_index)
+{
+ icmp4_main_t * im = &icmp4_main;
+
+ ASSERT (type < ARRAY_LEN (im->ip4_input_next_index_by_type));
+ im->ip4_input_next_index_by_type[type]
+ = vlib_node_add_next (vm, ip4_icmp_input_node.index, node_index);
+}
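+
+/* Illustrative usage (my_node is hypothetical): a feature that wants
+   to sink, say, timestamp requests would call
+     ip4_icmp_register_type (vm, ICMP4_timestamp_request, my_node.index);
+   icmp4_init below uses exactly this mechanism for echo requests. */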
+
+static clib_error_t *
+icmp4_init (vlib_main_t * vm)
+{
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+ icmp4_main_t * cm = &icmp4_main;
+ clib_error_t * error;
+
+ error = vlib_call_init_function (vm, ip_main_init);
+
+ if (error)
+ return error;
+
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_ICMP);
+ pi->format_header = format_ip4_icmp_header;
+ pi->unformat_pg_edit = unformat_pg_icmp_header;
+
+ cm->type_by_name = hash_create_string (0, sizeof (uword));
+#define _(n,t) hash_set_mem (cm->type_by_name, #t, (n));
+ foreach_icmp4_type;
+#undef _
+
+ cm->type_and_code_by_name = hash_create_string (0, sizeof (uword));
+#define _(a,n,t) hash_set_mem (cm->type_and_code_by_name, #t, (n) | (ICMP4_##a << 8));
+ foreach_icmp4_code;
+#undef _
+
+ memset (cm->ip4_input_next_index_by_type,
+ ICMP_INPUT_NEXT_ERROR,
+ sizeof (cm->ip4_input_next_index_by_type));
+
+ ip4_icmp_register_type (vm, ICMP4_echo_request, ip4_icmp_echo_request_node.index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (icmp4_init);
diff --git a/vnet/vnet/ip/icmp46_packet.h b/vnet/vnet/ip/icmp46_packet.h
new file mode 100644
index 00000000000..fa3fed4d081
--- /dev/null
+++ b/vnet/vnet/ip/icmp46_packet.h
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * icmp46_packet.h: ip4/ip6 icmp packet format
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_icmp46_packet_h
+#define included_vnet_icmp46_packet_h
+
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip6_packet.h>
+
+#define foreach_icmp4_type \
+ _ (0, echo_reply) \
+ _ (3, destination_unreachable) \
+ _ (4, source_quench) \
+ _ (5, redirect) \
+ _ (6, alternate_host_address) \
+ _ (8, echo_request) \
+ _ (9, router_advertisement) \
+ _ (10, router_solicitation) \
+ _ (11, time_exceeded) \
+ _ (12, parameter_problem) \
+ _ (13, timestamp_request) \
+ _ (14, timestamp_reply) \
+ _ (15, information_request) \
+ _ (16, information_reply) \
+ _ (17, address_mask_request) \
+ _ (18, address_mask_reply) \
+ _ (30, traceroute) \
+ _ (31, datagram_conversion_error) \
+ _ (32, mobile_host_redirect) \
+ _ (33, ip6_where_are_you) \
+ _ (34, ip6_i_am_here) \
+ _ (35, mobile_registration_request) \
+ _ (36, mobile_registration_reply) \
+ _ (37, domain_name_request) \
+ _ (38, domain_name_reply) \
+ _ (39, skip) \
+ _ (40, photuris)
+
+#define icmp_no_code 0
+
+#define foreach_icmp4_code \
+ _ (destination_unreachable, 0, destination_unreachable_net) \
+ _ (destination_unreachable, 1, destination_unreachable_host) \
+ _ (destination_unreachable, 2, protocol_unreachable) \
+ _ (destination_unreachable, 3, port_unreachable) \
+ _ (destination_unreachable, 4, fragmentation_needed_and_dont_fragment_set) \
+ _ (destination_unreachable, 5, source_route_failed) \
+ _ (destination_unreachable, 6, destination_network_unknown) \
+ _ (destination_unreachable, 7, destination_host_unknown) \
+ _ (destination_unreachable, 8, source_host_isolated) \
+ _ (destination_unreachable, 9, network_administratively_prohibited) \
+ _ (destination_unreachable, 10, host_administratively_prohibited) \
+ _ (destination_unreachable, 11, network_unreachable_for_type_of_service) \
+ _ (destination_unreachable, 12, host_unreachable_for_type_of_service) \
+ _ (destination_unreachable, 13, communication_administratively_prohibited) \
+ _ (destination_unreachable, 14, host_precedence_violation) \
+ _ (destination_unreachable, 15, precedence_cutoff_in_effect) \
+ _ (redirect, 0, network_redirect) \
+ _ (redirect, 1, host_redirect) \
+ _ (redirect, 2, type_of_service_and_network_redirect) \
+ _ (redirect, 3, type_of_service_and_host_redirect) \
+ _ (router_advertisement, 0, normal_router_advertisement) \
+ _ (router_advertisement, 16, does_not_route_common_traffic) \
+ _ (time_exceeded, 0, ttl_exceeded_in_transit) \
+ _ (time_exceeded, 1, fragment_reassembly_time_exceeded) \
+ _ (parameter_problem, 0, pointer_indicates_error) \
+ _ (parameter_problem, 1, missing_required_option) \
+ _ (parameter_problem, 2, bad_length)
+
+/* ICMPv6 */
+#define foreach_icmp6_type \
+ _ (1, destination_unreachable) \
+ _ (2, packet_too_big) \
+ _ (3, time_exceeded) \
+ _ (4, parameter_problem) \
+ _ (128, echo_request) \
+ _ (129, echo_reply) \
+ _ (130, multicast_listener_request) \
+ _ (131, multicast_listener_report) \
+ _ (132, multicast_listener_done) \
+ _ (133, router_solicitation) \
+ _ (134, router_advertisement) \
+ _ (135, neighbor_solicitation) \
+ _ (136, neighbor_advertisement) \
+ _ (137, redirect) \
+ _ (138, router_renumbering) \
+ _ (139, node_information_request) \
+ _ (140, node_information_response) \
+ _ (141, inverse_neighbor_solicitation) \
+ _ (142, inverse_neighbor_advertisement) \
+ _ (143, multicast_listener_report_v2) \
+ _ (144, home_agent_address_discovery_request) \
+ _ (145, home_agent_address_discovery_reply) \
+ _ (146, mobile_prefix_solicitation) \
+ _ (147, mobile_prefix_advertisement) \
+ _ (148, certification_path_solicitation) \
+ _ (149, certification_path_advertisement) \
+ _ (151, multicast_router_advertisement) \
+ _ (152, multicast_router_solicitation) \
+ _ (153, multicast_router_termination) \
+ _ (154, fmipv6_messages)
+
+#define foreach_icmp6_code \
+ _ (destination_unreachable, 0, no_route_to_destination) \
+ _ (destination_unreachable, 1, destination_administratively_prohibited) \
+ _ (destination_unreachable, 2, beyond_scope_of_source_address) \
+ _ (destination_unreachable, 3, address_unreachable) \
+ _ (destination_unreachable, 4, port_unreachable) \
+ _ (destination_unreachable, 5, source_address_failed_policy) \
+ _ (destination_unreachable, 6, reject_route_to_destination) \
+ _ (time_exceeded, 0, ttl_exceeded_in_transit) \
+ _ (time_exceeded, 1, fragment_reassembly_time_exceeded) \
+ _ (parameter_problem, 0, erroneous_header_field) \
+ _ (parameter_problem, 1, unrecognized_next_header) \
+ _ (parameter_problem, 2, unrecognized_option) \
+ _ (router_renumbering, 0, command) \
+ _ (router_renumbering, 1, result) \
+ _ (node_information_request, 0, data_contains_ip6_address) \
+ _ (node_information_request, 1, data_contains_name) \
+ _ (node_information_request, 2, data_contains_ip4_address) \
+ _ (node_information_response, 0, success) \
+ _ (node_information_response, 1, failed) \
+ _ (node_information_response, 2, unknown_request)
+
+typedef enum {
+#define _(n,f) ICMP4_##f = n,
+ foreach_icmp4_type
+#undef _
+} icmp4_type_t;
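+
+/* Editorial note: each _(n,f) entry expands to ICMP4_##f = n, so
+   e.g. ICMP4_echo_request == 8 and ICMP4_time_exceeded == 11. */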
+
+typedef enum {
+#define _(t,n,f) ICMP4_##t##_##f = n,
+ foreach_icmp4_code
+#undef _
+} icmp4_code_t;
+
+typedef enum {
+#define _(n,f) ICMP6_##f = n,
+ foreach_icmp6_type
+#undef _
+} icmp6_type_t;
+
+typedef enum {
+#define _(t,n,f) ICMP6_##t##_##f = n,
+ foreach_icmp6_code
+#undef _
+} icmp6_code_t;
+
+typedef CLIB_PACKED (struct {
+ u8 type;
+
+ u8 code;
+
+ /* IP checksum of icmp header plus data which follows. */
+ u16 checksum;
+}) icmp46_header_t;
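+
+/* Editorial note: this 4-byte header is the prefix shared by ICMPv4
+   and ICMPv6 (hence "46"); type-specific fields, such as the echo
+   identifier/sequence or the neighbor discovery structures below,
+   follow it in the packet. */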
+
+/* ip6 neighbor discovery */
+#define foreach_icmp6_neighbor_discovery_option \
+ _ (1, source_link_layer_address) \
+ _ (2, target_link_layer_address) \
+ _ (3, prefix_information) \
+ _ (4, redirected_header) \
+ _ (5, mtu) \
+ _ (6, nbma_shortcut_limit) \
+ _ (7, advertisement_interval) \
+ _ (8, home_agent_information) \
+ _ (9, source_address_list) \
+ _ (10, target_address_list) \
+ _ (11, cryptographically_generated_address) \
+ _ (12, rsa_signature) \
+ _ (13, timestamp) \
+ _ (14, nonce) \
+ _ (15, trust_anchor) \
+ _ (16, certificate) \
+ _ (17, ip_address_and_prefix) \
+ _ (18, new_router_prefix_information) \
+ _ (19, mobile_link_layer_address) \
+ _ (20, neighbor_advertisement_acknowledgment) \
+ _ (23, map) \
+ _ (24, route_information) \
+ _ (25, recursive_dns_server) \
+ _ (26, ra_flags_extension) \
+ _ (27, handover_key_request) \
+ _ (28, handover_key_reply) \
+ _ (29, handover_assist_information) \
+ _ (30, mobile_node_identifier) \
+ _ (31, dns_search_list) \
+ _ (138, card_request) \
+ _ (139, card_reply)
+
+typedef enum icmp6_neighbor_discovery_option_type {
+#define _(n,f) ICMP6_NEIGHBOR_DISCOVERY_OPTION_##f = n,
+ foreach_icmp6_neighbor_discovery_option
+#undef _
+} icmp6_neighbor_discovery_option_type_t;
+
+typedef CLIB_PACKED (struct {
+ /* Option type. */
+ u8 type;
+
+ /* Length of this header plus option data in 8 byte units. */
+ u8 n_data_u64s;
+
+ /* Option data follows. */
+ u8 data[0];
+}) icmp6_neighbor_discovery_option_header_t;
+
+typedef CLIB_PACKED (struct {
+ icmp6_neighbor_discovery_option_header_t header;
+ u8 dst_address_length;
+ u8 flags;
+#define ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_FLAG_ON_LINK (1 << 7)
+#define ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_AUTO (1 << 6)
+ u32 valid_time;
+ u32 preferred_time;
+ u32 unused;
+ ip6_address_t dst_address;
+}) icmp6_neighbor_discovery_prefix_information_option_t;
+
+typedef CLIB_PACKED (struct {
+ u8 type;
+ u8 aux_data_len_u32s;
+ u16 num_sources;
+ ip6_address_t mcast_addr;
+ ip6_address_t source_addr[0];
+}) icmp6_multicast_address_record_t;
+
+typedef CLIB_PACKED (struct {
+ ip6_hop_by_hop_ext_t ext_hdr;
+ ip6_router_alert_option_t alert;
+ ip6_padN_option_t pad;
+ icmp46_header_t icmp;
+ u16 rsvd;
+ u16 num_addr_records;
+ icmp6_multicast_address_record_t records[0];
+}) icmp6_multicast_listener_report_header_t;
+
+typedef CLIB_PACKED (struct {
+ icmp6_neighbor_discovery_option_header_t header;
+ u8 reserved[6];
+ /* IP6 header plus payload follows. */
+ u8 data[0];
+}) icmp6_neighbor_discovery_redirected_header_option_t;
+
+typedef CLIB_PACKED (struct {
+ icmp6_neighbor_discovery_option_header_t header;
+ u16 unused;
+ u32 mtu;
+}) icmp6_neighbor_discovery_mtu_option_t;
+
+typedef CLIB_PACKED (struct {
+ icmp6_neighbor_discovery_option_header_t header;
+ u8 ethernet_address[6];
+}) icmp6_neighbor_discovery_ethernet_link_layer_address_option_t;
+
+typedef CLIB_PACKED (struct {
+ icmp6_neighbor_discovery_option_header_t header;
+ u8 max_l2_address[6+8];
+}) icmp6_neighbor_discovery_max_link_layer_address_option_t;
+
+/* Generic neighbor discovery header. Used for router solicitations,
+   etc. */
+typedef CLIB_PACKED (struct {
+ icmp46_header_t icmp;
+
+ u32 reserved_must_be_zero;
+}) icmp6_neighbor_discovery_header_t;
+
+/* Router advertisement packet formats. */
+typedef CLIB_PACKED (struct {
+ icmp46_header_t icmp;
+
+ /* Current hop limit to use for outgoing packets. */
+ u8 current_hop_limit;
+
+ u8 flags;
+#define ICMP6_ROUTER_DISCOVERY_FLAG_ADDRESS_CONFIG_VIA_DHCP (1 << 7)
+#define ICMP6_ROUTER_DISCOVERY_FLAG_OTHER_CONFIG_VIA_DHCP (1 << 6)
+
+ /* Zero means unspecified. */
+ u16 router_lifetime_in_sec;
+
+ /* Zero means unspecified. */
+ u32 neighbor_reachable_time_in_msec;
+
+ /* Zero means unspecified. */
+ u32 time_in_msec_between_retransmitted_neighbor_solicitations;
+
+ /* Options that may follow: source_link_layer_address, mtu, prefix_information. */
+}) icmp6_router_advertisement_header_t;
+
+/* Neighbor solicitation/advertisement header. */
+typedef CLIB_PACKED (struct {
+ icmp46_header_t icmp;
+
+ /* Zero for solicitation; flags for advertisement. */
+ u32 advertisement_flags;
+ /* Set when sent by a router. */
+#define ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_ROUTER (1 << 31)
+ /* Set when response to solicitation. */
+#define ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_SOLICITED (1 << 30)
+#define ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_OVERRIDE (1 << 29)
+
+ ip6_address_t target_address;
+
+ /* Options that may follow: source_link_layer_address
+ (for solicitation) target_link_layer_address (for advertisement). */
+}) icmp6_neighbor_solicitation_or_advertisement_header_t;
+
+typedef CLIB_PACKED (struct {
+ icmp46_header_t icmp;
+
+ u32 reserved_must_be_zero;
+
+ /* Better next hop to use for given destination. */
+ ip6_address_t better_next_hop_address;
+
+ ip6_address_t dst_address;
+
+ /* Options that may follow: target_link_layer_address,
+ redirected_header. */
+}) icmp6_redirect_header_t;
+
+/* Solicitation/advertisement packet format for ethernet. */
+typedef CLIB_PACKED (struct {
+ ip6_header_t ip;
+
+ icmp6_neighbor_solicitation_or_advertisement_header_t neighbor;
+
+ icmp6_neighbor_discovery_ethernet_link_layer_address_option_t link_layer_option;
+}) icmp6_neighbor_solicitation_header_t;
+
+/* Router solicitation packet format for ethernet. */
+typedef CLIB_PACKED (struct {
+ ip6_header_t ip;
+ icmp6_neighbor_discovery_header_t neighbor;
+ icmp6_neighbor_discovery_ethernet_link_layer_address_option_t link_layer_option;
+}) icmp6_router_solicitation_header_t;
+
+/* Router advertisement packet format for ethernet. */
+typedef CLIB_PACKED (struct {
+ ip6_header_t ip;
+ icmp6_router_advertisement_header_t router;
+ icmp6_neighbor_discovery_ethernet_link_layer_address_option_t link_layer_option;
+ icmp6_neighbor_discovery_mtu_option_t mtu_option;
+ icmp6_neighbor_discovery_prefix_information_option_t prefix[0];
+}) icmp6_router_advertisement_packet_t;
+
+/* Multicast listener report packet format for ethernet. */
+typedef CLIB_PACKED (struct {
+ ip6_header_t ip;
+ icmp6_multicast_listener_report_header_t report_hdr;
+}) icmp6_multicast_listener_report_packet_t;
+
+#endif /* included_vnet_icmp46_packet_h */
diff --git a/vnet/vnet/ip/icmp6.c b/vnet/vnet/ip/icmp6.c
new file mode 100644
index 00000000000..2d265d2b5b2
--- /dev/null
+++ b/vnet/vnet/ip/icmp6.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/icmp6.c: ip6 icmp
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+
+static u8 * format_ip6_icmp_type_and_code (u8 * s, va_list * args)
+{
+ icmp6_type_t type = va_arg (*args, int);
+ u8 code = va_arg (*args, int);
+ char * t = 0;
+
+#define _(n,f) case n: t = #f; break;
+
+ switch (type)
+ {
+ foreach_icmp6_type;
+
+ default:
+ break;
+ }
+
+#undef _
+
+ if (! t)
+ return format (s, "unknown 0x%x", type);
+
+ s = format (s, "%s", t);
+
+ t = 0;
+ switch ((type << 8) | code)
+ {
+#define _(a,n,f) case (ICMP6_##a << 8) | (n): t = #f; break;
+
+ foreach_icmp6_code;
+
+#undef _
+ }
+
+ if (t)
+ s = format (s, " %s", t);
+
+ return s;
+}
+
+static u8 * format_icmp6_header (u8 * s, va_list * args)
+{
+ icmp46_header_t * icmp = va_arg (*args, icmp46_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+
+  /* Not enough data to decode the header. */
+ if (max_header_bytes < sizeof (icmp[0]))
+ return format (s, "ICMP header truncated");
+
+ s = format (s, "ICMP %U checksum 0x%x",
+ format_ip6_icmp_type_and_code, icmp->type, icmp->code,
+ clib_net_to_host_u16 (icmp->checksum));
+
+ if (max_header_bytes >=
+ sizeof(icmp6_neighbor_solicitation_or_advertisement_header_t) &&
+ (icmp->type == ICMP6_neighbor_solicitation ||
+ icmp->type == ICMP6_neighbor_advertisement))
+ {
+ icmp6_neighbor_solicitation_or_advertisement_header_t *icmp6_nd =
+ (icmp6_neighbor_solicitation_or_advertisement_header_t *) icmp;
+ s = format (s, "\n target address %U",
+ format_ip6_address, &icmp6_nd->target_address);
+ }
+
+ return s;
+}
+
+u8 * format_icmp6_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ icmp6_input_trace_t * t = va_arg (*va, icmp6_input_trace_t *);
+
+ s = format (s, "%U",
+ format_ip6_header,
+ t->packet_data, sizeof (t->packet_data));
+
+ return s;
+}
+
+static char * icmp_error_strings[] = {
+#define _(f,s) s,
+ foreach_icmp6_error
+#undef _
+};
+
+typedef enum {
+ ICMP_INPUT_NEXT_DROP,
+ ICMP_INPUT_N_NEXT,
+} icmp_input_next_t;
+
+typedef struct {
+ uword * type_and_code_by_name;
+
+ uword * type_by_name;
+
+ /* Vector dispatch table indexed by [icmp type]. */
+ u8 input_next_index_by_type[256];
+
+ /* Max valid code indexed by icmp type. */
+ u8 max_valid_code_by_type[256];
+
+ /* hop_limit must be >= this value for this icmp type. */
+ u8 min_valid_hop_limit_by_type[256];
+
+ u8 min_valid_length_by_type[256];
+} icmp6_main_t;
+
+icmp6_main_t icmp6_main;
+
+static uword
+ip6_icmp_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ icmp6_main_t * im = &icmp6_main;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next_index;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp6_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * b0;
+ ip6_header_t * ip0;
+ icmp46_header_t * icmp0;
+ icmp6_type_t type0;
+ u32 bi0, next0, error0, len0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (b0);
+ icmp0 = ip6_next_header (ip0);
+ type0 = icmp0->type;
+
+ error0 = ICMP6_ERROR_NONE;
+
+ next0 = im->input_next_index_by_type[type0];
+ error0 = next0 == ICMP_INPUT_NEXT_DROP ? ICMP6_ERROR_UNKNOWN_TYPE : error0;
+
+ /* Check code is valid for type. */
+ error0 = icmp0->code > im->max_valid_code_by_type[type0] ? ICMP6_ERROR_INVALID_CODE_FOR_TYPE : error0;
+
+ /* Checksum is already validated by ip6_local node so we don't need to check that. */
+
+ /* Check that hop limit == 255 for certain types. */
+ error0 = ip0->hop_limit < im->min_valid_hop_limit_by_type[type0] ? ICMP6_ERROR_INVALID_HOP_LIMIT_FOR_TYPE : error0;
+
+ len0 = clib_net_to_host_u16 (ip0->payload_length);
+ error0 = len0 < im->min_valid_length_by_type[type0] ? ICMP6_ERROR_LENGTH_TOO_SMALL_FOR_TYPE : error0;
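+
+          /* Editorial note: the checks above are written branch-free;
+           * each ternary overwrites error0, so the last failing check
+           * wins and the single test below steers the packet to drop. */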
+
+ b0->error = node->errors[error0];
+
+ next0 = error0 != ICMP6_ERROR_NONE ? ICMP_INPUT_NEXT_DROP : next0;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_icmp_input_node) = {
+ .function = ip6_icmp_input,
+ .name = "ip6-icmp-input",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp6_input_trace,
+
+ .n_errors = ARRAY_LEN (icmp_error_strings),
+ .error_strings = icmp_error_strings,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [ICMP_INPUT_NEXT_DROP] = "error-drop",
+ },
+};
+
+typedef enum {
+ ICMP6_ECHO_REQUEST_NEXT_LOOKUP,
+ ICMP6_ECHO_REQUEST_NEXT_OUTPUT,
+ ICMP6_ECHO_REQUEST_N_NEXT,
+} icmp6_echo_request_next_t;
+
+static uword
+ip6_icmp_echo_request (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next_index;
+ ip6_main_t * im = &ip6_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp6_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 2 && n_left_to_next > 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip6_header_t * ip0, * ip1;
+ icmp46_header_t * icmp0, * icmp1;
+ ip6_address_t tmp0, tmp1;
+ ip_csum_t sum0, sum1;
+ u32 bi0, bi1;
+ u32 fib_index0, fib_index1;
+ u32 next0 = ICMP6_ECHO_REQUEST_NEXT_LOOKUP;
+ u32 next1 = ICMP6_ECHO_REQUEST_NEXT_LOOKUP;
+
+ bi0 = to_next[0] = from[0];
+ bi1 = to_next[1] = from[1];
+
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ p1 = vlib_get_buffer (vm, bi1);
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+ icmp0 = ip6_next_header (ip0);
+ icmp1 = ip6_next_header (ip1);
+
+          /* Change ICMP type to echo reply and update the ICMP checksum. */
+ sum0 = icmp0->checksum;
+ sum1 = icmp1->checksum;
+
+ ASSERT (icmp0->type == ICMP6_echo_request);
+ ASSERT (icmp1->type == ICMP6_echo_request);
+ sum0 = ip_csum_update (sum0, ICMP6_echo_request, ICMP6_echo_reply,
+ icmp46_header_t, type);
+ sum1 = ip_csum_update (sum1, ICMP6_echo_request, ICMP6_echo_reply,
+ icmp46_header_t, type);
+
+ icmp0->checksum = ip_csum_fold (sum0);
+ icmp1->checksum = ip_csum_fold (sum1);
+
+ icmp0->type = ICMP6_echo_reply;
+ icmp1->type = ICMP6_echo_reply;
+
+ /* Swap source and destination address. */
+ tmp0 = ip0->src_address;
+ tmp1 = ip1->src_address;
+
+ ip0->src_address = ip0->dst_address;
+ ip1->src_address = ip1->dst_address;
+
+ ip0->dst_address = tmp0;
+ ip1->dst_address = tmp1;
+
+ /* New hop count. */
+ ip0->hop_limit = im->host_config.ttl;
+ ip1->hop_limit = im->host_config.ttl;
+
+ if (ip6_address_is_link_local_unicast (&ip0->dst_address))
+ {
+ ethernet_header_t *eth0;
+ u8 tmp_mac[6];
+              /* For link local, reuse the current MAC header by swapping
+               * SMAC and DMAC instead of doing an IP6 lookup, since link
+               * local addresses are not in the IP6 FIB */
+ vlib_buffer_reset (p0);
+ eth0 = vlib_buffer_get_current (p0);
+ memcpy (tmp_mac, eth0->dst_address, 6);
+ memcpy (eth0->dst_address, eth0->src_address, 6);
+ memcpy (eth0->src_address, tmp_mac, 6);
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] =
+ vnet_buffer (p0)->sw_if_index[VLIB_RX];
+ next0 = ICMP6_ECHO_REQUEST_NEXT_OUTPUT;
+ }
+ else
+ {
+ /* Determine the correct lookup fib indices... */
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = fib_index0;
+ }
+
+ if (ip6_address_is_link_local_unicast (&ip1->dst_address))
+ {
+ ethernet_header_t *eth1;
+ u8 tmp_mac[6];
+              /* For link local, reuse the current MAC header by swapping
+               * SMAC and DMAC instead of doing an IP6 lookup, since link
+               * local addresses are not in the IP6 FIB */
+ vlib_buffer_reset (p1);
+ eth1 = vlib_buffer_get_current (p1);
+ memcpy (tmp_mac, eth1->dst_address, 6);
+ memcpy (eth1->dst_address, eth1->src_address, 6);
+ memcpy (eth1->src_address, tmp_mac, 6);
+ vnet_buffer(p1)->sw_if_index[VLIB_TX] =
+ vnet_buffer (p1)->sw_if_index[VLIB_RX];
+ next1 = ICMP6_ECHO_REQUEST_NEXT_OUTPUT;
+ }
+ else
+ {
+ /* Determine the correct lookup fib indices... */
+ fib_index1 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer (p1)->sw_if_index[VLIB_RX]);
+ vnet_buffer (p1)->sw_if_index[VLIB_TX] = fib_index1;
+ }
+
+ vnet_buffer (p0)->sw_if_index[VLIB_RX]
+ = vnet_main.local_interface_sw_if_index;
+ vnet_buffer (p1)->sw_if_index[VLIB_RX]
+ = vnet_main.local_interface_sw_if_index;
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ icmp46_header_t * icmp0;
+ u32 bi0;
+ ip6_address_t tmp0;
+ ip_csum_t sum0;
+ u32 fib_index0;
+ u32 next0 = ICMP6_ECHO_REQUEST_NEXT_LOOKUP;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (p0);
+ icmp0 = ip6_next_header (ip0);
+
+          /* Change ICMP type to echo reply and update the ICMP checksum. */
+ sum0 = icmp0->checksum;
+
+ ASSERT (icmp0->type == ICMP6_echo_request);
+ sum0 = ip_csum_update (sum0, ICMP6_echo_request, ICMP6_echo_reply,
+ icmp46_header_t, type);
+
+ icmp0->checksum = ip_csum_fold (sum0);
+
+ icmp0->type = ICMP6_echo_reply;
+
+ /* Swap source and destination address. */
+ tmp0 = ip0->src_address;
+ ip0->src_address = ip0->dst_address;
+ ip0->dst_address = tmp0;
+
+ ip0->hop_limit = im->host_config.ttl;
+
+ if (ip6_address_is_link_local_unicast (&ip0->dst_address))
+ {
+ ethernet_header_t *eth0;
+ u8 tmp_mac[6];
+              /* For link local, reuse the current MAC header by swapping
+               * SMAC and DMAC instead of doing an IP6 lookup, since link
+               * local addresses are not in the IP6 FIB */
+ vlib_buffer_reset (p0);
+ eth0 = vlib_buffer_get_current (p0);
+ memcpy (tmp_mac, eth0->dst_address, 6);
+ memcpy (eth0->dst_address, eth0->src_address, 6);
+ memcpy (eth0->src_address, tmp_mac, 6);
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] =
+ vnet_buffer (p0)->sw_if_index[VLIB_RX];
+ next0 = ICMP6_ECHO_REQUEST_NEXT_OUTPUT;
+ }
+ else
+ {
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = fib_index0;
+ }
+ vnet_buffer (p0)->sw_if_index[VLIB_RX]
+ = vnet_main.local_interface_sw_if_index;
+
+ /* Verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_error_count (vm, ip6_icmp_input_node.index,
+ ICMP6_ERROR_ECHO_REPLIES_SENT,
+ frame->n_vectors);
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_icmp_echo_request_node,static) = {
+ .function = ip6_icmp_echo_request,
+ .name = "ip6-icmp-echo-request",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp6_input_trace,
+
+ .n_next_nodes = ICMP6_ECHO_REQUEST_N_NEXT,
+ .next_nodes = {
+ [ICMP6_ECHO_REQUEST_NEXT_LOOKUP] = "ip6-lookup",
+ [ICMP6_ECHO_REQUEST_NEXT_OUTPUT] = "interface-output",
+ },
+};
+
+typedef enum {
+ ICMP6_TTL_EXPIRE_NEXT_DROP,
+ ICMP6_TTL_EXPIRE_NEXT_LOOKUP,
+ ICMP6_TTL_EXPIRE_N_NEXT,
+} icmp_ttl_expire_next_t;
+
+static uword
+ip6_icmp_ttl_expire (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from, * to_next;
+ uword n_left_from, n_left_to_next;
+ icmp_ttl_expire_next_t next_index;
+ ip6_main_t *im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1, sizeof (icmp6_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 pi0 = from[0];
+ u32 next0 = ICMP6_TTL_EXPIRE_NEXT_LOOKUP;
+ u8 error0 = ICMP6_ERROR_TTL_EXPIRE_RESP_SENT;
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0, * out_ip0;
+ icmp46_header_t * icmp0;
+ u32 sw_if_index0, if_add_index0;
+ int bogus_length;
+
+ /* Speculatively enqueue p0 to the current next frame */
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip0 = vlib_buffer_get_current(p0);
+ sw_if_index0 = vnet_buffer(p0)->sw_if_index[VLIB_RX];
+
+          /* RFC 2463 says to keep as much of the original packet as
+           * possible while staying within the minimum IPv6 MTU. We cheat
+           * "a little" here by keeping whatever fits in the first
+           * buffer, which is more efficient. */
+ if (PREDICT_FALSE(p0->total_length_not_including_first_buffer))
+ { /* clear current_length of all other buffers in chain */
+ vlib_buffer_t *b = p0;
+ p0->total_length_not_including_first_buffer = 0;
+ while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ b = vlib_get_buffer (vm, b->next_buffer);
+ b->current_length = 0;
+ }
+ }
+
+          /* Add IP header and ICMPv6 header including a 4 byte unused field */
+ vlib_buffer_advance(p0,
+ -sizeof(ip6_header_t)-sizeof(icmp46_header_t)-4);
+ out_ip0 = vlib_buffer_get_current(p0);
+ icmp0 = (icmp46_header_t *) &out_ip0[1];
+
+ /* Fill ip header fields */
+ out_ip0->ip_version_traffic_class_and_flow_label =
+ clib_host_to_net_u32(0x6<<28);
+ out_ip0->payload_length =
+ clib_host_to_net_u16(p0->current_length - sizeof(ip6_header_t));
+ out_ip0->protocol = IP_PROTOCOL_ICMP6;
+ out_ip0->hop_limit = 0xff;
+ out_ip0->dst_address = ip0->src_address;
+ if_add_index0 =
+ lm->if_address_pool_index_by_sw_if_index[sw_if_index0];
+ if (PREDICT_TRUE(if_add_index0 != ~0))
+ {
+ ip_interface_address_t *if_add =
+ pool_elt_at_index(lm->if_address_pool, if_add_index0);
+ ip6_address_t *if_ip =
+ ip_interface_address_get_address(lm, if_add);
+ out_ip0->src_address = *if_ip;
+ vlib_error_count (vm, node->node_index, error0, 1);
+ }
+ else /* interface has no IP6 address - should not happen */
+ {
+ next0 = ICMP6_TTL_EXPIRE_NEXT_DROP;
+ error0 = ICMP6_ERROR_TTL_EXPIRE_RESP_DROP;
+ }
+
+ /* Fill icmp header fields */
+ icmp0->type = ICMP6_time_exceeded;
+ icmp0->code = ICMP6_time_exceeded_ttl_exceeded_in_transit;
+ icmp0->checksum = 0;
+ icmp0->checksum = ip6_tcp_udp_icmp_compute_checksum(
+ vm, p0, out_ip0, &bogus_length);
+
+ /* Update error status */
+ p0->error = node->errors[error0];
+
+ /* Verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_icmp_ttl_expire_node) = {
+ .function = ip6_icmp_ttl_expire,
+ .name = "ip6-icmp-ttl-expire",
+ .vector_size = sizeof (u32),
+
+ .n_errors = ARRAY_LEN (icmp_error_strings),
+ .error_strings = icmp_error_strings,
+
+ .n_next_nodes = ICMP6_TTL_EXPIRE_N_NEXT,
+ .next_nodes = {
+ [ICMP6_TTL_EXPIRE_NEXT_DROP] = "error-drop",
+ [ICMP6_TTL_EXPIRE_NEXT_LOOKUP] = "ip6-lookup",
+ },
+
+ .format_trace = format_icmp6_input_trace,
+};
+
+
+static uword unformat_icmp_type_and_code (unformat_input_t * input, va_list * args)
+{
+ icmp46_header_t * h = va_arg (*args, icmp46_header_t *);
+ icmp6_main_t * cm = &icmp6_main;
+ u32 i;
+
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ cm->type_and_code_by_name, &i))
+ {
+ h->type = (i >> 8) & 0xff;
+ h->code = (i >> 0) & 0xff;
+ }
+ else if (unformat_user (input, unformat_vlib_number_by_name,
+ cm->type_by_name, &i))
+ {
+ h->type = i;
+ h->code = 0;
+ }
+ else
+ return 0;
+
+ return 1;
+}
+
+static void
+icmp6_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 ip_offset, icmp_offset;
+ int bogus_length;
+
+ icmp_offset = g->start_byte_offset;
+ ip_offset = (g-1)->start_byte_offset;
+
+ while (n_packets >= 1)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ icmp46_header_t * icmp0;
+
+ p0 = vlib_get_buffer (vm, packets[0]);
+ n_packets -= 1;
+ packets += 1;
+
+ ASSERT (p0->current_data == 0);
+ ip0 = (void *) (p0->data + ip_offset);
+ icmp0 = (void *) (p0->data + icmp_offset);
+
+ icmp0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0,
+ &bogus_length);
+ ASSERT (bogus_length == 0);
+ }
+}
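+
+/* Editorial note: unlike the ICMPv4 edit function, this recomputes the
+   checksum in full via ip6_tcp_udp_icmp_compute_checksum, which also
+   covers the IPv6 pseudo-header required by RFC 2463; bogus_length
+   flags a payload/buffer length mismatch. */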
+
+typedef struct {
+ pg_edit_t type, code;
+ pg_edit_t checksum;
+} pg_icmp46_header_t;
+
+always_inline void
+pg_icmp_header_init (pg_icmp46_header_t * p)
+{
+  /* Initialize fields that are not bit fields in the ICMP header. */
+#define _(f) pg_edit_init (&p->f, icmp46_header_t, f);
+ _ (type);
+ _ (code);
+ _ (checksum);
+#undef _
+}
+
+static uword
+unformat_pg_icmp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_icmp46_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (icmp46_header_t),
+ &group_index);
+ pg_icmp_header_init (p);
+
+ p->checksum.type = PG_EDIT_UNSPECIFIED;
+
+ {
+ icmp46_header_t tmp;
+
+ if (! unformat (input, "ICMP %U", unformat_icmp_type_and_code, &tmp))
+ goto error;
+
+ pg_edit_set_fixed (&p->type, tmp.type);
+ pg_edit_set_fixed (&p->code, tmp.code);
+ }
+
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "checksum %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->checksum))
+ ;
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ if (! unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ g->edit_function = icmp6_pg_edit_function;
+ g->edit_function_opaque = 0;
+ }
+
+ return 1;
+
+ error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
+
+void icmp6_register_type (vlib_main_t * vm, icmp6_type_t type, u32 node_index)
+{
+ icmp6_main_t * im = &icmp6_main;
+
+ ASSERT (type < ARRAY_LEN (im->input_next_index_by_type));
+ im->input_next_index_by_type[type]
+ = vlib_node_add_next (vm, ip6_icmp_input_node.index, node_index);
+}
+
+static clib_error_t *
+icmp6_init (vlib_main_t * vm)
+{
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+ icmp6_main_t * cm = &icmp6_main;
+ clib_error_t * error;
+
+ error = vlib_call_init_function (vm, ip_main_init);
+
+ if (error)
+ return error;
+
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_ICMP6);
+ pi->format_header = format_icmp6_header;
+ pi->unformat_pg_edit = unformat_pg_icmp_header;
+
+ cm->type_by_name = hash_create_string (0, sizeof (uword));
+#define _(n,t) hash_set_mem (cm->type_by_name, #t, (n));
+ foreach_icmp6_type;
+#undef _
+
+ cm->type_and_code_by_name = hash_create_string (0, sizeof (uword));
+#define _(a,n,t) hash_set_mem (cm->type_and_code_by_name, #t, (n) | (ICMP6_##a << 8));
+ foreach_icmp6_code;
+#undef _
+
+ memset (cm->input_next_index_by_type,
+ ICMP_INPUT_NEXT_DROP,
+ sizeof (cm->input_next_index_by_type));
+ memset (cm->max_valid_code_by_type, 0, sizeof (cm->max_valid_code_by_type));
+
+#define _(a,n,t) cm->max_valid_code_by_type[ICMP6_##a] = clib_max (cm->max_valid_code_by_type[ICMP6_##a], n);
+ foreach_icmp6_code;
+#undef _
+
+ memset (cm->min_valid_hop_limit_by_type, 0, sizeof (cm->min_valid_hop_limit_by_type));
+ cm->min_valid_hop_limit_by_type[ICMP6_router_solicitation] = 255;
+ cm->min_valid_hop_limit_by_type[ICMP6_router_advertisement] = 255;
+ cm->min_valid_hop_limit_by_type[ICMP6_neighbor_solicitation] = 255;
+ cm->min_valid_hop_limit_by_type[ICMP6_neighbor_advertisement] = 255;
+ cm->min_valid_hop_limit_by_type[ICMP6_redirect] = 255;
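+
+  /* Per RFC 4861, neighbor discovery packets must arrive with hop
+   * limit 255 (proof they were not forwarded); ip6_icmp_input drops
+   * anything lower via the table filled in above. */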
+
+ memset (cm->min_valid_length_by_type, sizeof (icmp46_header_t), sizeof (cm->min_valid_length_by_type));
+ cm->min_valid_length_by_type[ICMP6_router_solicitation] = sizeof (icmp6_neighbor_discovery_header_t);
+ cm->min_valid_length_by_type[ICMP6_router_advertisement] = sizeof (icmp6_router_advertisement_header_t);
+ cm->min_valid_length_by_type[ICMP6_neighbor_solicitation]
+ = sizeof (icmp6_neighbor_solicitation_or_advertisement_header_t);
+ cm->min_valid_length_by_type[ICMP6_neighbor_advertisement]
+ = sizeof (icmp6_neighbor_solicitation_or_advertisement_header_t);
+ cm->min_valid_length_by_type[ICMP6_redirect] = sizeof (icmp6_redirect_header_t);
+
+ icmp6_register_type (vm, ICMP6_echo_request, ip6_icmp_echo_request_node.index);
+
+ return vlib_call_init_function (vm, ip6_neighbor_init);
+}
+
+VLIB_INIT_FUNCTION (icmp6_init);
diff --git a/vnet/vnet/ip/icmp6.h b/vnet/vnet/ip/icmp6.h
new file mode 100644
index 00000000000..92f6913a454
--- /dev/null
+++ b/vnet/vnet/ip/icmp6.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_icmp6_h
+#define included_vnet_icmp6_h
+
+#define foreach_icmp6_error \
+ _ (NONE, "valid packets") \
+ _ (UNKNOWN_TYPE, "unknown type") \
+ _ (INVALID_CODE_FOR_TYPE, "invalid code for type") \
+ _ (INVALID_HOP_LIMIT_FOR_TYPE, "hop_limit != 255") \
+ _ (LENGTH_TOO_SMALL_FOR_TYPE, "payload length too small for type") \
+ _ (OPTIONS_WITH_ODD_LENGTH, \
+ "total option length not multiple of 8 bytes") \
+ _ (OPTION_WITH_ZERO_LENGTH, "option has zero length") \
+ _ (ECHO_REPLIES_SENT, "echo replies sent") \
+ _ (NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK, \
+ "neighbor solicitations from source not on link") \
+ _ (NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN, \
+ "neighbor solicitations for unknown targets") \
+ _ (NEIGHBOR_ADVERTISEMENTS_TX, "neighbor advertisements sent") \
+ _ (NEIGHBOR_ADVERTISEMENTS_RX, "neighbor advertisements received") \
+ _ (ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK, \
+ "router solicitations from source not on link") \
+ _ (ROUTER_SOLICITATION_UNSUPPORTED_INTF, \
+ "neighbor discovery unsupported interface") \
+ _ (ROUTER_SOLICITATION_RADV_NOT_CONFIG, \
+ "neighbor discovery not configured") \
+ _ (ROUTER_SOLICITATION_DEST_UNKNOWN, \
+ "router solicitations for unknown destination") \
+ _ (ROUTER_SOLICITATION_SOURCE_UNKNOWN, \
+ "router solicitations for unknown source") \
+ _ (ROUTER_ADVERTISEMENT_SOURCE_NOT_LINK_LOCAL, \
+ "router advertisement source not link local") \
+ _ (ROUTER_ADVERTISEMENTS_TX, "router advertisements sent") \
+ _ (ROUTER_ADVERTISEMENTS_RX, "router advertisements received") \
+ _ (DST_LOOKUP_MISS, "icmp6 dst address lookup misses") \
+ _ (TTL_EXPIRE_RESP_SENT, "TTL time exceeded response sent") \
+ _ (TTL_EXPIRE_RESP_DROP, "TTL time exceeded response dropped")
+
+
+typedef enum {
+#define _(f,s) ICMP6_ERROR_##f,
+ foreach_icmp6_error
+#undef _
+} icmp6_error_t;
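+/* The foreach_/#define _ idiom above expands the list once per use;
+   here it yields ICMP6_ERROR_NONE, ICMP6_ERROR_UNKNOWN_TYPE, ... so
+   the error enum and its counter strings stay in sync. */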
+
+typedef struct {
+ u8 packet_data[64];
+} icmp6_input_trace_t;
+
+format_function_t format_icmp6_input_trace;
+void icmp6_register_type (vlib_main_t * vm, icmp6_type_t type, u32 node_index);
+
+extern vlib_node_registration_t ip6_icmp_input_node;
+
+#endif /* included_vnet_icmp6_h */
+
+
diff --git a/vnet/vnet/ip/igmp_packet.h b/vnet/vnet/ip/igmp_packet.h
new file mode 100644
index 00000000000..00b1e0deeb7
--- /dev/null
+++ b/vnet/vnet/ip/igmp_packet.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * igmp_packet.h: igmp packet format
+ *
+ * Copyright (c) 2011 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_igmp_packet_h
+#define included_vnet_igmp_packet_h
+
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+
+#define foreach_igmp_type \
+ _ (0x11, membership_query) \
+ _ (0x12, membership_report_v1) \
+ _ (0x13, dvmrp) \
+ _ (0x14, pim_v1) \
+ _ (0x15, cisco_trace) \
+ _ (0x16, membership_report_v2) \
+ _ (0x17, leave_group_v2) \
+ _ (0x1e, traceroute_response) \
+ _ (0x1f, traceroute_request) \
+ _ (0x22, membership_report_v3) \
+ _ (0x30, router_advertisement) \
+ _ (0x31, router_solicitation) \
+ _ (0x32, router_termination)
+
+typedef enum {
+#define _(n,f) IGMP_TYPE_##f = n,
+ foreach_igmp_type
+#undef _
+} igmp_type_t;
+
+typedef struct {
+ igmp_type_t type : 8;
+
+ u8 code;
+
+ u16 checksum;
+} igmp_header_t;
+
+typedef struct {
+ /* membership_query, version <= 2 reports. */
+ igmp_header_t header;
+
+ /* Multicast destination address. */
+ ip4_address_t dst;
+} igmp_message_t;
+
+#define foreach_igmp_membership_group_v3_type \
+ _ (1, mode_is_filter_include) \
+ _ (2, mode_is_filter_exclude) \
+ _ (3, change_to_filter_include) \
+ _ (4, change_to_filter_exclude) \
+ _ (5, allow_new_sources) \
+ _ (6, block_old_sources)
+
+typedef enum {
+#define _(n,f) IGMP_MEMBERSHIP_GROUP_##f = n,
+ foreach_igmp_membership_group_v3_type
+#undef _
+} igmp_membership_group_v3_type_t;
+
+typedef struct {
+ igmp_membership_group_v3_type_t type : 8;
+
+ /* Number of 32 bit words of aux data after source addresses. */
+ u8 n_aux_u32s;
+
+ /* Number of source addresses that follow. */
+ u16 n_src_addresses;
+
+ /* Destination multicast address. */
+ ip4_address_t dst_address;
+
+ ip4_address_t src_addresses[0];
+} igmp_membership_group_v3_t;
+
+always_inline igmp_membership_group_v3_t *
+igmp_membership_group_v3_next (igmp_membership_group_v3_t * g)
+{
+ return ((void *) g
+ + g->n_src_addresses * sizeof (g->src_addresses[0])
+ + g->n_aux_u32s * sizeof (u32));
+}
+
+typedef struct {
+ /* Type 0x22. */
+ igmp_header_t header;
+
+ u16 unused;
+
+ /* Number of groups which follow. */
+ u16 n_groups;
+
+ igmp_membership_group_v3_t groups[0];
+} igmp_membership_report_v3_t;
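+/* A minimal sketch (hypothetical caller) of walking the groups in a
+   v3 report. Counts are big endian on the wire; note that
+   igmp_membership_group_v3_next reads n_src_addresses directly, so it
+   assumes the count has already been byte-swapped to host order:
+
+     /* given a pointer r to an igmp_membership_report_v3_t */
+     igmp_membership_group_v3_t * g = r->groups;
+     u32 i, n_groups = clib_net_to_host_u16 (r->n_groups);
+     for (i = 0; i < n_groups; i++)
+       g = igmp_membership_group_v3_next (g);
+*/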
+
+/* The IP6 flavor of IGMP is called MLD; it is embedded in ICMP6. */
+typedef struct {
+  /* Preceded by the ICMP v6 header. */
+ u16 max_response_delay_in_milliseconds;
+ u16 reserved;
+ ip6_address_t dst;
+} mld_header_t;
+
+#endif /* included_vnet_igmp_packet_h */
diff --git a/vnet/vnet/ip/ip.h b/vnet/vnet/ip/ip.h
new file mode 100644
index 00000000000..e47512a960d
--- /dev/null
+++ b/vnet/vnet/ip/ip.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip.h: ip generic (4 or 6) main
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_main_h
+#define included_ip_main_h
+
+#include <vppinfra/hash.h>
+#include <vppinfra/heap.h> /* adjacency heap */
+
+#include <vnet/vnet.h>
+
+#include <vnet/ip/format.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/lookup.h>
+
+#include <vnet/ip/tcp_packet.h>
+#include <vnet/ip/udp_packet.h>
+#include <vnet/ip/icmp46_packet.h>
+
+#include <vnet/ip/ip4.h>
+#include <vnet/ip/ip4_error.h>
+#include <vnet/ip/ip4_packet.h>
+
+#include <vnet/ip/ip6.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/ip/ip6_error.h>
+#include <vnet/ip/icmp6.h>
+
+#include <vnet/ip/tcp.h>
+
+#if DPDK > 0
+#include <vnet/devices/dpdk/dpdk.h>
+#endif
+
+#include <vnet/classify/vnet_classify.h>
+
+typedef union {
+ ip4_address_t ip4;
+ ip6_address_t ip6;
+} ip46_address_t;
+
+/* Per protocol info. */
+typedef struct {
+ /* Protocol name (also used as hash key). */
+ u8 * name;
+
+ /* Protocol number. */
+ ip_protocol_t protocol;
+
+ /* Format function for this IP protocol. */
+ format_function_t * format_header;
+
+ /* Parser for header. */
+ unformat_function_t * unformat_header;
+
+ /* Parser for per-protocol matches. */
+ unformat_function_t * unformat_match;
+
+ /* Parser for packet generator edits for this protocol. */
+ unformat_function_t * unformat_pg_edit;
+} ip_protocol_info_t;
+
+/* Per TCP/UDP port info. */
+typedef struct {
+ /* Port name (used as hash key). */
+ u8 * name;
+
+ /* UDP/TCP port number in network byte order. */
+ u16 port;
+
+ /* Port specific format function. */
+ format_function_t * format_header;
+
+ /* Parser for packet generator edits for this protocol. */
+ unformat_function_t * unformat_pg_edit;
+} tcp_udp_port_info_t;
+
+typedef struct {
+ /* Per IP protocol info. */
+ ip_protocol_info_t * protocol_infos;
+
+ /* Protocol info index hashed by 8 bit IP protocol. */
+ uword * protocol_info_by_protocol;
+
+ /* Hash table mapping IP protocol name (see protocols.def)
+ to protocol number. */
+ uword * protocol_info_by_name;
+
+ /* Per TCP/UDP port info. */
+ tcp_udp_port_info_t * port_infos;
+
+ /* Hash table from network-byte-order port to port info index. */
+ uword * port_info_by_port;
+
+ /* Hash table mapping TCP/UDP name to port info index. */
+ uword * port_info_by_name;
+} ip_main_t;
+
+extern ip_main_t ip_main;
+
+clib_error_t *
+ip_main_init (vlib_main_t * vm);
+
+static inline ip_protocol_info_t *
+ip_get_protocol_info (ip_main_t * im, u32 protocol)
+{
+ uword * p;
+
+ p = hash_get (im->protocol_info_by_protocol, protocol);
+ return p ? vec_elt_at_index (im->protocol_infos, p[0]) : 0;
+}
+
+static inline tcp_udp_port_info_t *
+ip_get_tcp_udp_port_info (ip_main_t * im, u32 port)
+{
+ uword * p;
+
+ p = hash_get (im->port_info_by_port, port);
+ return p ? vec_elt_at_index (im->port_infos, p[0]) : 0;
+}
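+
+/* A minimal usage sketch, assuming ip_main_init has already run
+   (s, header and n_bytes are hypothetical):
+
+     ip_protocol_info_t * pi
+       = ip_get_protocol_info (&ip_main, IP_PROTOCOL_TCP);
+     if (pi && pi->format_header)
+       s = format (s, "%U", pi->format_header, header, n_bytes);
+*/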
+
+always_inline ip_csum_t
+ip_incremental_checksum_buffer (vlib_main_t * vm, vlib_buffer_t * first_buffer,
+ u32 first_buffer_offset,
+ u32 n_bytes_to_checksum,
+ ip_csum_t sum)
+#if DPDK > 0
+{
+ u32 n_bytes_left = n_bytes_to_checksum;
+ struct rte_mbuf * mb = ((struct rte_mbuf *)first_buffer)-1;
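+  /* (In DPDK builds the rte_mbuf header immediately precedes the
+     vlib_buffer_t, hence the pointer arithmetic above.) */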
+ u8 nb_segs = mb->nb_segs;
+ ASSERT(mb->data_len >= first_buffer_offset);
+ void * h;
+ u32 n;
+
+ n = clib_min (n_bytes_left, mb->data_len);
+ h = vlib_buffer_get_current (first_buffer) + first_buffer_offset;
+ while (n_bytes_left)
+ {
+ sum = ip_incremental_checksum (sum, h, n);
+ n_bytes_left -= n;
+ nb_segs--;
+ mb = mb->next;
+ if ((nb_segs == 0) || (mb == 0))
+ break;
+
+ n = clib_min (n_bytes_left, mb->data_len);
+ h = rte_ctrlmbuf_data(mb);
+ }
+
+ ASSERT(n_bytes_left == 0);
+ ASSERT(nb_segs == 0);
+ return sum;
+}
+#else
+{
+ vlib_buffer_t * b = first_buffer;
+ u32 n_bytes_left = n_bytes_to_checksum;
+ ASSERT (b->current_length >= first_buffer_offset);
+ void * h;
+ u32 n;
+
+ n = clib_min (n_bytes_left, b->current_length);
+ h = vlib_buffer_get_current (b) + first_buffer_offset;
+ sum = ip_incremental_checksum (sum, h, n);
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ while (1)
+ {
+ n_bytes_left -= n;
+ if (n_bytes_left == 0)
+ break;
+ b = vlib_get_buffer (vm, b->next_buffer);
+ n = clib_min (n_bytes_left, b->current_length);
+ h = vlib_buffer_get_current (b);
+ sum = ip_incremental_checksum (sum, h, n);
+ }
+ }
+
+ return sum;
+}
+#endif /* DPDK */
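+
+/* A minimal usage sketch: checksum a whole (possibly chained) buffer
+   starting at its current data pointer, then fold to 16 bits:
+
+     ip_csum_t sum = ip_incremental_checksum_buffer
+       (vm, b, 0, vlib_buffer_length_in_chain (vm, b), 0);
+     u16 csum = ~ip_csum_fold (sum);
+*/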
+
+void ip_del_all_interface_addresses (vlib_main_t *vm, u32 sw_if_index);
+
+#endif /* included_ip_main_h */
diff --git a/vnet/vnet/ip/ip4.h b/vnet/vnet/ip/ip4.h
new file mode 100644
index 00000000000..6b8fd59a022
--- /dev/null
+++ b/vnet/vnet/ip/ip4.h
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4.h: ip4 main include file
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_ip4_h
+#define included_ip_ip4_h
+
+#include <vnet/ip/ip4_mtrie.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/lookup.h>
+
+typedef struct ip4_fib_t {
+ /* Hash table for each prefix length mapping. */
+ uword * adj_index_by_dst_address[33];
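+  /* (For example, a 10.1.0.0/16 route lives in
+     adj_index_by_dst_address[16], keyed by the masked address.) */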
+
+ /* Temporary vectors for holding new/old values for hash_set. */
+ uword * new_hash_values, * old_hash_values;
+
+ /* Mtrie for fast lookups. Hash is used to maintain overlapping prefixes. */
+ ip4_fib_mtrie_t mtrie;
+
+ /* Table ID (hash key) for this FIB. */
+ u32 table_id;
+
+ /* Index into FIB vector. */
+ u32 index;
+
+ /* flow hash configuration */
+ u32 flow_hash_config;
+
+ /* N-tuple classifier indices */
+ u32 fwd_classify_table_index;
+ u32 rev_classify_table_index;
+
+} ip4_fib_t;
+
+struct ip4_main_t;
+
+typedef void (ip4_add_del_route_function_t)
+ (struct ip4_main_t * im,
+ uword opaque,
+ ip4_fib_t * fib,
+ u32 flags,
+ ip4_address_t * address,
+ u32 address_length,
+ void * old_result,
+ void * new_result);
+
+typedef struct {
+ ip4_add_del_route_function_t * function;
+ uword required_flags;
+ uword function_opaque;
+} ip4_add_del_route_callback_t;
+
+typedef void (ip4_add_del_interface_address_function_t)
+ (struct ip4_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip4_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_del);
+
+typedef struct {
+ ip4_add_del_interface_address_function_t * function;
+ uword function_opaque;
+} ip4_add_del_interface_address_callback_t;
+
+typedef enum {
+ /* First check access list to either permit or deny this
+ packet based on classification. */
+ IP4_RX_FEATURE_CHECK_ACCESS,
+
+ /* RPF check: verify that source address is reachable via
+ RX interface or via any interface. */
+ IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX,
+ IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_ANY,
+
+ /* IPSec */
+ IP4_RX_FEATURE_IPSEC,
+
+  /* vPath forwarding: does not return to call the next feature,
+     so any feature needed before vPath forwarding must precede
+     this entry. */
+ IP4_RX_FEATURE_VPATH,
+
+ /* Must be last: perform forwarding lookup. */
+ IP4_RX_FEATURE_LOOKUP,
+
+ IP4_N_RX_FEATURE,
+} ip4_rx_feature_type_t;
+
+typedef struct ip4_main_t {
+ ip_lookup_main_t lookup_main;
+
+ /* Vector of FIBs. */
+ ip4_fib_t * fibs;
+
+ u32 fib_masks[33];
+
+ /* Table index indexed by software interface. */
+ u32 * fib_index_by_sw_if_index;
+
+ /* Hash table mapping table id to fib index.
+ ID space is not necessarily dense; index space is dense. */
+ uword * fib_index_by_table_id;
+
+ /* Vector of functions to call when routes are added/deleted. */
+ ip4_add_del_route_callback_t * add_del_route_callbacks;
+
+  /* Hash table mapping sw_if_index to the interface route rewrite adjacency index. */
+ uword * interface_route_adj_index_by_sw_if_index;
+
+ /* Functions to call when interface address changes. */
+ ip4_add_del_interface_address_callback_t * add_del_interface_address_callbacks;
+
+ /* Template used to generate IP4 ARP packets. */
+ vlib_packet_template_t ip4_arp_request_packet_template;
+
+ /* Seed for Jenkins hash used to compute ip4 flow hash. */
+ u32 flow_hash_seed;
+
+ struct {
+ /* TTL to use for host generated packets. */
+ u8 ttl;
+
+ /* TOS byte to use for host generated packets. */
+ u8 tos;
+
+ u8 pad[2];
+ } host_config;
+} ip4_main_t;
+
+/* Global ip4 main structure. */
+extern ip4_main_t ip4_main;
+
+/* Global ip4 input node. Errors get attached to ip4 input node. */
+extern vlib_node_registration_t ip4_input_node;
+extern vlib_node_registration_t ip4_lookup_node;
+extern vlib_node_registration_t ip4_rewrite_node;
+extern vlib_node_registration_t ip4_arp_node;
+
+u32 ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index, ip4_address_t * dst,
+ u32 disable_default_route);
+
+always_inline u32
+ip4_fib_lookup_buffer (ip4_main_t * im, u32 fib_index, ip4_address_t * dst,
+ vlib_buffer_t * b)
+{
+ return ip4_fib_lookup_with_table (im, fib_index, dst,
+ /* disable_default_route */ 0);
+}
+
+always_inline u32
+ip4_fib_lookup (ip4_main_t * im, u32 sw_if_index, ip4_address_t * dst)
+{
+ u32 fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
+ return ip4_fib_lookup_with_table (im, fib_index, dst,
+ /* disable_default_route */ 0);
+}
+
+always_inline uword
+ip4_destination_matches_route (ip4_main_t * im,
+ ip4_address_t * key,
+ ip4_address_t * dest,
+ uword dest_length)
+{ return 0 == ((key->data_u32 ^ dest->data_u32) & im->fib_masks[dest_length]); }
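+/* For example, key 10.1.2.3 matches dest 10.1.0.0 with dest_length 16:
+   the addresses differ only in the last 16 bits, which fib_masks[16]
+   clears before the compare. */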
+
+always_inline uword
+ip4_destination_matches_interface (ip4_main_t * im,
+ ip4_address_t * key,
+ ip_interface_address_t * ia)
+{
+ ip4_address_t * a = ip_interface_address_get_address (&im->lookup_main, ia);
+ return ip4_destination_matches_route (im, key, a, ia->address_length);
+}
+
+/* As above, but allows for unaligned destinations (e.g., works directly on the IP header of a packet). */
+always_inline uword
+ip4_unaligned_destination_matches_route (ip4_main_t * im,
+ ip4_address_t * key,
+ ip4_address_t * dest,
+ uword dest_length)
+{ return 0 == ((clib_mem_unaligned (&key->data_u32, u32) ^ dest->data_u32) & im->fib_masks[dest_length]); }
+
+always_inline void
+ip4_src_address_for_packet (ip4_main_t * im, vlib_buffer_t * p, ip4_address_t * src, u32 sw_if_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = ip_interface_address_for_packet (lm, p, sw_if_index);
+ ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+ *src = a[0];
+}
+
+/* Find interface address which matches destination. */
+always_inline ip4_address_t *
+ip4_interface_address_matching_destination (ip4_main_t * im, ip4_address_t * dst, u32 sw_if_index,
+ ip_interface_address_t ** result_ia)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia;
+ ip4_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+ if (ip4_destination_matches_route (im, dst, a, ia->address_length))
+ {
+ result = a;
+ break;
+ }
+ }));
+ if (result_ia)
+ *result_ia = result ? ia : 0;
+ return result;
+}
+
+clib_error_t *
+ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
+ ip4_address_t * address, u32 address_length,
+ u32 is_del);
+
+int ip4_address_compare (ip4_address_t * a1, ip4_address_t * a2);
+
+/* Add/del a route to the FIB. */
+
+#define IP4_ROUTE_FLAG_ADD (0 << 0)
+#define IP4_ROUTE_FLAG_DEL (1 << 0)
+#define IP4_ROUTE_FLAG_TABLE_ID (0 << 1)
+#define IP4_ROUTE_FLAG_FIB_INDEX (1 << 1)
+#define IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY (1 << 2)
+#define IP4_ROUTE_FLAG_NO_REDISTRIBUTE (1 << 3)
+/* Not the last add/del in a group. Facilitates batching requests into packets. */
+#define IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP (1 << 4)
+/* Dynamic route created via ARP reply. */
+#define IP4_ROUTE_FLAG_NEIGHBOR (1 << 5)
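+/* For example, deleting a route looked up by table id combines
+   IP4_ROUTE_FLAG_DEL | IP4_ROUTE_FLAG_TABLE_ID (the latter is the
+   zero default, spelled out for clarity). */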
+
+typedef struct {
+ /* IP4_ROUTE_FLAG_* */
+ u32 flags;
+
+  /* Either the index of a FIB, or a table_id used to look up (and, on
+     add, create) the FIB. IP4_ROUTE_FLAG_FIB_INDEX specifies an index;
+     otherwise a table_id is assumed. */
+ u32 table_index_or_table_id;
+
+ /* Destination address (prefix) and length. */
+ ip4_address_t dst_address;
+ u32 dst_address_length;
+
+ /* Adjacency to use for this destination. */
+ u32 adj_index;
+
+  /* If specified, adjacencies to add and then
+     use for this destination. add_adj/n_add_adj
+     override adj_index when given. */
+ ip_adjacency_t * add_adj;
+ u32 n_add_adj;
+} ip4_add_del_route_args_t;
+
+ip4_fib_t *
+find_ip4_fib_by_table_index_or_id (ip4_main_t * im,
+ u32 table_index_or_id, u32 flags);
+
+void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * args);
+
+void ip4_add_del_route_next_hop (ip4_main_t * im,
+ u32 flags,
+ ip4_address_t * dst_address,
+ u32 dst_address_length,
+ ip4_address_t * next_hop,
+ u32 next_hop_sw_if_index,
+ u32 next_hop_weight, u32 adj_index,
+ u32 explicit_fib_index);
+
+void *
+ip4_get_route (ip4_main_t * im,
+ u32 fib_index_or_table_id,
+ u32 flags,
+ u8 * address,
+ u32 address_length);
+
+void
+ip4_foreach_matching_route (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip4_address_t * address,
+ u32 address_length,
+ ip4_address_t ** results,
+ u8 ** result_lengths);
+
+void ip4_delete_matching_routes (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip4_address_t * address,
+ u32 address_length);
+
+void ip4_maybe_remap_adjacencies (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags);
+
+void ip4_adjacency_set_interface_route (vnet_main_t * vnm,
+ ip_adjacency_t * adj,
+ u32 sw_if_index,
+ u32 if_address_index);
+
+/* Send an ARP request to see if given destination is reachable on given interface. */
+clib_error_t *
+ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index);
+
+clib_error_t *
+ip4_set_arp_limit (u32 arp_limit);
+
+uword
+ip4_tcp_register_listener (vlib_main_t * vm,
+ u16 dst_port,
+ u32 next_node_index);
+uword
+ip4_udp_register_listener (vlib_main_t * vm,
+ u16 dst_port,
+ u32 next_node_index);
+
+void
+ip4_icmp_register_type (vlib_main_t * vm, icmp4_type_t type,
+ u32 node_index);
+
+u16 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, ip4_header_t * ip0);
+
+void ip4_register_protocol (u32 protocol, u32 node_index);
+
+serialize_function_t serialize_vnet_ip4_main, unserialize_vnet_ip4_main;
+
+int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config);
+
+void ip4_mtrie_init (ip4_fib_mtrie_t * m);
+
+int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
+ u32 table_index);
+
+/* Compute flow hash. We'll use it to select which adjacency to use for this
+   flow, among other things. */
+always_inline u32
+ip4_compute_flow_hash (ip4_header_t * ip, u32 flow_hash_config)
+{
+ tcp_header_t * tcp = (void *) (ip + 1);
+ u32 a, b, c, t1, t2;
+ uword is_tcp_udp = (ip->protocol == IP_PROTOCOL_TCP
+ || ip->protocol == IP_PROTOCOL_UDP);
+
+ t1 = (flow_hash_config & IP_FLOW_HASH_SRC_ADDR)
+ ? ip->src_address.data_u32 : 0;
+ t2 = (flow_hash_config & IP_FLOW_HASH_DST_ADDR)
+ ? ip->dst_address.data_u32 : 0;
+
+ a = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t2 : t1;
+ b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2;
+ b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0;
+
+ t1 = is_tcp_udp ? tcp->ports.src : 0;
+ t2 = is_tcp_udp ? tcp->ports.dst : 0;
+
+ t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0;
+ t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0;
+
+ c = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ?
+ (t1<<16) | t2 : (t2<<16) | t1;
+
+ hash_v3_mix32 (a, b, c);
+ hash_v3_finalize32 (a, b, c);
+
+ return c;
+}
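+
+/* A minimal usage sketch: hash on the full 5-tuple (in real callers the
+   flag word comes from the FIB's flow_hash_config):
+
+     u32 h = ip4_compute_flow_hash
+       (ip, IP_FLOW_HASH_SRC_ADDR | IP_FLOW_HASH_DST_ADDR
+        | IP_FLOW_HASH_PROTO | IP_FLOW_HASH_SRC_PORT
+        | IP_FLOW_HASH_DST_PORT);
+*/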
+
+#endif /* included_ip_ip4_h */
diff --git a/vnet/vnet/ip/ip46_cli.c b/vnet/vnet/ip/ip46_cli.c
new file mode 100644
index 00000000000..44dde9bf3e7
--- /dev/null
+++ b/vnet/vnet/ip/ip46_cli.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_cli.c: ip4 commands
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+int ip4_address_compare (ip4_address_t * a1, ip4_address_t * a2)
+{ return clib_net_to_host_u32 (a1->data_u32) - clib_net_to_host_u32 (a2->data_u32); }
+
+int ip6_address_compare (ip6_address_t * a1, ip6_address_t * a2)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (a1->as_u16); i++)
+ {
+ int cmp = clib_net_to_host_u16 (a1->as_u16[i]) - clib_net_to_host_u16 (a2->as_u16[i]);
+ if (cmp != 0)
+ return cmp;
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_command, static) = {
+ .path = "set interface ip",
+ .short_help = "IP4/IP6 commands",
+};
+
+void ip_del_all_interface_addresses (vlib_main_t *vm, u32 sw_if_index)
+{
+ ip4_main_t * im4 = &ip4_main;
+ ip4_address_t * ip4_addrs = 0;
+ u32 *ip4_masks = 0;
+ ip6_main_t * im6 = &ip6_main;
+ ip6_address_t * ip6_addrs = 0;
+ u32 *ip6_masks = 0;
+ ip_interface_address_t * ia;
+ int i;
+
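+  /* Collect addresses first, then delete: deleting while walking
+     would mutate the list being iterated. */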
+ foreach_ip_interface_address (&im4->lookup_main, ia, sw_if_index,
+ 0 /* honor unnumbered */,
+ ({
+ ip4_address_t * x = (ip4_address_t *)
+ ip_interface_address_get_address (&im4->lookup_main, ia);
+ vec_add1 (ip4_addrs, x[0]);
+ vec_add1 (ip4_masks, ia->address_length);
+ }));
+
+ foreach_ip_interface_address (&im6->lookup_main, ia, sw_if_index,
+ 0 /* honor unnumbered */,
+ ({
+ ip6_address_t * x = (ip6_address_t *)
+ ip_interface_address_get_address (&im6->lookup_main, ia);
+ vec_add1 (ip6_addrs, x[0]);
+ vec_add1 (ip6_masks, ia->address_length);
+ }));
+
+ for (i = 0; i < vec_len (ip4_addrs); i++)
+ ip4_add_del_interface_address (vm, sw_if_index, &ip4_addrs[i],
+ ip4_masks[i], 1 /* is_del */);
+ for (i = 0; i < vec_len (ip6_addrs); i++)
+ ip6_add_del_interface_address (vm, sw_if_index, &ip6_addrs[i],
+ ip6_masks[i], 1 /* is_del */);
+
+ vec_free (ip4_addrs);
+ vec_free (ip4_masks);
+ vec_free (ip6_addrs);
+ vec_free (ip6_masks);
+}
+
+static clib_error_t *
+add_del_ip_address (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t a4;
+ ip6_address_t a6;
+ clib_error_t * error = 0;
+ u32 sw_if_index, length, is_del;
+
+ sw_if_index = ~0;
+ is_del = 0;
+
+ if (unformat (input, "del"))
+ is_del = 1;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (is_del && unformat (input, "all"))
+ ip_del_all_interface_addresses (vm, sw_if_index);
+ else if (unformat (input, "%U/%d", unformat_ip4_address, &a4, &length))
+ error = ip4_add_del_interface_address (vm, sw_if_index, &a4, length,
+ is_del);
+ else if (unformat (input, "%U/%d", unformat_ip6_address, &a6, &length))
+ error = ip6_add_del_interface_address (vm, sw_if_index, &a6, length,
+ is_del);
+ else
+ {
+ error = clib_error_return (0, "expected IP4/IP6 address/length `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_address_command, static) = {
+ .path = "set interface ip address",
+ .function = add_del_ip_address,
+ .short_help = "Add/delete IP4/IP6 address for interface",
+};
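+
+/* E.g.: "set interface ip address GigabitEthernet2/0/0 10.0.0.1/24"
+   or "set interface ip address del GigabitEthernet2/0/0 all"
+   (interface name illustrative). */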
+
+/* Dummy init function to get us linked in. */
+static clib_error_t * ip4_cli_init (vlib_main_t * vm)
+{ return 0; }
+
+VLIB_INIT_FUNCTION (ip4_cli_init);
diff --git a/vnet/vnet/ip/ip4_error.h b/vnet/vnet/ip/ip4_error.h
new file mode 100644
index 00000000000..b84b082b993
--- /dev/null
+++ b/vnet/vnet/ip/ip4_error.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_error.h: ip4 fast path errors
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_ip4_error_h
+#define included_ip_ip4_error_h
+
+#define foreach_ip4_error \
+ /* Must be first. */ \
+ _ (NONE, "valid ip4 packets") \
+ \
+ /* Errors signalled by ip4-input */ \
+ _ (TOO_SHORT, "ip4 length < 20 bytes") \
+ _ (BAD_LENGTH, "ip4 length > l2 length") \
+ _ (BAD_CHECKSUM, "bad ip4 checksum") \
+ _ (VERSION, "ip4 version != 4") \
+ _ (OPTIONS, "ip4 options present") \
+ _ (FRAGMENT_OFFSET_ONE, "ip4 fragment offset == 1") \
+ _ (TIME_EXPIRED, "ip4 ttl <= 1") \
+ \
+ /* Errors signalled by ip4-rewrite. */ \
+ _ (MTU_EXCEEDED, "ip4 MTU exceeded and DF set") \
+ _ (DST_LOOKUP_MISS, "ip4 destination lookup miss") \
+ _ (SRC_LOOKUP_MISS, "ip4 source lookup miss") \
+ _ (ADJACENCY_DROP, "ip4 adjacency drop") \
+ _ (ADJACENCY_PUNT, "ip4 adjacency punt") \
+ \
+ /* Errors signalled by ip4-local. */ \
+ _ (UNKNOWN_PROTOCOL, "unknown ip protocol") \
+ _ (TCP_CHECKSUM, "bad tcp checksum") \
+ _ (UDP_CHECKSUM, "bad udp checksum") \
+ _ (UDP_LENGTH, "inconsistent udp/ip lengths") \
+ \
+ /* Errors signalled by ip4-source-check. */ \
+ _ (UNICAST_SOURCE_CHECK_FAILS, "ip4 unicast source check fails") \
+ \
+  /* Spoofed packets in ip4-rewrite-local */                           \
+  _ (SPOOFED_LOCAL_PACKETS, "ip4 spoofed local-address packet drops")  \
+                                                                       \
+  /* Errors signalled by ip4-inacl */                                  \
+  _ (INACL_TABLE_MISS, "input ACL table-miss drops")                   \
+  _ (INACL_SESSION_DENY, "input ACL session deny drops")
+
+typedef enum {
+#define _(sym,str) IP4_ERROR_##sym,
+ foreach_ip4_error
+#undef _
+ IP4_N_ERROR,
+} ip4_error_t;
+
+#endif /* included_ip_ip4_error_h */
diff --git a/vnet/vnet/ip/ip4_format.c b/vnet/vnet/ip/ip4_format.c
new file mode 100644
index 00000000000..5f4f8e3667d
--- /dev/null
+++ b/vnet/vnet/ip/ip4_format.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_format.c: ip4 formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+/* Format an IP4 address. */
+u8 * format_ip4_address (u8 * s, va_list * args)
+{
+ u8 * a = va_arg (*args, u8 *);
+ return format (s, "%d.%d.%d.%d", a[0], a[1], a[2], a[3]);
+}
+
+/* Format an IP4 route destination and length. */
+u8 * format_ip4_address_and_length (u8 * s, va_list * args)
+{
+ u8 * a = va_arg (*args, u8 *);
+ u8 l = va_arg (*args, u32);
+ return format (s, "%U/%d", format_ip4_address, a, l);
+}
+
+/* Parse an IP4 address %d.%d.%d.%d. */
+uword unformat_ip4_address (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ unsigned a[4];
+
+ if (! unformat (input, "%d.%d.%d.%d", &a[0], &a[1], &a[2], &a[3]))
+ return 0;
+
+ if (a[0] >= 256 || a[1] >= 256 || a[2] >= 256 || a[3] >= 256)
+ return 0;
+
+ result[0] = a[0];
+ result[1] = a[1];
+ result[2] = a[2];
+ result[3] = a[3];
+
+ return 1;
+}
+
+/* Format an IP4 header. */
+u8 * format_ip4_header (u8 * s, va_list * args)
+{
+ ip4_header_t * ip = va_arg (*args, ip4_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 ip_version, header_bytes;
+ uword indent;
+
+  /* Not even a full fixed-size header to format. */
+ if (max_header_bytes < sizeof (ip[0]))
+ return format (s, "IP header truncated");
+
+ indent = format_get_indent (s);
+ indent += 2;
+
+ ip_version = (ip->ip_version_and_header_length >> 4);
+ header_bytes = (ip->ip_version_and_header_length & 0xf) * sizeof (u32);
+
+ s = format (s, "%U: %U -> %U",
+ format_ip_protocol, ip->protocol,
+ format_ip4_address, ip->src_address.data,
+ format_ip4_address, ip->dst_address.data);
+
+ /* Show IP version and header length only with unexpected values. */
+ if (ip_version != 4 || header_bytes != sizeof (ip4_header_t))
+ s = format (s, "\n%Uversion %d, header length %d",
+ format_white_space, indent,
+ ip_version, header_bytes);
+
+ s = format (s, "\n%Utos 0x%02x, ttl %d, length %d, checksum 0x%04x",
+ format_white_space, indent,
+ ip->tos, ip->ttl,
+ clib_net_to_host_u16 (ip->length),
+ clib_net_to_host_u16 (ip->checksum));
+
+ /* Check and report invalid checksums. */
+ {
+ u16 c = ip4_header_checksum (ip);
+ if (c != ip->checksum)
+ s = format (s, " (should be 0x%04x)", clib_net_to_host_u16 (c));
+ }
+
+ {
+ u32 f = clib_net_to_host_u16 (ip->flags_and_fragment_offset);
+ u32 o;
+
+ s = format (s, "\n%Ufragment id 0x%04x",
+ format_white_space, indent,
+ clib_net_to_host_u16 (ip->fragment_id));
+
+ /* Fragment offset. */
+ o = 8 * (f & 0x1fff);
+ f ^= o;
+ if (o != 0)
+ s = format (s, " offset %d", o);
+
+ if (f != 0)
+ {
+ s = format (s, ", flags ");
+#define _(l) if (f & IP4_HEADER_FLAG_##l) s = format (s, #l);
+ _ (MORE_FRAGMENTS);
+ _ (DONT_FRAGMENT);
+ _ (CONGESTION);
+#undef _
+ }
+ }
+
+ /* Recurse into next protocol layer. */
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol);
+
+ if (pi && pi->format_header)
+ s = format (s, "\n%U%U",
+ format_white_space, indent - 2,
+ pi->format_header,
+ /* next protocol header */ (void*) ip + header_bytes,
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+/* Parse an IP4 header. */
+uword unformat_ip4_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ ip4_header_t * ip;
+ int old_length;
+
+ /* Allocate space for IP header. */
+ {
+ void * p;
+
+ old_length = vec_len (*result);
+ vec_add2 (*result, p, sizeof (ip4_header_t));
+ ip = p;
+ }
+
+ memset (ip, 0, sizeof (ip[0]));
+ ip->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_ip_protocol, &ip->protocol,
+ unformat_ip4_address, &ip->src_address,
+ unformat_ip4_address, &ip->dst_address))
+ return 0;
+
+ /* Parse options. */
+ while (1)
+ {
+ int i, j;
+
+ if (unformat (input, "tos %U", unformat_vlib_number, &i))
+ ip->tos = i;
+
+ else if (unformat (input, "ttl %U", unformat_vlib_number, &i))
+ ip->ttl = i;
+
+ else if (unformat (input, "fragment id %U offset %U",
+ unformat_vlib_number, &i,
+ unformat_vlib_number, &j))
+ {
+ ip->fragment_id = clib_host_to_net_u16 (i);
+ ip->flags_and_fragment_offset |=
+	    clib_host_to_net_u16 ((j / 8) & 0x1fff);
+ }
+
+ /* Flags. */
+ else if (unformat (input, "mf") || unformat (input, "MF"))
+ ip->flags_and_fragment_offset |= clib_host_to_net_u16 (IP4_HEADER_FLAG_MORE_FRAGMENTS);
+
+ else if (unformat (input, "df") || unformat (input, "DF"))
+ ip->flags_and_fragment_offset |= clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT);
+
+ else if (unformat (input, "ce") || unformat (input, "CE"))
+ ip->flags_and_fragment_offset |= clib_host_to_net_u16 (IP4_HEADER_FLAG_CONGESTION);
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ /* Fill in checksum. */
+ ip->checksum = ip4_header_checksum (ip);
+
+ /* Recurse into next protocol layer. */
+ {
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol);
+
+ if (pi && pi->unformat_header)
+ {
+ if (! unformat_user (input, pi->unformat_header, result))
+ return 0;
+
+ /* Result may have moved. */
+ ip = (void *) *result + old_length;
+ }
+ }
+
+ /* Fill in IP length. */
+ ip->length = clib_host_to_net_u16 (vec_len (*result) - old_length);
+
+ return 1;
+}
diff --git a/vnet/vnet/ip/ip4_forward.c b/vnet/vnet/ip/ip4_forward.c
new file mode 100644
index 00000000000..fd304163a6b
--- /dev/null
+++ b/vnet/vnet/ip/ip4_forward.c
@@ -0,0 +1,3564 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_forward.c: IP v4 forwarding
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h> /* for ethernet_header_t */
+#include <vnet/ethernet/arp_packet.h> /* for ethernet_arp_header_t */
+#include <vnet/ppp/ppp.h>
+#include <vnet/srp/srp.h> /* for srp_hw_interface_class */
+#include <vnet/api_errno.h> /* for API error numbers */
+
+/* This is a deliberately simple (and slow) FIB: longest-prefix match by
+   probing one hash table per prefix length, from /32 down. */
+u32
+ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index,
+ ip4_address_t * dst,
+ u32 disable_default_route)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip4_fib_t * fib = vec_elt_at_index (im->fibs, fib_index);
+ uword * p, * hash, key;
+ i32 i, i_min, dst_address, ai;
+
+ i_min = disable_default_route ? 1 : 0;
+ dst_address = clib_mem_unaligned (&dst->data_u32, u32);
+ for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= i_min; i--)
+ {
+ hash = fib->adj_index_by_dst_address[i];
+ if (! hash)
+ continue;
+
+ key = dst_address & im->fib_masks[i];
+ if ((p = hash_get (hash, key)) != 0)
+ {
+ ai = p[0];
+ goto done;
+ }
+ }
+
+ /* Nothing matches in table. */
+ ai = lm->miss_adj_index;
+
+ done:
+ return ai;
+}
+
+static ip4_fib_t *
+create_fib_with_table_id (ip4_main_t * im, u32 table_id)
+{
+ ip4_fib_t * fib;
+ hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs));
+ vec_add2 (im->fibs, fib, 1);
+ fib->table_id = table_id;
+ fib->index = fib - im->fibs;
+ fib->flow_hash_config = IP_FLOW_HASH_DEFAULT;
+ fib->fwd_classify_table_index = ~0;
+ fib->rev_classify_table_index = ~0;
+ ip4_mtrie_init (&fib->mtrie);
+ return fib;
+}
+
+ip4_fib_t *
+find_ip4_fib_by_table_index_or_id (ip4_main_t * im,
+ u32 table_index_or_id, u32 flags)
+{
+ uword * p, fib_index;
+
+ fib_index = table_index_or_id;
+ if (! (flags & IP4_ROUTE_FLAG_FIB_INDEX))
+ {
+ p = hash_get (im->fib_index_by_table_id, table_index_or_id);
+ if (! p)
+ return create_fib_with_table_id (im, table_index_or_id);
+ fib_index = p[0];
+ }
+ return vec_elt_at_index (im->fibs, fib_index);
+}
+
+static void
+ip4_fib_init_adj_index_by_dst_address (ip_lookup_main_t * lm,
+ ip4_fib_t * fib,
+ u32 address_length)
+{
+ hash_t * h;
+ uword max_index;
+
+ ASSERT (lm->fib_result_n_bytes >= sizeof (uword));
+ lm->fib_result_n_words = round_pow2 (lm->fib_result_n_bytes, sizeof (uword)) / sizeof (uword);
+
+ fib->adj_index_by_dst_address[address_length] =
+ hash_create (32 /* elts */, lm->fib_result_n_words * sizeof (uword));
+
+ hash_set_flags (fib->adj_index_by_dst_address[address_length],
+ HASH_FLAG_NO_AUTO_SHRINK);
+
+ h = hash_header (fib->adj_index_by_dst_address[address_length]);
+ max_index = (hash_value_bytes (h) / sizeof (fib->new_hash_values[0])) - 1;
+
+ /* Initialize new/old hash value vectors. */
+ vec_validate_init_empty (fib->new_hash_values, max_index, ~0);
+ vec_validate_init_empty (fib->old_hash_values, max_index, ~0);
+}
+
+static void serialize_ip4_address (serialize_main_t * m, va_list * va)
+{
+ ip4_address_t * a = va_arg (*va, ip4_address_t *);
+ u8 * p = serialize_get (m, sizeof (a->as_u8));
+ memcpy (p, a->as_u8, sizeof (a->as_u8));
+}
+
+static void unserialize_ip4_address (serialize_main_t * m, va_list * va)
+{
+ ip4_address_t * a = va_arg (*va, ip4_address_t *);
+ u8 * p = unserialize_get (m, sizeof (a->as_u8));
+ memcpy (a->as_u8, p, sizeof (a->as_u8));
+}
+
+static void serialize_ip4_address_and_length (serialize_main_t * m, va_list * va)
+{
+ ip4_address_t * a = va_arg (*va, ip4_address_t *);
+ u32 l = va_arg (*va, u32);
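+  /* The prefix occupies ceil (l / 8) bytes on the wire. */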
+ u32 n_bytes = (l / 8) + ((l % 8) != 0);
+ u8 * p = serialize_get (m, 1 + n_bytes);
+ ASSERT (l <= 32);
+ p[0] = l;
+ memcpy (p + 1, a->as_u8, n_bytes);
+}
+
+static void unserialize_ip4_address_and_length (serialize_main_t * m, va_list * va)
+{
+ ip4_address_t * a = va_arg (*va, ip4_address_t *);
+ u32 * al = va_arg (*va, u32 *);
+ u8 * p = unserialize_get (m, 1);
+ u32 l, n_bytes;
+
+ al[0] = l = p[0];
+ ASSERT (l <= 32);
+ n_bytes = (l / 8) + ((l % 8) != 0);
+
+ if (n_bytes)
+ {
+ p = unserialize_get (m, n_bytes);
+ memcpy (a->as_u8, p, n_bytes);
+ }
+}
+
+static void serialize_ip4_add_del_route_msg (serialize_main_t * m, va_list * va)
+{
+ ip4_add_del_route_args_t * a = va_arg (*va, ip4_add_del_route_args_t *);
+
+ serialize_likely_small_unsigned_integer (m, a->table_index_or_table_id);
+ serialize_likely_small_unsigned_integer (m, a->flags);
+ serialize (m, serialize_ip4_address_and_length, &a->dst_address, a->dst_address_length);
+ serialize_likely_small_unsigned_integer (m, a->adj_index);
+ serialize_likely_small_unsigned_integer (m, a->n_add_adj);
+ if (a->n_add_adj > 0)
+ serialize (m, serialize_vec_ip_adjacency, a->add_adj, a->n_add_adj);
+}
+
+/* Serialized adjacencies for arp/rewrite do not send graph next_index
+ since graph hookup is not guaranteed to be the same for both sides
+ of serialize/unserialize. */
+static void
+unserialize_fixup_ip4_rewrite_adjacencies (vlib_main_t * vm,
+ ip_adjacency_t * adj,
+ u32 n_adj)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 i, ni, sw_if_index, is_arp;
+ vnet_hw_interface_t * hw;
+
+ for (i = 0; i < n_adj; i++)
+ {
+ switch (adj[i].lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_REWRITE:
+ case IP_LOOKUP_NEXT_ARP:
+ is_arp = adj[i].lookup_next_index == IP_LOOKUP_NEXT_ARP;
+ sw_if_index = adj[i].rewrite_header.sw_if_index;
+ hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ ni = is_arp ? ip4_arp_node.index : ip4_rewrite_node.index;
+ adj[i].rewrite_header.node_index = ni;
+ adj[i].rewrite_header.next_index = vlib_node_add_next (vm, ni, hw->output_node_index);
+ if (is_arp)
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_ARP,
+ sw_if_index,
+ ni,
+ VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
+ &adj[i].rewrite_header,
+ sizeof (adj->rewrite_data));
+ break;
+
+ default:
+ break;
+ }
+ }
+}
+
+static void unserialize_ip4_add_del_route_msg (serialize_main_t * m, va_list * va)
+{
+ ip4_main_t * i4m = &ip4_main;
+ ip4_add_del_route_args_t a;
+
+ a.table_index_or_table_id = unserialize_likely_small_unsigned_integer (m);
+ a.flags = unserialize_likely_small_unsigned_integer (m);
+ unserialize (m, unserialize_ip4_address_and_length, &a.dst_address, &a.dst_address_length);
+ a.adj_index = unserialize_likely_small_unsigned_integer (m);
+ a.n_add_adj = unserialize_likely_small_unsigned_integer (m);
+ a.add_adj = 0;
+ if (a.n_add_adj > 0)
+ {
+ vec_resize (a.add_adj, a.n_add_adj);
+ unserialize (m, unserialize_vec_ip_adjacency, a.add_adj, a.n_add_adj);
+ unserialize_fixup_ip4_rewrite_adjacencies (vlib_get_main(),
+ a.add_adj, a.n_add_adj);
+ }
+
+ /* Prevent re-re-distribution. */
+ a.flags |= IP4_ROUTE_FLAG_NO_REDISTRIBUTE;
+
+ ip4_add_del_route (i4m, &a);
+
+ vec_free (a.add_adj);
+}
+
+MC_SERIALIZE_MSG (ip4_add_del_route_msg, static) = {
+ .name = "vnet_ip4_add_del_route",
+ .serialize = serialize_ip4_add_del_route_msg,
+ .unserialize = unserialize_ip4_add_del_route_msg,
+};
+
+static void
+ip4_fib_set_adj_index (ip4_main_t * im,
+ ip4_fib_t * fib,
+ u32 flags,
+ u32 dst_address_u32,
+ u32 dst_address_length,
+ u32 adj_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ uword * hash;
+
+ if (vec_bytes(fib->old_hash_values))
+ memset (fib->old_hash_values, ~0, vec_bytes (fib->old_hash_values));
+ if (vec_bytes(fib->new_hash_values))
+ memset (fib->new_hash_values, ~0, vec_bytes (fib->new_hash_values));
+ fib->new_hash_values[0] = adj_index;
+
+ /* Make sure adj index is valid. */
+ if (CLIB_DEBUG > 0)
+ (void) ip_get_adjacency (lm, adj_index);
+
+ hash = fib->adj_index_by_dst_address[dst_address_length];
+
+ hash = _hash_set3 (hash, dst_address_u32,
+ fib->new_hash_values,
+ fib->old_hash_values);
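+  /* (_hash_set3 stores new_hash_values for the key and copies the
+     key's previous values into old_hash_values, so the displaced
+     adjacency index remains visible to the callbacks below.) */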
+
+ fib->adj_index_by_dst_address[dst_address_length] = hash;
+
+ if (vec_len (im->add_del_route_callbacks) > 0)
+ {
+ ip4_add_del_route_callback_t * cb;
+ ip4_address_t d;
+ uword * p;
+
+ d.data_u32 = dst_address_u32;
+ vec_foreach (cb, im->add_del_route_callbacks)
+ if ((flags & cb->required_flags) == cb->required_flags)
+ cb->function (im, cb->function_opaque,
+ fib, flags,
+ &d, dst_address_length,
+ fib->old_hash_values,
+ fib->new_hash_values);
+
+ p = hash_get (hash, dst_address_u32);
+ memcpy (p, fib->new_hash_values, vec_bytes (fib->new_hash_values));
+ }
+}
+
+void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * a)
+{
+ vlib_main_t * vm = vlib_get_main();
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip4_fib_t * fib;
+ u32 dst_address, dst_address_length, adj_index, old_adj_index;
+ uword * hash, is_del;
+ ip4_add_del_route_callback_t * cb;
+
+ if (vm->mc_main && ! (a->flags & IP4_ROUTE_FLAG_NO_REDISTRIBUTE))
+ {
+ u32 multiple_messages_per_vlib_buffer = (a->flags & IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP);
+ mc_serialize2 (vm->mc_main, multiple_messages_per_vlib_buffer,
+ &ip4_add_del_route_msg, a);
+ return;
+ }
+
+ /* Either create new adjacency or use given one depending on arguments. */
+ if (a->n_add_adj > 0)
+ {
+ ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
+ ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
+ }
+ else
+ adj_index = a->adj_index;
+
+ dst_address = a->dst_address.data_u32;
+ dst_address_length = a->dst_address_length;
+ fib = find_ip4_fib_by_table_index_or_id (im, a->table_index_or_table_id, a->flags);
+
+ ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
+ dst_address &= im->fib_masks[dst_address_length];
+
+ if (! fib->adj_index_by_dst_address[dst_address_length])
+ ip4_fib_init_adj_index_by_dst_address (lm, fib, dst_address_length);
+
+ hash = fib->adj_index_by_dst_address[dst_address_length];
+
+ is_del = (a->flags & IP4_ROUTE_FLAG_DEL) != 0;
+
+ if (is_del)
+ {
+ fib->old_hash_values[0] = ~0;
+ hash = _hash_unset (hash, dst_address, fib->old_hash_values);
+ fib->adj_index_by_dst_address[dst_address_length] = hash;
+
+ if (vec_len (im->add_del_route_callbacks) > 0
+ && fib->old_hash_values[0] != ~0) /* make sure destination was found in hash */
+ {
+ fib->new_hash_values[0] = ~0;
+ vec_foreach (cb, im->add_del_route_callbacks)
+ if ((a->flags & cb->required_flags) == cb->required_flags)
+ cb->function (im, cb->function_opaque,
+ fib, a->flags,
+ &a->dst_address, dst_address_length,
+ fib->old_hash_values,
+ fib->new_hash_values);
+ }
+ }
+ else
+ ip4_fib_set_adj_index (im, fib, a->flags, dst_address, dst_address_length,
+ adj_index);
+
+ old_adj_index = fib->old_hash_values[0];
+
+ ip4_fib_mtrie_add_del_route (fib, a->dst_address, dst_address_length,
+ is_del ? old_adj_index : adj_index,
+ is_del);
+
+ /* Delete old adjacency index if present and changed. */
+ if (! (a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
+ && old_adj_index != ~0
+ && old_adj_index != adj_index)
+ ip_del_adjacency (lm, old_adj_index);
+}
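+
+/* A minimal sketch of adding a 10.1.1.0/24 route through an existing
+   adjacency (adj_index assumed valid; table 0):
+
+     ip4_add_del_route_args_t a;
+     memset (&a, 0, sizeof (a));
+     a.flags = IP4_ROUTE_FLAG_ADD | IP4_ROUTE_FLAG_TABLE_ID;
+     a.table_index_or_table_id = 0;
+     a.dst_address.data_u32 = clib_host_to_net_u32 (0x0a010100);
+     a.dst_address_length = 24;
+     a.adj_index = adj_index;
+     ip4_add_del_route (im, &a);
+*/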
+
+static void serialize_ip4_add_del_route_next_hop_msg (serialize_main_t * m, va_list * va)
+{
+ u32 flags = va_arg (*va, u32);
+ ip4_address_t * dst_address = va_arg (*va, ip4_address_t *);
+ u32 dst_address_length = va_arg (*va, u32);
+ ip4_address_t * next_hop_address = va_arg (*va, ip4_address_t *);
+ u32 next_hop_sw_if_index = va_arg (*va, u32);
+ u32 next_hop_weight = va_arg (*va, u32);
+
+ serialize_likely_small_unsigned_integer (m, flags);
+ serialize (m, serialize_ip4_address_and_length, dst_address, dst_address_length);
+ serialize (m, serialize_ip4_address, next_hop_address);
+ serialize_likely_small_unsigned_integer (m, next_hop_sw_if_index);
+ serialize_likely_small_unsigned_integer (m, next_hop_weight);
+}
+
+static void unserialize_ip4_add_del_route_next_hop_msg (serialize_main_t * m, va_list * va)
+{
+ ip4_main_t * im = &ip4_main;
+ u32 flags, dst_address_length, next_hop_sw_if_index, next_hop_weight;
+ ip4_address_t dst_address, next_hop_address;
+
+ flags = unserialize_likely_small_unsigned_integer (m);
+ unserialize (m, unserialize_ip4_address_and_length, &dst_address, &dst_address_length);
+ unserialize (m, unserialize_ip4_address, &next_hop_address);
+ next_hop_sw_if_index = unserialize_likely_small_unsigned_integer (m);
+ next_hop_weight = unserialize_likely_small_unsigned_integer (m);
+
+ ip4_add_del_route_next_hop
+ (im,
+ flags | IP4_ROUTE_FLAG_NO_REDISTRIBUTE,
+ &dst_address,
+ dst_address_length,
+ &next_hop_address,
+ next_hop_sw_if_index,
+ next_hop_weight, (u32)~0,
+ (u32)~0 /* explicit FIB index */);
+}
+
+MC_SERIALIZE_MSG (ip4_add_del_route_next_hop_msg, static) = {
+ .name = "vnet_ip4_add_del_route_next_hop",
+ .serialize = serialize_ip4_add_del_route_next_hop_msg,
+ .unserialize = unserialize_ip4_add_del_route_next_hop_msg,
+};
+
+void
+ip4_add_del_route_next_hop (ip4_main_t * im,
+ u32 flags,
+ ip4_address_t * dst_address,
+ u32 dst_address_length,
+ ip4_address_t * next_hop,
+ u32 next_hop_sw_if_index,
+ u32 next_hop_weight, u32 adj_index,
+ u32 explicit_fib_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_main_t * vm = vlib_get_main();
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 fib_index;
+ ip4_fib_t * fib;
+ u32 dst_address_u32, old_mp_adj_index, new_mp_adj_index;
+ u32 dst_adj_index, nh_adj_index;
+ uword * dst_hash, * dst_result;
+ uword * nh_hash, * nh_result;
+ ip_adjacency_t * dst_adj;
+ ip_multipath_adjacency_t * old_mp, * new_mp;
+ int is_del = (flags & IP4_ROUTE_FLAG_DEL) != 0;
+ int is_interface_next_hop;
+ clib_error_t * error = 0;
+
+ if (vm->mc_main && ! (flags & IP4_ROUTE_FLAG_NO_REDISTRIBUTE))
+ {
+ u32 multiple_messages_per_vlib_buffer = (flags & IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP);
+ mc_serialize2 (vm->mc_main,
+ multiple_messages_per_vlib_buffer,
+ &ip4_add_del_route_next_hop_msg,
+ flags,
+ dst_address, dst_address_length,
+ next_hop, next_hop_sw_if_index, next_hop_weight);
+ return;
+ }
+
+ if (explicit_fib_index == (u32)~0)
+ fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index);
+ else
+ fib_index = explicit_fib_index;
+
+ fib = vec_elt_at_index (im->fibs, fib_index);
+
+ /* Lookup next hop to be added or deleted. */
+ is_interface_next_hop = next_hop->data_u32 == 0;
+ if (adj_index == (u32)~0)
+ {
+ if (is_interface_next_hop)
+ {
+ nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index);
+ if (nh_result)
+ nh_adj_index = *nh_result;
+ else
+ {
+ ip_adjacency_t * adj;
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &nh_adj_index);
+ ip4_adjacency_set_interface_route (vnm, adj, next_hop_sw_if_index, /* if_address_index */ ~0);
+ ip_call_add_del_adjacency_callbacks (lm, nh_adj_index, /* is_del */ 0);
+ hash_set (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index, nh_adj_index);
+ }
+ }
+ else
+ {
+ nh_hash = fib->adj_index_by_dst_address[32];
+ nh_result = hash_get (nh_hash, next_hop->data_u32);
+
+ /* Next hop must be known. */
+ if (! nh_result)
+ {
+ vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB;
+ error = clib_error_return (0, "next-hop %U/32 not in FIB",
+ format_ip4_address, next_hop);
+ goto done;
+ }
+ nh_adj_index = *nh_result;
+ }
+ }
+ else
+ {
+ nh_adj_index = adj_index;
+ }
+ ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
+ dst_address_u32 = dst_address->data_u32 & im->fib_masks[dst_address_length];
+
+ dst_hash = fib->adj_index_by_dst_address[dst_address_length];
+ dst_result = hash_get (dst_hash, dst_address_u32);
+ if (dst_result)
+ {
+ dst_adj_index = dst_result[0];
+ dst_adj = ip_get_adjacency (lm, dst_adj_index);
+ }
+ else
+ {
+ /* For deletes destination must be known. */
+ if (is_del)
+ {
+ vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
+ error = clib_error_return (0, "unknown destination %U/%d",
+ format_ip4_address, dst_address,
+ dst_address_length);
+ goto done;
+ }
+
+ dst_adj_index = ~0;
+ dst_adj = 0;
+ }
+
+  /* Reject adds of X/32 with next hop of X when an explicit adjacency index is supplied. */
+ if (! is_del
+ && dst_address_length == 32
+ && dst_address->data_u32 == next_hop->data_u32
+ && adj_index != (u32)~0)
+ {
+ vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP;
+ error = clib_error_return (0, "prefix matches next hop %U/%d",
+ format_ip4_address, dst_address,
+ dst_address_length);
+ goto done;
+ }
+
+ old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0;
+
+ if (! ip_multipath_adjacency_add_del_next_hop
+ (lm, is_del,
+ old_mp_adj_index,
+ nh_adj_index,
+ next_hop_weight,
+ &new_mp_adj_index))
+ {
+ vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP;
+ error = clib_error_return (0, "requested deleting next-hop %U not found in multi-path",
+ format_ip4_address, next_hop);
+ goto done;
+ }
+
+ old_mp = new_mp = 0;
+ if (old_mp_adj_index != ~0)
+ old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
+ if (new_mp_adj_index != ~0)
+ new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index);
+
+ if (old_mp != new_mp)
+ {
+ ip4_add_del_route_args_t a;
+ a.table_index_or_table_id = fib_index;
+ a.flags = ((is_del && ! new_mp ? IP4_ROUTE_FLAG_DEL : IP4_ROUTE_FLAG_ADD)
+ | IP4_ROUTE_FLAG_FIB_INDEX
+ | IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY
+ | (flags & (IP4_ROUTE_FLAG_NO_REDISTRIBUTE | IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP)));
+ a.dst_address = dst_address[0];
+ a.dst_address_length = dst_address_length;
+ a.adj_index = new_mp ? new_mp->adj_index : dst_adj_index;
+ a.add_adj = 0;
+ a.n_add_adj = 0;
+
+ ip4_add_del_route (im, &a);
+ }
+
+ done:
+ if (error)
+ clib_error_report (error);
+}
+
+void *
+ip4_get_route (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ u8 * address,
+ u32 address_length)
+{
+ ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
+ u32 dst_address = * (u32 *) address;
+ uword * hash, * p;
+
+ ASSERT (address_length < ARRAY_LEN (im->fib_masks));
+ dst_address &= im->fib_masks[address_length];
+
+ hash = fib->adj_index_by_dst_address[address_length];
+ p = hash_get (hash, dst_address);
+ return (void *) p;
+}
+
+void
+ip4_foreach_matching_route (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip4_address_t * address,
+ u32 address_length,
+ ip4_address_t ** results,
+ u8 ** result_lengths)
+{
+ ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
+ u32 dst_address = address->data_u32;
+ u32 this_length = address_length;
+
+ if (*results)
+ _vec_len (*results) = 0;
+ if (*result_lengths)
+ _vec_len (*result_lengths) = 0;
+
+  while (this_length <= 32 && vec_len (*results) == 0)
+ {
+ uword k, v;
+ hash_foreach (k, v, fib->adj_index_by_dst_address[this_length], ({
+ if (0 == ((k ^ dst_address) & im->fib_masks[address_length]))
+ {
+ ip4_address_t a;
+ a.data_u32 = k;
+ vec_add1 (*results, a);
+ vec_add1 (*result_lengths, this_length);
+ }
+ }));
+
+ this_length++;
+ }
+}
+
+void ip4_maybe_remap_adjacencies (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags)
+{
+ ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 i, l;
+ ip4_address_t a;
+ ip4_add_del_route_callback_t * cb;
+ static ip4_address_t * to_delete;
+
+ if (lm->n_adjacency_remaps == 0)
+ return;
+
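+  /* adjacency_remap_table[i] is 0 if adjacency i is unchanged, ~0 if it
+     was deleted, and otherwise 1 + the index of its replacement. */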
+ for (l = 0; l <= 32; l++)
+ {
+ hash_pair_t * p;
+ uword * hash = fib->adj_index_by_dst_address[l];
+
+ if (hash_elts (hash) == 0)
+ continue;
+
+ if (to_delete)
+ _vec_len (to_delete) = 0;
+
+ hash_foreach_pair (p, hash, ({
+ u32 adj_index = p->value[0];
+ u32 m = vec_elt (lm->adjacency_remap_table, adj_index);
+
+ if (m)
+ {
+ /* Record destination address from hash key. */
+ a.data_u32 = p->key;
+
+	  /* New adjacency points to nothing, so delete the prefix. */
+ if (m == ~0)
+ vec_add1 (to_delete, a);
+ else
+ {
+ /* Remap to new adjacency. */
+ memcpy (fib->old_hash_values, p->value, vec_bytes (fib->old_hash_values));
+
+ /* Set new adjacency value. */
+ fib->new_hash_values[0] = p->value[0] = m - 1;
+
+ vec_foreach (cb, im->add_del_route_callbacks)
+ if ((flags & cb->required_flags) == cb->required_flags)
+ cb->function (im, cb->function_opaque,
+ fib, flags | IP4_ROUTE_FLAG_ADD,
+ &a, l,
+ fib->old_hash_values,
+ fib->new_hash_values);
+ }
+ }
+ }));
+
+ fib->new_hash_values[0] = ~0;
+ for (i = 0; i < vec_len (to_delete); i++)
+ {
+ hash = _hash_unset (hash, to_delete[i].data_u32, fib->old_hash_values);
+ vec_foreach (cb, im->add_del_route_callbacks)
+ if ((flags & cb->required_flags) == cb->required_flags)
+ cb->function (im, cb->function_opaque,
+ fib, flags | IP4_ROUTE_FLAG_DEL,
+			    &to_delete[i], l,
+ fib->old_hash_values,
+ fib->new_hash_values);
+ }
+ }
+
+ /* Also remap adjacencies in mtrie. */
+ ip4_mtrie_maybe_remap_adjacencies (lm, &fib->mtrie);
+
+ /* Reset mapping table. */
+ vec_zero (lm->adjacency_remap_table);
+
+ /* All remaps have been performed. */
+ lm->n_adjacency_remaps = 0;
+}
+
+void ip4_delete_matching_routes (ip4_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip4_address_t * address,
+ u32 address_length)
+{
+ static ip4_address_t * matching_addresses;
+ static u8 * matching_address_lengths;
+ u32 l, i;
+ ip4_add_del_route_args_t a;
+
+ a.flags = IP4_ROUTE_FLAG_DEL | IP4_ROUTE_FLAG_NO_REDISTRIBUTE | flags;
+ a.table_index_or_table_id = table_index_or_table_id;
+ a.adj_index = ~0;
+ a.add_adj = 0;
+ a.n_add_adj = 0;
+
+ for (l = address_length + 1; l <= 32; l++)
+ {
+ ip4_foreach_matching_route (im, table_index_or_table_id, flags,
+ address,
+ l,
+ &matching_addresses,
+ &matching_address_lengths);
+ for (i = 0; i < vec_len (matching_addresses); i++)
+ {
+ a.dst_address = matching_addresses[i];
+ a.dst_address_length = matching_address_lengths[i];
+ ip4_add_del_route (im, &a);
+ }
+ }
+
+ ip4_maybe_remap_adjacencies (im, table_index_or_table_id, flags);
+}
+
+always_inline uword
+ip4_lookup_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ int lookup_for_responses_to_locally_received_packets)
+{
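+  /* When lookup_for_responses_to_locally_received_packets is set, reuse
+     the adjacency found by the ip4-local source lookup (saved in the RX
+     adj_index) instead of walking the destination mtrie. */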
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+ u32 n_left_from, n_left_to_next, * from, * to_next;
+ ip_lookup_next_t next;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next,
+ to_next, n_left_to_next);
+
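+      /* Dual loop: process two packets per iteration while prefetching
+         the buffer headers and IP headers of the next two. */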
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ __attribute__((unused)) tcp_header_t * tcp0, * tcp1;
+ ip_lookup_next_t next0, next1;
+ ip_adjacency_t * adj0, * adj1;
+ ip4_fib_mtrie_t * mtrie0, * mtrie1;
+ ip4_fib_mtrie_leaf_t leaf0, leaf1;
+ __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0;
+ __attribute__((unused)) u32 pi1, fib_index1, adj_index1, is_tcp_udp1;
+ u32 flow_hash_config0, flow_hash_config1;
+ u32 hash_c0, hash_c1;
+ u32 wrong_next;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
+ fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+ fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
+
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ {
+ mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+ mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
+
+ leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
+
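+	      /* Each mtrie lookup step consumes one byte of the destination
+	         address; steps 1-3 are interleaved with other work below to
+	         hide memory latency. */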
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 0);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 0);
+ }
+
+ tcp0 = (void *) (ip0 + 1);
+ tcp1 = (void *) (ip1 + 1);
+
+ is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
+ || ip0->protocol == IP_PROTOCOL_UDP);
+ is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP
+ || ip1->protocol == IP_PROTOCOL_UDP);
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ {
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 1);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 1);
+ }
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ {
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 2);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 2);
+ }
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ {
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 3);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 3);
+ }
+
+ if (lookup_for_responses_to_locally_received_packets)
+ {
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
+ adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX];
+ }
+ else
+ {
+ /* Handle default route. */
+ leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+ leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1);
+
+ adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+ adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+ }
+
+ ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
+ &ip0->dst_address,
+ /* no_default_route */ 0));
+ ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
+ &ip1->dst_address,
+ /* no_default_route */ 0));
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+ next0 = adj0->lookup_next_index;
+ next1 = adj1->lookup_next_index;
+
+ /* Use flow hash to compute multipath adjacency. */
+ hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
+ hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0;
+ if (PREDICT_FALSE (adj0->n_adj > 1))
+ {
+ flow_hash_config0 =
+ vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+ hash_c0 = vnet_buffer (p0)->ip.flow_hash =
+ ip4_compute_flow_hash (ip0, flow_hash_config0);
+ }
+ if (PREDICT_FALSE(adj1->n_adj > 1))
+ {
+ flow_hash_config1 =
+ vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
+ hash_c1 = vnet_buffer (p1)->ip.flow_hash =
+ ip4_compute_flow_hash (ip1, flow_hash_config1);
+ }
+
+ ASSERT (adj0->n_adj > 0);
+ ASSERT (adj1->n_adj > 0);
+ ASSERT (is_pow2 (adj0->n_adj));
+ ASSERT (is_pow2 (adj1->n_adj));
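+	  /* n_adj is a power of two, so masking the flow hash uniformly
+	     selects one of the equal-cost next hops. */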
+ adj_index0 += (hash_c0 & (adj0->n_adj - 1));
+ adj_index1 += (hash_c1 & (adj1->n_adj - 1));
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+ vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index0, 1,
+ vlib_buffer_length_in_chain (vm, p0)
+ + sizeof(ethernet_header_t));
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index1, 1,
+ vlib_buffer_length_in_chain (vm, p1)
+ + sizeof(ethernet_header_t));
+
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
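+	  /* Both packets were speculatively enqueued to the current next
+	     node; move any that belong to a different next node. */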
+ wrong_next = (next0 != next) + 2*(next1 != next);
+ if (PREDICT_FALSE (wrong_next != 0))
+ {
+ switch (wrong_next)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = pi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ break;
+
+ case 3:
+ /* A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ if (next0 == next1)
+ {
+ /* A B B */
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ next = next1;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ __attribute__((unused)) tcp_header_t * tcp0;
+ ip_lookup_next_t next0;
+ ip_adjacency_t * adj0;
+ ip4_fib_mtrie_t * mtrie0;
+ ip4_fib_mtrie_leaf_t leaf0;
+ __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0;
+ u32 flow_hash_config0, hash_c0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ {
+ mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+
+ leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 0);
+ }
+
+ tcp0 = (void *) (ip0 + 1);
+
+ is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP
+ || ip0->protocol == IP_PROTOCOL_UDP);
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 1);
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 2);
+
+ if (! lookup_for_responses_to_locally_received_packets)
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 3);
+
+ if (lookup_for_responses_to_locally_received_packets)
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX];
+ else
+ {
+ /* Handle default route. */
+ leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+ adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+ }
+
+ ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
+ &ip0->dst_address,
+ /* no_default_route */ 0));
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ next0 = adj0->lookup_next_index;
+
+ /* Use flow hash to compute multipath adjacency. */
+ hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0;
+ if (PREDICT_FALSE(adj0->n_adj > 1))
+ {
+ flow_hash_config0 =
+ vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+
+ hash_c0 = vnet_buffer (p0)->ip.flow_hash =
+ ip4_compute_flow_hash (ip0, flow_hash_config0);
+ }
+
+ ASSERT (adj0->n_adj > 0);
+ ASSERT (is_pow2 (adj0->n_adj));
+ adj_index0 += (hash_c0 & (adj0->n_adj - 1));
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index0, 1,
+ vlib_buffer_length_in_chain (vm, p0)
+ + sizeof(ethernet_header_t));
+
+ from += 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_left_from -= 1;
+
+ if (PREDICT_FALSE (next0 != next))
+ {
+ n_left_to_next += 1;
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ next = next0;
+ vlib_get_next_frame (vm, node, next,
+ to_next, n_left_to_next);
+ to_next[0] = pi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_lookup (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+  return ip4_lookup_inline (vm, node, frame, /* lookup_for_responses_to_locally_received_packets */ 0);
+}
+
+void ip4_adjacency_set_interface_route (vnet_main_t * vnm,
+ ip_adjacency_t * adj,
+ u32 sw_if_index,
+ u32 if_address_index)
+{
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ ip_lookup_next_t n;
+ vnet_l3_packet_type_t packet_type;
+ u32 node_index;
+
+ if (hw->hw_class_index == ethernet_hw_interface_class.index
+ || hw->hw_class_index == srp_hw_interface_class.index)
+ {
+      /*
+       * We have a bit of a problem in this case. ip4-arp uses
+       * the rewrite_header.next_index to hand pkts to the
+       * indicated interface output node. We can end up in
+       * ip4_rewrite_local, too, which also pays attention to
+       * rewrite_header.next_index. Net result: a hack in
+       * ip4_rewrite_local...
+       */
+ n = IP_LOOKUP_NEXT_ARP;
+ node_index = ip4_arp_node.index;
+ adj->if_address_index = if_address_index;
+ packet_type = VNET_L3_PACKET_TYPE_ARP;
+ }
+ else
+ {
+ n = IP_LOOKUP_NEXT_REWRITE;
+ node_index = ip4_rewrite_node.index;
+ packet_type = VNET_L3_PACKET_TYPE_IP4;
+ }
+
+ adj->lookup_next_index = n;
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ packet_type,
+ sw_if_index,
+ node_index,
+ VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
+ &adj->rewrite_header,
+ sizeof (adj->rewrite_data));
+}
+
+static void
+ip4_add_interface_routes (u32 sw_if_index,
+ ip4_main_t * im, u32 fib_index,
+ ip_interface_address_t * a)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_adjacency_t * adj;
+ ip4_address_t * address = ip_interface_address_get_address (lm, a);
+ ip4_add_del_route_args_t x;
+ vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ u32 classify_table_index;
+
+ /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */
+ x.table_index_or_table_id = fib_index;
+ x.flags = (IP4_ROUTE_FLAG_ADD
+ | IP4_ROUTE_FLAG_FIB_INDEX
+ | IP4_ROUTE_FLAG_NO_REDISTRIBUTE);
+ x.dst_address = address[0];
+ x.dst_address_length = a->address_length;
+ x.n_add_adj = 0;
+ x.add_adj = 0;
+
+ a->neighbor_probe_adj_index = ~0;
+ if (a->address_length < 32)
+ {
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &x.adj_index);
+ ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool);
+ ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
+ ip4_add_del_route (im, &x);
+ a->neighbor_probe_adj_index = x.adj_index;
+ }
+
+ /* Add e.g. 1.1.1.1/32 as local to this host. */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &x.adj_index);
+
+ classify_table_index = ~0;
+ if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
+ classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index];
+ if (classify_table_index != (u32) ~0)
+ {
+ adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
+ adj->classify_table_index = classify_table_index;
+ }
+ else
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+
+ adj->if_address_index = a - lm->if_address_pool;
+ adj->rewrite_header.sw_if_index = sw_if_index;
+ adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX];
+ /*
+ * Local adjs are never to be rewritten. Spoofed pkts w/ src = dst = local
+ * fail an RPF-ish check, but still go thru the rewrite code...
+ */
+ adj->rewrite_header.data_bytes = 0;
+
+ ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
+ x.dst_address_length = 32;
+ ip4_add_del_route (im, &x);
+}
+
+static void
+ip4_del_interface_routes (ip4_main_t * im, u32 fib_index, ip4_address_t * address, u32 address_length)
+{
+ ip4_add_del_route_args_t x;
+
+  /* Delete e.g. 1.0.0.0/8 interface route (and the /32 local route below). */
+ x.table_index_or_table_id = fib_index;
+ x.flags = (IP4_ROUTE_FLAG_DEL
+ | IP4_ROUTE_FLAG_FIB_INDEX
+ | IP4_ROUTE_FLAG_NO_REDISTRIBUTE);
+ x.dst_address = address[0];
+ x.dst_address_length = address_length;
+ x.adj_index = ~0;
+ x.n_add_adj = 0;
+ x.add_adj = 0;
+
+ if (address_length < 32)
+ ip4_add_del_route (im, &x);
+
+ x.dst_address_length = 32;
+ ip4_add_del_route (im, &x);
+
+ ip4_delete_matching_routes (im,
+ fib_index,
+ IP4_ROUTE_FLAG_FIB_INDEX,
+ address,
+ address_length);
+}
+
+typedef struct {
+ u32 sw_if_index;
+ ip4_address_t address;
+ u32 length;
+} ip4_interface_address_t;
+
+static void serialize_vec_ip4_set_interface_address (serialize_main_t * m, va_list * va)
+{
+ ip4_interface_address_t * a = va_arg (*va, ip4_interface_address_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++) {
+ serialize_integer (m, a[i].sw_if_index, sizeof (a[i].sw_if_index));
+ serialize (m, serialize_ip4_address, &a[i].address);
+ serialize_integer (m, a[i].length, sizeof (a[i].length));
+ }
+}
+
+static void unserialize_vec_ip4_set_interface_address (serialize_main_t * m, va_list * va)
+{
+ ip4_interface_address_t * a = va_arg (*va, ip4_interface_address_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++) {
+ unserialize_integer (m, &a[i].sw_if_index, sizeof (a[i].sw_if_index));
+ unserialize (m, unserialize_ip4_address, &a[i].address);
+ unserialize_integer (m, &a[i].length, sizeof (a[i].length));
+ }
+}
+
+static void serialize_ip4_set_interface_address_msg (serialize_main_t * m, va_list * va)
+{
+ ip4_interface_address_t * a = va_arg (*va, ip4_interface_address_t *);
+ int is_del = va_arg (*va, int);
+ serialize (m, serialize_vec_ip4_set_interface_address, a, 1);
+ serialize_integer (m, is_del, sizeof (is_del));
+}
+
+static clib_error_t *
+ip4_add_del_interface_address_internal (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip4_address_t * new_address,
+ u32 new_length,
+ u32 redistribute,
+ u32 insert_routes,
+ u32 is_del);
+
+static void unserialize_ip4_set_interface_address_msg (serialize_main_t * m, va_list * va)
+{
+ mc_main_t * mcm = va_arg (*va, mc_main_t *);
+ vlib_main_t * vm = mcm->vlib_main;
+ ip4_interface_address_t a;
+ clib_error_t * error;
+ int is_del;
+
+ unserialize (m, unserialize_vec_ip4_set_interface_address, &a, 1);
+ unserialize_integer (m, &is_del, sizeof (is_del));
+ error = ip4_add_del_interface_address_internal
+ (vm, a.sw_if_index, &a.address, a.length,
+ /* redistribute */ 0,
+ /* insert_routes */ 1,
+ is_del);
+ if (error)
+ clib_error_report (error);
+}
+
+MC_SERIALIZE_MSG (ip4_set_interface_address_msg, static) = {
+ .name = "vnet_ip4_set_interface_address",
+ .serialize = serialize_ip4_set_interface_address_msg,
+ .unserialize = unserialize_ip4_set_interface_address_msg,
+};
+
+static clib_error_t *
+ip4_add_del_interface_address_internal (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip4_address_t * address,
+ u32 address_length,
+ u32 redistribute,
+ u32 insert_routes,
+ u32 is_del)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ clib_error_t * error = 0;
+ u32 if_address_index, elts_before;
+ ip4_address_fib_t ip4_af, * addr_fib = 0;
+
+ vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+ ip4_addr_fib_init (&ip4_af, address,
+ vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
+ vec_add1 (addr_fib, ip4_af);
+
+  /* When adding an address, check that it does not conflict with an existing address. */
+ if (! is_del)
+ {
+ ip_interface_address_t * ia;
+ foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
+ 0 /* honor unnumbered */,
+ ({
+ ip4_address_t * x = ip_interface_address_get_address (&im->lookup_main, ia);
+
+ if (ip4_destination_matches_route (im, address, x, ia->address_length)
+ || ip4_destination_matches_route (im, x, address, address_length))
+ return clib_error_create ("failed to add %U which conflicts with %U for interface %U",
+ format_ip4_address_and_length, address, address_length,
+ format_ip4_address_and_length, x, ia->address_length,
+ format_vnet_sw_if_index_name, vnm, sw_if_index);
+ }));
+ }
+
+ if (vm->mc_main && redistribute)
+ {
+ ip4_interface_address_t a;
+ a.sw_if_index = sw_if_index;
+ a.address = address[0];
+ a.length = address_length;
+ mc_serialize (vm->mc_main, &ip4_set_interface_address_msg,
+ &a, (int)is_del);
+ goto done;
+ }
+
+ elts_before = pool_elts (lm->if_address_pool);
+
+ error = ip_interface_address_add_del
+ (lm,
+ sw_if_index,
+ addr_fib,
+ address_length,
+ is_del,
+ &if_address_index);
+ if (error)
+ goto done;
+
+ if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes)
+ {
+ if (is_del)
+ ip4_del_interface_routes (im, ip4_af.fib_index, address,
+ address_length);
+
+ else
+ ip4_add_interface_routes (sw_if_index,
+ im, ip4_af.fib_index,
+ pool_elt_at_index
+ (lm->if_address_pool, if_address_index));
+ }
+
+  /* Invoke callbacks only if the pool grew or shrank, i.e. the address
+     was actually added or deleted (not a duplicate add). */
+ if (elts_before != pool_elts (lm->if_address_pool))
+ {
+ ip4_add_del_interface_address_callback_t * cb;
+ vec_foreach (cb, im->add_del_interface_address_callbacks)
+ cb->function (im, cb->function_opaque, sw_if_index,
+ address, address_length,
+ if_address_index,
+ is_del);
+ }
+
+ done:
+ vec_free (addr_fib);
+ return error;
+}
+
+clib_error_t *
+ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
+ ip4_address_t * address, u32 address_length,
+ u32 is_del)
+{
+ return ip4_add_del_interface_address_internal
+ (vm, sw_if_index, address, address_length,
+ /* redistribute */ 1,
+ /* insert_routes */ 1,
+ is_del);
+}
+
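+ /* For each prefix length, write the number of routes followed by
+    (destination address, adjacency index) pairs. */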
+static void serialize_ip4_fib (serialize_main_t * m, va_list * va)
+{
+ ip4_fib_t * f = va_arg (*va, ip4_fib_t *);
+ u32 l, dst, adj_index;
+
+ serialize_integer (m, f->table_id, sizeof (f->table_id));
+ for (l = 0; l < ARRAY_LEN (f->adj_index_by_dst_address); l++)
+ {
+ u32 n_elts = hash_elts (f->adj_index_by_dst_address[l]);
+
+ serialize_integer (m, n_elts, sizeof (n_elts));
+ hash_foreach (dst, adj_index, f->adj_index_by_dst_address[l], ({
+ ip4_address_t tmp;
+ tmp.as_u32 = dst;
+ serialize (m, serialize_ip4_address, &tmp);
+ serialize_integer (m, adj_index, sizeof (adj_index));
+ }));
+ }
+}
+
+static void unserialize_ip4_fib (serialize_main_t * m, va_list * va)
+{
+ ip4_add_del_route_args_t a;
+ u32 i;
+
+ a.flags = (IP4_ROUTE_FLAG_ADD
+ | IP4_ROUTE_FLAG_NO_REDISTRIBUTE
+ | IP4_ROUTE_FLAG_TABLE_ID);
+ a.n_add_adj = 0;
+ a.add_adj = 0;
+
+ unserialize_integer (m, &a.table_index_or_table_id,
+ sizeof (a.table_index_or_table_id));
+
+ for (i = 0; i < STRUCT_ARRAY_LEN (ip4_fib_t, adj_index_by_dst_address); i++)
+ {
+ u32 n_elts;
+ unserialize_integer (m, &n_elts, sizeof (u32));
+ a.dst_address_length = i;
+ while (n_elts > 0)
+ {
+ unserialize (m, unserialize_ip4_address, &a.dst_address);
+ unserialize_integer (m, &a.adj_index, sizeof (a.adj_index));
+ ip4_add_del_route (&ip4_main, &a);
+ n_elts--;
+ }
+ }
+}
+
+void serialize_vnet_ip4_main (serialize_main_t * m, va_list * va)
+{
+ vnet_main_t * vnm = va_arg (*va, vnet_main_t *);
+ vnet_interface_main_t * vim = &vnm->interface_main;
+ vnet_sw_interface_t * si;
+ ip4_main_t * i4m = &ip4_main;
+ ip4_interface_address_t * as = 0, * a;
+
+ /* Download adjacency tables & multipath stuff. */
+ serialize (m, serialize_ip_lookup_main, &i4m->lookup_main);
+
+ /* FIBs. */
+ {
+ ip4_fib_t * f;
+ u32 n_fibs = vec_len (i4m->fibs);
+ serialize_integer (m, n_fibs, sizeof (n_fibs));
+ vec_foreach (f, i4m->fibs)
+ serialize (m, serialize_ip4_fib, f);
+ }
+
+ /* FIB interface config. */
+ vec_serialize (m, i4m->fib_index_by_sw_if_index, serialize_vec_32);
+
+ /* Interface ip4 addresses. */
+ pool_foreach (si, vim->sw_interfaces, ({
+ u32 sw_if_index = si->sw_if_index;
+ ip_interface_address_t * ia;
+ foreach_ip_interface_address (&i4m->lookup_main, ia, sw_if_index,
+ 0 /* honor unnumbered */,
+ ({
+ ip4_address_t * x = ip_interface_address_get_address (&i4m->lookup_main, ia);
+ vec_add2 (as, a, 1);
+ a->address = x[0];
+ a->length = ia->address_length;
+ a->sw_if_index = sw_if_index;
+ }));
+ }));
+ vec_serialize (m, as, serialize_vec_ip4_set_interface_address);
+ vec_free (as);
+}
+
+void unserialize_vnet_ip4_main (serialize_main_t * m, va_list * va)
+{
+ vlib_main_t * vm = va_arg (*va, vlib_main_t *);
+ ip4_main_t * i4m = &ip4_main;
+ ip4_interface_address_t * as = 0, * a;
+
+ unserialize (m, unserialize_ip_lookup_main, &i4m->lookup_main);
+
+ {
+ ip_adjacency_t * adj, * adj_heap;
+ u32 n_adj;
+ adj_heap = i4m->lookup_main.adjacency_heap;
+ heap_foreach (adj, n_adj, adj_heap, ({
+ unserialize_fixup_ip4_rewrite_adjacencies (vm, adj, n_adj);
+ ip_call_add_del_adjacency_callbacks (&i4m->lookup_main, adj - adj_heap, /* is_del */ 0);
+ }));
+ }
+
+ /* FIBs */
+ {
+ u32 i, n_fibs;
+ unserialize_integer (m, &n_fibs, sizeof (n_fibs));
+ for (i = 0; i < n_fibs; i++)
+ unserialize (m, unserialize_ip4_fib);
+ }
+
+ vec_unserialize (m, &i4m->fib_index_by_sw_if_index, unserialize_vec_32);
+
+ vec_unserialize (m, &as, unserialize_vec_ip4_set_interface_address);
+ vec_foreach (a, as) {
+ ip4_add_del_interface_address_internal
+ (vm, a->sw_if_index, &a->address, a->length,
+ /* redistribute */ 0,
+ /* insert_routes */ 0,
+ /* is_del */ 0);
+ }
+ vec_free (as);
+}
+
+static clib_error_t *
+ip4_sw_interface_admin_up_down (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 flags)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_interface_address_t * ia;
+ ip4_address_t * a;
+ u32 is_admin_up, fib_index;
+
+ /* Fill in lookup tables with default table (0). */
+ vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+
+ vec_validate_init_empty (im->lookup_main.if_address_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+ fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
+
+ foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
+ 0 /* honor unnumbered */,
+ ({
+ a = ip_interface_address_get_address (&im->lookup_main, ia);
+ if (is_admin_up)
+ ip4_add_interface_routes (sw_if_index,
+ im, fib_index,
+ ia);
+ else
+ ip4_del_interface_routes (im, fib_index,
+ a, ia->address_length);
+ }));
+
+ return 0;
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down);
+
+static clib_error_t *
+ip4_sw_interface_add_del (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 is_add)
+{
+ vlib_main_t * vm = vnm->vlib_main;
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 ci, cast;
+
+ for (cast = 0; cast < VNET_N_CAST; cast++)
+ {
+ ip_config_main_t * cm = &lm->rx_config_mains[cast];
+ vnet_config_main_t * vcm = &cm->config_main;
+
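+      /* Lazily build the RX feature graph the first time an interface
+         is added for this cast type. */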
+ if (! vcm->node_index_by_feature_index)
+ {
+ if (cast == VNET_UNICAST)
+ {
+ static char * start_nodes[] = { "ip4-input", "ip4-input-no-checksum", };
+ static char * feature_nodes[] = {
+ [IP4_RX_FEATURE_CHECK_ACCESS] = "ip4-inacl",
+ [IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX] = "ip4-source-check-via-rx",
+ [IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_ANY] = "ip4-source-check-via-any",
+ [IP4_RX_FEATURE_IPSEC] = "ipsec-input-ip4",
+ [IP4_RX_FEATURE_VPATH] = "vpath-input-ip4",
+ [IP4_RX_FEATURE_LOOKUP] = "ip4-lookup",
+ };
+
+ vnet_config_init (vm, vcm,
+ start_nodes, ARRAY_LEN (start_nodes),
+ feature_nodes, ARRAY_LEN (feature_nodes));
+ }
+ else
+ {
+ static char * start_nodes[] = { "ip4-input", "ip4-input-no-checksum", };
+ static char * feature_nodes[] = {
+ [IP4_RX_FEATURE_VPATH] = "vpath-input-ip4",
+ [IP4_RX_FEATURE_LOOKUP] = "ip4-lookup-multicast",
+ };
+
+ vnet_config_init (vm, vcm,
+ start_nodes, ARRAY_LEN (start_nodes),
+ feature_nodes, ARRAY_LEN (feature_nodes));
+ }
+ }
+
+ vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
+ ci = cm->config_index_by_sw_if_index[sw_if_index];
+
+ if (is_add)
+ ci = vnet_config_add_feature (vm, vcm,
+ ci,
+ IP4_RX_FEATURE_LOOKUP,
+ /* config data */ 0,
+ /* # bytes of config data */ 0);
+ else
+ ci = vnet_config_del_feature (vm, vcm,
+ ci,
+ IP4_RX_FEATURE_LOOKUP,
+ /* config data */ 0,
+ /* # bytes of config data */ 0);
+
+ cm->config_index_by_sw_if_index[sw_if_index] = ci;
+ }
+
+ return /* no error */ 0;
+}
+
+VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del);
+
+VLIB_REGISTER_NODE (ip4_lookup_node) = {
+ .function = ip4_lookup,
+ .name = "ip4-lookup",
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+ },
+};
+
+/* Global IP4 main. */
+ip4_main_t ip4_main;
+
+clib_error_t *
+ip4_lookup_init (vlib_main_t * vm)
+{
+ ip4_main_t * im = &ip4_main;
+ uword i;
+
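+  /* fib_masks[l] is the netmask for prefix length l in network byte
+     order, e.g. fib_masks[8] is 255.0.0.0. */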
+ for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
+ {
+ u32 m;
+
+ if (i < 32)
+ m = pow2_mask (i) << (32 - i);
+ else
+ m = ~0;
+ im->fib_masks[i] = clib_host_to_net_u32 (m);
+ }
+
+ /* Create FIB with index 0 and table id of 0. */
+ find_ip4_fib_by_table_index_or_id (im, /* table id */ 0, IP4_ROUTE_FLAG_TABLE_ID);
+
+ ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0);
+
+ {
+ pg_node_t * pn;
+ pn = pg_get_node (ip4_lookup_node.index);
+ pn->unformat_edit = unformat_pg_ip4_header;
+ }
+
+ {
+ ethernet_arp_header_t h;
+
+ memset (&h, 0, sizeof (h));
+
+ /* Set target ethernet address to all zeros. */
+ memset (h.ip4_over_ethernet[1].ethernet, 0, sizeof (h.ip4_over_ethernet[1].ethernet));
+
+#define _16(f,v) h.f = clib_host_to_net_u16 (v);
+#define _8(f,v) h.f = v;
+ _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet);
+ _16 (l3_type, ETHERNET_TYPE_IP4);
+ _8 (n_l2_address_bytes, 6);
+ _8 (n_l3_address_bytes, 4);
+ _16 (opcode, ETHERNET_ARP_OPCODE_request);
+#undef _16
+#undef _8
+
+ vlib_packet_template_init (vm,
+ &im->ip4_arp_request_packet_template,
+ /* data */ &h,
+ sizeof (h),
+ /* alloc chunk size */ 8,
+ "ip4 arp");
+ }
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip4_lookup_init);
+
+typedef struct {
+ /* Adjacency taken. */
+ u32 adj_index;
+ u32 flow_hash;
+ u32 fib_index;
+
+ /* Packet data, possibly *after* rewrite. */
+ u8 packet_data[64 - 1*sizeof(u32)];
+} ip4_forward_next_trace_t;
+
+static u8 * format_ip4_forward_next_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *);
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im = &ip4_main;
+ ip_adjacency_t * adj;
+ uword indent = format_get_indent (s);
+
+ adj = ip_get_adjacency (&im->lookup_main, t->adj_index);
+ s = format (s, "fib: %d adjacency: %U flow hash: 0x%08x",
+ t->fib_index, format_ip_adjacency,
+ vnm, &im->lookup_main, t->adj_index, t->flow_hash);
+ switch (adj->lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_REWRITE:
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_ip_adjacency_packet_data,
+ vnm, &im->lookup_main, t->adj_index,
+ t->packet_data, sizeof (t->packet_data));
+ break;
+
+ default:
+ break;
+ }
+
+ return s;
+}
+
+/* Common trace function for all ip4-forward next nodes. */
+void
+ip4_forward_next_trace (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ vlib_rx_or_tx_t which_adj_index)
+{
+ u32 * from, n_left;
+ ip4_main_t * im = &ip4_main;
+
+ n_left = frame->n_vectors;
+ from = vlib_frame_vector_args (frame);
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ ip4_forward_next_trace_t * t0, * t1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
+ t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
+ t0->fib_index = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+ memcpy (t0->packet_data,
+ vlib_buffer_get_current (b0),
+ sizeof (t0->packet_data));
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+ t1->adj_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
+ t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
+ t1->fib_index = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(b1)->sw_if_index[VLIB_RX]);
+ memcpy (t1->packet_data,
+ vlib_buffer_get_current (b1),
+ sizeof (t1->packet_data));
+ }
+ from += 2;
+ n_left -= 2;
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip4_forward_next_trace_t * t0;
+
+ bi0 = from[0];
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
+ t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
+ t0->fib_index = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+ memcpy (t0->packet_data,
+ vlib_buffer_get_current (b0),
+ sizeof (t0->packet_data));
+ }
+ from += 1;
+ n_left -= 1;
+ }
+}
+
+static uword
+ip4_drop_or_punt (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ ip4_error_t error_code)
+{
+ u32 * buffers = vlib_frame_vector_args (frame);
+ uword n_packets = frame->n_vectors;
+
+ vlib_error_drop_buffers (vm, node,
+ buffers,
+ /* stride */ 1,
+ n_packets,
+ /* next */ 0,
+ ip4_input_node.index,
+ error_code);
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip4_forward_next_trace (vm, node, frame, VLIB_TX);
+
+ return n_packets;
+}
+
+static uword
+ip4_drop (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP); }
+
+static uword
+ip4_punt (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT); }
+
+static uword
+ip4_miss (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_DST_LOOKUP_MISS); }
+
+VLIB_REGISTER_NODE (ip4_drop_node,static) = {
+ .function = ip4_drop,
+ .name = "ip4-drop",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip4_punt_node,static) = {
+ .function = ip4_punt,
+ .name = "ip4-punt",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-punt",
+ },
+};
+
+VLIB_REGISTER_NODE (ip4_miss_node,static) = {
+ .function = ip4_miss,
+ .name = "ip4-miss",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+/* Compute TCP/UDP/ICMP4 checksum in software. */
+u16
+ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0,
+ ip4_header_t * ip0)
+{
+ ip_csum_t sum0;
+ u32 ip_header_length, payload_length_host_byte_order;
+ u32 n_this_buffer, n_bytes_left;
+ u16 sum16;
+ void * data_this_buffer;
+
+  /* Initialize checksum with the TCP/UDP pseudo-header: payload length,
+     protocol and src/dst addresses. */
+ ip_header_length = ip4_header_bytes (ip0);
+ payload_length_host_byte_order = clib_net_to_host_u16 (ip0->length) - ip_header_length;
+ sum0 = clib_host_to_net_u32 (payload_length_host_byte_order + (ip0->protocol << 16));
+
+ if (BITS (uword) == 32)
+ {
+ sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u32));
+ sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32));
+ }
+ else
+ sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64));
+
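+  /* Sum the payload, following the buffer chain when it spans multiple
+     buffers. */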
+ n_bytes_left = n_this_buffer = payload_length_host_byte_order;
+ data_this_buffer = (void *) ip0 + ip_header_length;
+ if (n_this_buffer + ip_header_length > p0->current_length)
+ n_this_buffer = p0->current_length > ip_header_length ? p0->current_length - ip_header_length : 0;
+ while (1)
+ {
+ sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
+ n_bytes_left -= n_this_buffer;
+ if (n_bytes_left == 0)
+ break;
+
+ ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT);
+ p0 = vlib_get_buffer (vm, p0->next_buffer);
+ data_this_buffer = vlib_buffer_get_current (p0);
+ n_this_buffer = p0->current_length;
+ }
+
+ sum16 = ~ ip_csum_fold (sum0);
+
+ return sum16;
+}
+
+static u32
+ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
+{
+ ip4_header_t * ip0 = vlib_buffer_get_current (p0);
+ udp_header_t * udp0;
+ u16 sum16;
+
+ ASSERT (ip0->protocol == IP_PROTOCOL_TCP
+ || ip0->protocol == IP_PROTOCOL_UDP);
+
+ udp0 = (void *) (ip0 + 1);
+ if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
+ {
+ p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
+ | IP_BUFFER_L4_CHECKSUM_CORRECT);
+ return p0->flags;
+ }
+
+ sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0);
+
+ p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
+ | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));
+
+ return p0->flags;
+}
+
+static uword
+ip4_local (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_local_next_t next_index;
+ u32 * from, * to_next, n_left_from, n_left_to_next;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip4_forward_next_trace (vm, node, frame, VLIB_TX);
+
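+  /* ip4-local validates L4 checksums and looks up the *source* address
+     in the FIB (interleaved with the checksum work below) to verify the
+     sender is reachable before handing packets to local protocols. */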
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ udp_header_t * udp0, * udp1;
+ ip4_fib_mtrie_t * mtrie0, * mtrie1;
+ ip4_fib_mtrie_leaf_t leaf0, leaf1;
+ ip_adjacency_t * adj0, * adj1;
+ u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, adj_index0;
+ u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, adj_index1;
+ i32 len_diff0, len_diff1;
+ u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
+ u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1;
+ u8 enqueue_code;
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(p0)->sw_if_index[VLIB_RX]);
+ fib_index1 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(p1)->sw_if_index[VLIB_RX]);
+
+ mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+ mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie;
+
+ leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0);
+
+ proto0 = ip0->protocol;
+ proto1 = ip1->protocol;
+ is_udp0 = proto0 == IP_PROTOCOL_UDP;
+ is_udp1 = proto1 == IP_PROTOCOL_UDP;
+ is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
+ is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP;
+
+ flags0 = p0->flags;
+ flags1 = p1->flags;
+
+ good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+
+ udp0 = ip4_next_header (ip0);
+ udp1 = ip4_next_header (ip1);
+
+ /* Don't verify UDP checksum for packets with explicit zero checksum. */
+ good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
+ good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);
+
+ /* Verify UDP length. */
+ ip_len0 = clib_net_to_host_u16 (ip0->length);
+ ip_len1 = clib_net_to_host_u16 (ip1->length);
+ udp_len0 = clib_net_to_host_u16 (udp0->length);
+ udp_len1 = clib_net_to_host_u16 (udp1->length);
+
+ len_diff0 = ip_len0 - udp_len0;
+ len_diff1 = ip_len1 - udp_len1;
+
+ len_diff0 = is_udp0 ? len_diff0 : 0;
+ len_diff1 = is_udp1 ? len_diff1 : 0;
+
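+	  /* Slow path: compute the TCP/UDP checksum in software when it
+	     has not already been computed (e.g. by hardware offload). */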
+ if (PREDICT_FALSE (! (is_tcp_udp0 & is_tcp_udp1
+ & good_tcp_udp0 & good_tcp_udp1)))
+ {
+ if (is_tcp_udp0)
+ {
+ if (is_tcp_udp0
+ && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
+ flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
+ good_tcp_udp0 =
+ (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
+ }
+ if (is_tcp_udp1)
+ {
+ if (is_tcp_udp1
+ && ! (flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
+ flags1 = ip4_tcp_udp_validate_checksum (vm, p1);
+ good_tcp_udp1 =
+ (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ good_tcp_udp1 |= is_udp1 && udp1->checksum == 0;
+ }
+ }
+
+ good_tcp_udp0 &= len_diff0 >= 0;
+ good_tcp_udp1 &= len_diff1 >= 0;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);
+
+ error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL;
+
+ error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
+ error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1;
+
+ ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
+ error0 = (is_tcp_udp0 && ! good_tcp_udp0
+ ? IP4_ERROR_TCP_CHECKSUM + is_udp0
+ : error0);
+ error1 = (is_tcp_udp1 && ! good_tcp_udp1
+ ? IP4_ERROR_TCP_CHECKSUM + is_udp1
+ : error1);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+ vnet_buffer (p1)->ip.adj_index[VLIB_RX] = adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+ vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+
+ ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
+ &ip0->src_address,
+ /* no_default_route */ 1));
+ ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1,
+ &ip1->src_address,
+ /* no_default_route */ 1));
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+	  /*
+	   * Must have a route to the source; otherwise we drop the packet.
+	   * ip4 broadcasts are accepted, e.g. to make the dhcp client work.
+	   */
+ error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
+ && ip0->dst_address.as_u32 != 0xFFFFFFFF
+ ? IP4_ERROR_SRC_LOOKUP_MISS
+ : error0);
+ error1 = (error1 == IP4_ERROR_UNKNOWN_PROTOCOL
+ && adj1->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
+ && adj1->lookup_next_index != IP_LOOKUP_NEXT_ARP
+ && adj1->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
+		    && ip1->dst_address.as_u32 != 0xFFFFFFFF
+ ? IP4_ERROR_SRC_LOOKUP_MISS
+ : error1);
+
+ next0 = lm->local_next_by_ip_protocol[proto0];
+ next1 = lm->local_next_by_ip_protocol[proto1];
+
+ next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
+ next1 = error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;
+
+ p0->error = error0 ? error_node->errors[error0] : 0;
+ p1->error = error1 ? error_node->errors[error1] : 0;
+
+ enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
+
+ if (PREDICT_FALSE (enqueue_code != 0))
+ {
+ switch (enqueue_code)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = pi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ break;
+
+ case 3:
+ /* A B B or A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ if (next0 == next1)
+ {
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ next_index = next1;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ }
+ break;
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ udp_header_t * udp0;
+ ip4_fib_mtrie_t * mtrie0;
+ ip4_fib_mtrie_leaf_t leaf0;
+ ip_adjacency_t * adj0;
+ u32 pi0, next0, ip_len0, udp_len0, flags0, fib_index0, adj_index0;
+ i32 len_diff0;
+ u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(p0)->sw_if_index[VLIB_RX]);
+
+ mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+
+ leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
+
+ proto0 = ip0->protocol;
+ is_udp0 = proto0 == IP_PROTOCOL_UDP;
+ is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP;
+
+ flags0 = p0->flags;
+
+ good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+
+ udp0 = ip4_next_header (ip0);
+
+ /* Don't verify UDP checksum for packets with explicit zero checksum. */
+ good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
+
+ /* Verify UDP length. */
+ ip_len0 = clib_net_to_host_u16 (ip0->length);
+ udp_len0 = clib_net_to_host_u16 (udp0->length);
+
+ len_diff0 = ip_len0 - udp_len0;
+
+ len_diff0 = is_udp0 ? len_diff0 : 0;
+
+ if (PREDICT_FALSE (! (is_tcp_udp0 & good_tcp_udp0)))
+ {
+ if (is_tcp_udp0)
+ {
+ if (is_tcp_udp0
+ && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))
+ flags0 = ip4_tcp_udp_validate_checksum (vm, p0);
+ good_tcp_udp0 =
+ (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ good_tcp_udp0 |= is_udp0 && udp0->checksum == 0;
+ }
+ }
+
+ good_tcp_udp0 &= len_diff0 >= 0;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
+
+ error0 = IP4_ERROR_UNKNOWN_PROTOCOL;
+
+ error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0;
+
+ ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM);
+ error0 = (is_tcp_udp0 && ! good_tcp_udp0
+ ? IP4_ERROR_TCP_CHECKSUM + is_udp0
+ : error0);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+ ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
+ &ip0->src_address,
+ /* no_default_route */ 1));
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+	  /* Must have a route to the source; otherwise we drop the packet. */
+ error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL
+ && ip0->dst_address.as_u32 != 0xFFFFFFFF
+ ? IP4_ERROR_SRC_LOOKUP_MISS
+ : error0);
+
+ next0 = lm->local_next_by_ip_protocol[proto0];
+
+ next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
+
+ p0->error = error0? error_node->errors[error0] : 0;
+
+ if (PREDICT_FALSE (next0 != next_index))
+ {
+ n_left_to_next += 1;
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+
+ next_index = next0;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ to_next[0] = pi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip4_local_node,static) = {
+ .function = ip4_local,
+ .name = "ip4-local",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = IP_LOCAL_N_NEXT,
+ .next_nodes = {
+ [IP_LOCAL_NEXT_DROP] = "error-drop",
+ [IP_LOCAL_NEXT_PUNT] = "error-punt",
+ // [IP_LOCAL_NEXT_TCP_LOOKUP] = "ip4-tcp-lookup",
+ [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup",
+ [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",
+ },
+};
+
+void ip4_register_protocol (u32 protocol, u32 node_index)
+{
+ vlib_main_t * vm = vlib_get_main();
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
+ lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip4_local_node.index, node_index);
+}
+
+static clib_error_t *
+show_ip_local_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ int i;
+
+ vlib_cli_output (vm, "Protocols handled by ip4_local");
+ for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++)
+ {
+ if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
+ vlib_cli_output (vm, "%d", i);
+ }
+ return 0;
+}
+
+
+VLIB_CLI_COMMAND (show_ip_local, static) = {
+ .path = "show ip local",
+ .function = show_ip_local_command_fn,
+ .short_help = "Show ip local protocol table",
+};
+
+static uword
+ip4_arp (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 * from, * to_next_drop;
+ uword n_left_from, n_left_to_next_drop, next_index;
+ static f64 time_last_seed_change = -1e100;
+ static u32 hash_seeds[3];
+ static uword hash_bitmap[256 / BITS (uword)];
+ f64 time_now;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip4_forward_next_trace (vm, node, frame, VLIB_TX);
+
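+  /* Throttle ARP requests with a small hash bitmap keyed on (dst
+     address, tx sw_if_index); reseeding the hash periodically clears
+     the bitmap so suppressed destinations are retried. */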
+ time_now = vlib_time_now (vm);
+ if (time_now - time_last_seed_change > 1e-3)
+ {
+ uword i;
+ u32 * r = clib_random_buffer_get_data (&vm->random_buffer,
+ sizeof (hash_seeds));
+ for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
+ hash_seeds[i] = r[i];
+
+      /* Mark all hash keys as not seen before. */
+ for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
+ hash_bitmap[i] = 0;
+
+ time_last_seed_change = time_now;
+ }
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ if (next_index == IP4_ARP_NEXT_DROP)
+ next_index = IP4_ARP_N_NEXT; /* point to first interface */
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP,
+ to_next_drop, n_left_to_next_drop);
+
+ while (n_left_from > 0 && n_left_to_next_drop > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ ethernet_header_t * eh0;
+ u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
+ uword bm0;
+ ip_adjacency_t * adj0;
+
+ pi0 = from[0];
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ ip0 = vlib_buffer_get_current (p0);
+
+	  /*
+	   * if ip4_rewrite_local applied the IP_LOOKUP_NEXT_ARP
+	   * rewrite to this packet, the buffer starts with an
+	   * ethernet header that we must skip here. Matching the
+	   * ARP ethertype alone could false-match a src IP address
+	   * of the form *.8.6.*, so first check that the leading
+	   * nibble is not the IPv4 version (with a bcast eth dest
+	   * it never is).
+	   */
+ eh0 = (ethernet_header_t*)ip0;
+ if ((ip0->ip_version_and_header_length & 0xF0) != 0x40)
+ {
+ u32 vlan_num = 0;
+ u16 * etype = &eh0->type;
+ while ((*etype == clib_host_to_net_u16 (0x8100)) //dot1q
+ || (*etype == clib_host_to_net_u16 (0x88a8)))//dot1ad
+ {
+ vlan_num += 1;
+		  etype += 2; //each VLAN tag is 4 bytes; skip TPID + TCI to the next ethertype
+ }
+ if (*etype == clib_host_to_net_u16 (0x0806)) //arp
+ {
+ vlib_buffer_advance (
+ p0, sizeof(ethernet_header_t) + (4*vlan_num));
+ ip0 = vlib_buffer_get_current (p0);
+ }
+ }
+
+ a0 = hash_seeds[0];
+ b0 = hash_seeds[1];
+ c0 = hash_seeds[2];
+
+ sw_if_index0 = adj0->rewrite_header.sw_if_index;
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ a0 ^= ip0->dst_address.data_u32;
+ b0 ^= sw_if_index0;
+
+ hash_v3_finalize32 (a0, b0, c0);
+
+	  /* c0 selects one of the BITS (hash_bitmap) bits: take the bit
+	     mask before reducing c0 to a word index. */
+	  c0 &= BITS (hash_bitmap) - 1;
+	  m0 = (uword) 1 << (c0 % BITS (uword));
+	  c0 = c0 / BITS (uword);
+
+ bm0 = hash_bitmap[c0];
+ drop0 = (bm0 & m0) != 0;
+
+ /* Mark it as seen. */
+ hash_bitmap[c0] = bm0 | m0;
+
+ from += 1;
+ n_left_from -= 1;
+ to_next_drop[0] = pi0;
+ to_next_drop += 1;
+ n_left_to_next_drop -= 1;
+
+ p0->error = node->errors[drop0 ? IP4_ARP_ERROR_DROP : IP4_ARP_ERROR_REQUEST_SENT];
+
+ if (drop0)
+ continue;
+
+ /*
+ * Can happen if the control-plane is programming tables
+ * with traffic flowing; at least that's today's lame excuse.
+ */
+ if (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP)
+ {
+ p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ];
+ }
+ else
+ /* Send ARP request. */
+ {
+ u32 bi0 = 0;
+ vlib_buffer_t * b0;
+ ethernet_arp_header_t * h0;
+ vnet_hw_interface_t * hw_if0;
+
+ h0 = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi0);
+
+ /* Add rewrite/encap string for ARP packet. */
+ vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t));
+
+ hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ /* Src ethernet address in ARP header. */
+ memcpy (h0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address,
+ sizeof (h0->ip4_over_ethernet[0].ethernet));
+
+ ip4_src_address_for_packet (im, p0, &h0->ip4_over_ethernet[0].ip4, sw_if_index0);
+
+ /* Copy in destination address we are requesting. */
+ h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32;
+
+ vlib_buffer_copy_trace_flag (vm, p0, bi0);
+ b0 = vlib_get_buffer (vm, bi0);
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
+
+ vlib_set_next_frame_buffer (vm, node, adj0->rewrite_header.next_index, bi0);
+ }
+ }
+
+ vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop);
+ }
+
+ return frame->n_vectors;
+}
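+
+/*
+ * A minimal stand-alone sketch of the request throttle above, assuming
+ * a 64-bit uword and substituting a toy mixer for hash_v3_finalize32;
+ * names are hypothetical and it compiles on its own, not against vnet.
+ * Two requests for the same (dst, interface) within one seed interval
+ * hash to the same bit, so the second one is dropped.
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+static uint64_t seen[256 / 64]; /* 256 one-bit buckets, like hash_bitmap */
+
+static uint32_t mix (uint32_t a, uint32_t b, uint32_t c)
+{
+  /* toy stand-in for hash_v3_finalize32 */
+  c ^= b; c -= (b << 14) | (b >> 18);
+  a ^= c; a -= (c << 11) | (c >> 21);
+  b ^= a; b -= (a << 25) | (a >> 7);
+  c ^= b; c -= (b << 16) | (b >> 16);
+  return c;
+}
+
+static int drop_duplicate (uint32_t dst, uint32_t sw_if_index,
+                           const uint32_t seed[3])
+{
+  uint32_t c = mix (seed[0] ^ dst, seed[1] ^ sw_if_index, seed[2]);
+  uint64_t m;
+  int drop;
+
+  c &= 255;             /* bucket in [0, 256) */
+  m = 1ULL << (c % 64); /* bit within the word */
+  drop = (seen[c / 64] & m) != 0;
+  seen[c / 64] |= m;    /* mark as seen */
+  return drop;
+}
+
+int main (void)
+{
+  uint32_t seed[3] = { 0x9e3779b9, 0x85ebca6b, 0xc2b2ae35 };
+  printf ("%d %d\n", drop_duplicate (0x0a000001, 1, seed),
+          drop_duplicate (0x0a000001, 1, seed)); /* prints: 0 1 */
+  return 0;
+}
+#endif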
+
+static char * ip4_arp_error_strings[] = {
+ [IP4_ARP_ERROR_DROP] = "address overflow drops",
+ [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent",
+ [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies",
+ [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed",
+ [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed",
+};
+
+VLIB_REGISTER_NODE (ip4_arp_node) = {
+ .function = ip4_arp,
+ .name = "ip4-arp",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_errors = ARRAY_LEN (ip4_arp_error_strings),
+ .error_strings = ip4_arp_error_strings,
+
+ .n_next_nodes = IP4_ARP_N_NEXT,
+ .next_nodes = {
+ [IP4_ARP_NEXT_DROP] = "error-drop",
+ },
+};
+
+#define foreach_notrace_ip4_arp_error \
+_(DROP) \
+_(REQUEST_SENT) \
+_(REPLICATE_DROP) \
+_(REPLICATE_FAIL)
+
+clib_error_t * arp_notrace_init (vlib_main_t * vm)
+{
+ vlib_node_runtime_t *rt =
+ vlib_node_get_runtime (vm, ip4_arp_node.index);
+
+ /* don't trace ARP request packets */
+#define _(a) \
+ vnet_pcap_drop_trace_filter_add_del \
+ (rt->errors[IP4_ARP_ERROR_##a], \
+ 1 /* is_add */);
+ foreach_notrace_ip4_arp_error;
+#undef _
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(arp_notrace_init);
+
+
+/* Send an ARP request to see if given destination is reachable on given interface. */
+clib_error_t *
+ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im = &ip4_main;
+ ethernet_arp_header_t * h;
+ ip4_address_t * src;
+ ip_interface_address_t * ia;
+ ip_adjacency_t * adj;
+ vnet_hw_interface_t * hi;
+ vnet_sw_interface_t * si;
+ vlib_buffer_t * b;
+ u32 bi = 0;
+
+ si = vnet_get_sw_interface (vnm, sw_if_index);
+
+ if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+ {
+ return clib_error_return (0, "%U: interface %U down",
+ format_ip4_address, dst,
+ format_vnet_sw_if_index_name, vnm,
+ sw_if_index);
+ }
+
+ src = ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia);
+ if (! src)
+ {
+ vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
+ return clib_error_return
+ (0, "no matching interface address for destination %U (interface %U)",
+ format_ip4_address, dst,
+ format_vnet_sw_if_index_name, vnm, sw_if_index);
+ }
+
+ adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);
+
+ h = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi);
+
+ hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address, sizeof (h->ip4_over_ethernet[0].ethernet));
+
+ h->ip4_over_ethernet[0].ip4 = src[0];
+ h->ip4_over_ethernet[1].ip4 = dst[0];
+
+ b = vlib_get_buffer (vm, bi);
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
+
+ /* Add encapsulation string for software interface (e.g. ethernet header). */
+ vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
+ vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
+
+ {
+ vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
+ u32 * to_next = vlib_frame_vector_args (f);
+ to_next[0] = bi;
+ f->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi->output_node_index, f);
+ }
+
+ return /* no error */ 0;
+}
+
+typedef enum {
+ IP4_REWRITE_NEXT_DROP,
+ IP4_REWRITE_NEXT_ARP,
+} ip4_rewrite_next_t;
+
+always_inline uword
+ip4_rewrite_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ int rewrite_for_locally_received_packets)
+{
+ ip_lookup_main_t * lm = &ip4_main.lookup_main;
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+ vlib_rx_or_tx_t adj_rx_tx = rewrite_for_locally_received_packets ? VLIB_RX : VLIB_TX;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ u32 cpu_index = os_get_cpu_number();
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ ip_adjacency_t * adj0, * adj1;
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ u32 pi0, rw_len0, next0, error0, checksum0, adj_index0;
+ u32 pi1, rw_len1, next1, error1, checksum1, adj_index1;
+ u32 next0_override, next1_override;
+
+ if (rewrite_for_locally_received_packets)
+ next0_override = next1_override = 0;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, STORE);
+ vlib_prefetch_buffer_header (p3, STORE);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
+ CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
+ adj_index1 = vnet_buffer (p1)->ip.adj_index[adj_rx_tx];
+
+ /* We should never rewrite a pkt using the MISS adjacency */
+ ASSERT(adj_index0 && adj_index1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ error0 = error1 = IP4_ERROR_NONE;
+
+ /* Decrement TTL & update checksum.
+ Works either endian, so no need for byte swap. */
+ if (! rewrite_for_locally_received_packets)
+ {
+ i32 ttl0 = ip0->ttl, ttl1 = ip1->ttl;
+
+ /* Input node should have rejected packets with ttl 0. */
+ ASSERT (ip0->ttl > 0);
+ ASSERT (ip1->ttl > 0);
+
+ checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
+ checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100);
+
+ checksum0 += checksum0 >= 0xffff;
+ checksum1 += checksum1 >= 0xffff;
+
+ ip0->checksum = checksum0;
+ ip1->checksum = checksum1;
+
+ ttl0 -= 1;
+ ttl1 -= 1;
+
+ ip0->ttl = ttl0;
+ ip1->ttl = ttl1;
+
+ error0 = ttl0 <= 0 ? IP4_ERROR_TIME_EXPIRED : error0;
+ error1 = ttl1 <= 0 ? IP4_ERROR_TIME_EXPIRED : error1;
+
+ /* Verify checksum. */
+ ASSERT (ip0->checksum == ip4_header_checksum (ip0));
+ ASSERT (ip1->checksum == ip4_header_checksum (ip1));
+ }
+
+ /* Rewrite packet header and update lengths. */
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+ if (rewrite_for_locally_received_packets)
+ {
+ /*
+ * If someone sends e.g. an icmp4 w/ src = dst = interface addr,
+ * we end up here with a local adjacency in hand
+ * The local adj rewrite data is 0xfefe on purpose.
+ * Bad engineer, no donut for you.
+ */
+ if (PREDICT_FALSE(adj0->lookup_next_index
+ == IP_LOOKUP_NEXT_LOCAL))
+ error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
+ if (PREDICT_FALSE(adj0->lookup_next_index
+ == IP_LOOKUP_NEXT_ARP))
+ next0_override = IP4_REWRITE_NEXT_ARP;
+ if (PREDICT_FALSE(adj1->lookup_next_index
+ == IP_LOOKUP_NEXT_LOCAL))
+ error1 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
+ if (PREDICT_FALSE(adj1->lookup_next_index
+ == IP_LOOKUP_NEXT_ARP))
+ next1_override = IP4_REWRITE_NEXT_ARP;
+ }
+
+ /* Worth pipelining. No guarantee that adj0,1 are hot... */
+ rw_len0 = adj0[0].rewrite_header.data_bytes;
+ rw_len1 = adj1[0].rewrite_header.data_bytes;
+ next0 = (error0 == IP4_ERROR_NONE)
+ ? adj0[0].rewrite_header.next_index : 0;
+
+ if (rewrite_for_locally_received_packets)
+ next0 = next0 && next0_override ? next0_override : next0;
+
+ next1 = (error1 == IP4_ERROR_NONE)
+ ? adj1[0].rewrite_header.next_index : 0;
+
+ if (rewrite_for_locally_received_packets)
+ next1 = next1 && next1_override ? next1_override : next1;
+
+ /*
+ * We've already accounted for an ethernet_header_t elsewhere
+ */
+ if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
+ vlib_increment_combined_counter
+ (&lm->adjacency_counters,
+ cpu_index, adj_index0,
+ /* packet increment */ 0,
+ /* byte increment */ rw_len0-sizeof(ethernet_header_t));
+
+ if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t)))
+ vlib_increment_combined_counter
+ (&lm->adjacency_counters,
+ cpu_index, adj_index1,
+ /* packet increment */ 0,
+ /* byte increment */ rw_len1-sizeof(ethernet_header_t));
+
+ /* Check MTU of outgoing interface. */
+ error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes
+ ? IP4_ERROR_MTU_EXCEEDED
+ : error0);
+ error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes
+ ? IP4_ERROR_MTU_EXCEEDED
+ : error1);
+
+ p0->current_data -= rw_len0;
+ p1->current_data -= rw_len1;
+
+ p0->current_length += rw_len0;
+ p1->current_length += rw_len1;
+
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = adj0[0].rewrite_header.sw_if_index;
+ vnet_buffer (p1)->sw_if_index[VLIB_TX] = adj1[0].rewrite_header.sw_if_index;
+
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+
+ /* Guess we are only writing on simple Ethernet header. */
+ vnet_rewrite_two_headers (adj0[0], adj1[0],
+ ip0, ip1,
+ sizeof (ethernet_header_t));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, pi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ ip_adjacency_t * adj0;
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ u32 pi0, rw_len0, adj_index0, next0, error0, checksum0;
+ u32 next0_override;
+
+ if (rewrite_for_locally_received_packets)
+ next0_override = 0;
+
+ pi0 = to_next[0] = from[0];
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
+
+ /* We should never rewrite a pkt using the MISS adjacency */
+ ASSERT(adj_index0);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ error0 = IP4_ERROR_NONE;
+ next0 = 0; /* drop on error */
+
+ /* Decrement TTL & update checksum. */
+ if (! rewrite_for_locally_received_packets)
+ {
+ i32 ttl0 = ip0->ttl;
+
+ checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
+
+ checksum0 += checksum0 >= 0xffff;
+
+ ip0->checksum = checksum0;
+
+ ASSERT (ip0->ttl > 0);
+
+ ttl0 -= 1;
+
+ ip0->ttl = ttl0;
+
+ ASSERT (ip0->checksum == ip4_header_checksum (ip0));
+
+ error0 = ttl0 <= 0 ? IP4_ERROR_TIME_EXPIRED : error0;
+ }
+
+ if (rewrite_for_locally_received_packets)
+ {
+ /*
+ * If someone sends e.g. an icmp4 w/ src = dst = interface addr,
+ * we end up here with a local adjacency in hand
+ * The local adj rewrite data is 0xfefe on purpose.
+ * Bad engineer, no donut for you.
+ */
+ if (PREDICT_FALSE(adj0->lookup_next_index
+ == IP_LOOKUP_NEXT_LOCAL))
+ error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS;
+ /*
+ * We have to override the next_index in ARP adjacencies,
+ * because they're set up for ip4-arp, not this node...
+ */
+ if (PREDICT_FALSE(adj0->lookup_next_index
+ == IP_LOOKUP_NEXT_ARP))
+ next0_override = IP4_REWRITE_NEXT_ARP;
+ }
+
+ /* Guess we are only writing on simple Ethernet header. */
+ vnet_rewrite_one_header (adj0[0], ip0,
+ sizeof (ethernet_header_t));
+
+ /* Update packet buffer attributes/set output interface. */
+ rw_len0 = adj0[0].rewrite_header.data_bytes;
+
+ if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t)))
+ vlib_increment_combined_counter
+ (&lm->adjacency_counters,
+ cpu_index, adj_index0,
+ /* packet increment */ 0,
+ /* byte increment */ rw_len0-sizeof(ethernet_header_t));
+
+ /* Check MTU of outgoing interface. */
+ error0 = (vlib_buffer_length_in_chain (vm, p0)
+ > adj0[0].rewrite_header.max_l3_packet_bytes
+ ? IP4_ERROR_MTU_EXCEEDED
+ : error0);
+
+ p0->error = error_node->errors[error0];
+ p0->current_data -= rw_len0;
+ p0->current_length += rw_len0;
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] =
+ adj0[0].rewrite_header.sw_if_index;
+
+ next0 = (error0 == IP4_ERROR_NONE)
+ ? adj0[0].rewrite_header.next_index : 0;
+
+ if (rewrite_for_locally_received_packets)
+ next0 = next0 && next0_override ? next0_override : next0;
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ /* Need to do trace after rewrites to pick up new packet data. */
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip4_forward_next_trace (vm, node, frame, adj_rx_tx);
+
+ return frame->n_vectors;
+}
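+
+/*
+ * Stand-alone check of the TTL/checksum update above (RFC 1141-style
+ * incremental update): TTL occupies the high-order byte of one 16-bit
+ * header word, so decrementing it subtracts 0x0100 from the one's
+ * complement sum, i.e. adds 0x0100 (with end-around carry) to the
+ * stored checksum. Simplified sketch, illustration only.
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <arpa/inet.h> /* htons */
+
+static uint16_t cksum (const uint8_t * p, int n_bytes)
+{
+  uint32_t sum = 0;
+  uint16_t w;
+  int i;
+
+  for (i = 0; i < n_bytes; i += 2)
+    {
+      memcpy (&w, p + i, 2);
+      sum += w;
+    }
+  while (sum >> 16)
+    sum = (sum & 0xffff) + (sum >> 16);
+  return ~sum & 0xffff;
+}
+
+int main (void)
+{
+  uint8_t h[20] = { 0x45, 0, 0, 40, 0, 0, 0, 0, 64 /* ttl */, 6,
+                    0, 0, 10, 0, 0, 1, 10, 0, 0, 2 };
+  uint16_t c = cksum (h, 20), old, patched, full;
+  uint32_t c0;
+
+  memcpy (h + 10, &c, 2); /* store the initial checksum */
+
+  h[8] -= 1;              /* decrement TTL: 64 -> 63 */
+  memcpy (&old, h + 10, 2);
+  c0 = old + htons (0x0100);
+  c0 += c0 >= 0xffff;     /* fold the end-around carry */
+  patched = (uint16_t) c0;
+
+  memset (h + 10, 0, 2);  /* recompute from scratch to compare */
+  full = cksum (h, 20);
+  printf ("patched %04x, recomputed %04x: %s\n", patched, full,
+          patched == full ? "match" : "MISMATCH");
+  return 0;
+}
+#endif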
+
+static uword
+ip4_rewrite_transit (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip4_rewrite_inline (vm, node, frame,
+ /* rewrite_for_locally_received_packets */ 0);
+}
+
+static uword
+ip4_rewrite_local (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip4_rewrite_inline (vm, node, frame,
+ /* rewrite_for_locally_received_packets */ 1);
+}
+
+VLIB_REGISTER_NODE (ip4_rewrite_node) = {
+ .function = ip4_rewrite_transit,
+ .name = "ip4-rewrite-transit",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = 2,
+ .next_nodes = {
+ [IP4_REWRITE_NEXT_DROP] = "error-drop",
+ [IP4_REWRITE_NEXT_ARP] = "ip4-arp",
+ },
+};
+
+VLIB_REGISTER_NODE (ip4_rewrite_local_node,static) = {
+ .function = ip4_rewrite_local,
+ .name = "ip4-rewrite-local",
+ .vector_size = sizeof (u32),
+
+ .sibling_of = "ip4-rewrite-transit",
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = 2,
+ .next_nodes = {
+ [IP4_REWRITE_NEXT_DROP] = "error-drop",
+ [IP4_REWRITE_NEXT_ARP] = "ip4-arp",
+ },
+};
+
+static clib_error_t *
+add_del_interface_table (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index, table_id;
+
+ sw_if_index = ~0;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (unformat (input, "%d", &table_id))
+ ;
+ else
+ {
+ error = clib_error_return (0, "expected table id `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ {
+ ip4_main_t * im = &ip4_main;
+ ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
+
+ if (fib)
+ {
+ vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+ im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
+ }
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
+ .path = "set interface ip table",
+ .function = add_del_interface_table,
+ .short_help = "Add/delete FIB table id for interface",
+};
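+
+/*
+ * Hypothetical usage; the handler above only binds the interface when
+ * the table lookup succeeds, so FIB table 7 must already exist:
+ *
+ * vpp# set interface ip table GigabitEthernet2/0/0 7
+ */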
+
+
+static uword
+ip4_lookup_multicast (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+ u32 n_left_from, n_left_to_next, * from, * to_next;
+ ip_lookup_next_t next;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ u32 pi0, pi1, adj_index0, adj_index1, wrong_next;
+ ip_lookup_next_t next0, next1;
+ ip4_header_t * ip0, * ip1;
+ ip_adjacency_t * adj0, * adj1;
+ u32 fib_index0, fib_index1;
+ u32 flow_hash_config0, flow_hash_config1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
+ fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+ fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
+
+ adj_index0 = ip4_fib_lookup_buffer (im, fib_index0,
+ &ip0->dst_address, p0);
+ adj_index1 = ip4_fib_lookup_buffer (im, fib_index1,
+ &ip1->dst_address, p1);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+ next0 = adj0->lookup_next_index;
+ next1 = adj1->lookup_next_index;
+
+ flow_hash_config0 =
+ vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+
+ flow_hash_config1 =
+ vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
+
+ vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash
+ (ip0, flow_hash_config0);
+
+ vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash
+ (ip1, flow_hash_config1);
+
+ ASSERT (adj0->n_adj > 0);
+ ASSERT (adj1->n_adj > 0);
+ ASSERT (is_pow2 (adj0->n_adj));
+ ASSERT (is_pow2 (adj1->n_adj));
+ adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+ adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1));
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+ vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+
+ if (1) /* $$$$$$ HACK FIXME */
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index0, 1,
+ vlib_buffer_length_in_chain (vm, p0));
+ if (1) /* $$$$$$ HACK FIXME */
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index1, 1,
+ vlib_buffer_length_in_chain (vm, p1));
+
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ wrong_next = (next0 != next) + 2*(next1 != next);
+ if (PREDICT_FALSE (wrong_next != 0))
+ {
+ switch (wrong_next)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = pi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ break;
+
+ case 3:
+ /* A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ if (next0 == next1)
+ {
+ /* A B B */
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ next = next1;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ u32 pi0, adj_index0;
+ ip_lookup_next_t next0;
+ ip_adjacency_t * adj0;
+ u32 fib_index0;
+ u32 flow_hash_config0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+
+ adj_index0 = ip4_fib_lookup_buffer (im, fib_index0,
+ &ip0->dst_address, p0);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ next0 = adj0->lookup_next_index;
+
+ flow_hash_config0 =
+ vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+
+ vnet_buffer (p0)->ip.flow_hash =
+ ip4_compute_flow_hash (ip0, flow_hash_config0);
+
+ ASSERT (adj0->n_adj > 0);
+ ASSERT (is_pow2 (adj0->n_adj));
+ adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+ if (1) /* $$$$$$ HACK FIXME */
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index0, 1,
+ vlib_buffer_length_in_chain (vm, p0));
+
+ from += 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_left_from -= 1;
+
+ if (PREDICT_FALSE (next0 != next))
+ {
+ n_left_to_next += 1;
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ next = next0;
+ vlib_get_next_frame (vm, node, next,
+ to_next, n_left_to_next);
+ to_next[0] = pi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
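+
+/*
+ * Stand-alone sketch of the multipath bucket pick above: with n_adj a
+ * power of two (as asserted), (flow_hash & (n_adj - 1)) selects one of
+ * n_adj consecutive adjacencies without a divide, and a stable flow
+ * hash keeps each flow pinned to one path. Illustration only.
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+int main (void)
+{
+  uint32_t base_adj_index = 100, n_adj = 4; /* 4-way multipath */
+  uint32_t hashes[] = { 0x1234, 0xdeadbeef, 0x7, 0xffffffff };
+  int i;
+
+  for (i = 0; i < 4; i++)
+    printf ("flow hash 0x%08x -> adjacency %u\n", hashes[i],
+            base_adj_index + (hashes[i] & (n_adj - 1)));
+  return 0;
+}
+#endif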
+
+VLIB_REGISTER_NODE (ip4_lookup_multicast_node,static) = {
+ .function = ip4_lookup_multicast,
+ .name = "ip4-lookup-multicast",
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip4_multicast_node,static) = {
+ .function = ip4_drop,
+ .name = "ip4-multicast",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip4_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0)
+{
+ ip4_main_t * im = &ip4_main;
+ ip4_fib_mtrie_t * mtrie0;
+ ip4_fib_mtrie_leaf_t leaf0;
+ u32 adj_index0;
+
+ mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie;
+
+ leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0);
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1);
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2);
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3);
+
+ /* Handle default route. */
+ leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0);
+
+ adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+
+ return adj_index0 == ip4_fib_lookup_with_table (im, fib_index0,
+ a,
+ /* no_default_route */ 0);
+}
+
+static clib_error_t *
+test_lookup_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 table_id = 0;
+ f64 count = 1;
+ u32 n;
+ int i;
+ ip4_address_t ip4_base_address;
+ u64 errors = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "table %d", &table_id))
+ ;
+ else if (unformat (input, "count %f", &count))
+ ;
+
+ else if (unformat (input, "%U",
+ unformat_ip4_address, &ip4_base_address))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ n = count;
+
+ for (i = 0; i < n; i++)
+ {
+ if (!ip4_lookup_validate (&ip4_base_address, table_id))
+ errors++;
+
+ ip4_base_address.as_u32 =
+ clib_host_to_net_u32 (1 +
+ clib_net_to_host_u32 (ip4_base_address.as_u32));
+ }
+
+ if (errors)
+ vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n);
+ else
+ vlib_cli_output (vm, "No errors in %d lookups\n", n);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (lookup_test_command, static) = {
+ .path = "test lookup",
+ .short_help = "test lookup",
+ .function = test_lookup_command_fn,
+};
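+
+/*
+ * Hypothetical usage: walk 1000 consecutive addresses starting at
+ * 10.0.0.1, cross-checking the mtrie against the full table lookup:
+ *
+ * vpp# test lookup 10.0.0.1 table 0 count 1000
+ */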
+
+int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config)
+{
+ ip4_main_t * im4 = &ip4_main;
+ ip4_fib_t * fib;
+ uword * p = hash_get (im4->fib_index_by_table_id, table_id);
+
+ if (p == 0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ fib = vec_elt_at_index (im4->fibs, p[0]);
+
+ fib->flow_hash_config = flow_hash_config;
+ return 0;
+}
+
+static clib_error_t *
+set_ip_flow_hash_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int matched = 0;
+ u32 table_id = 0;
+ u32 flow_hash_config = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "table %d", &table_id))
+ matched = 1;
+#define _(a,v) \
+ else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
+ foreach_flow_hash_bit
+#undef _
+ else break;
+ }
+
+ if (matched == 0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "no such FIB table %d", table_id);
+
+ default:
+ clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
+ break;
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) = {
+ .path = "set ip flow-hash",
+ .short_help =
+ "set ip flow-hash table <fib-id> src dst sport dport proto reverse",
+ .function = set_ip_flow_hash_command_fn,
+};
+
+int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
+ u32 table_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ ip4_main_t * ipm = &ip4_main;
+ ip_lookup_main_t * lm = &ipm->lookup_main;
+ vnet_classify_main_t * cm = &vnet_classify_main;
+
+ if (pool_is_free_index (im->sw_interfaces, sw_if_index))
+ return VNET_API_ERROR_NO_MATCHING_INTERFACE;
+
+ if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
+ lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index;
+
+ return 0;
+}
+
+static clib_error_t *
+set_ip_classify_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 table_index = ~0;
+ int table_index_set = 0;
+ u32 sw_if_index = ~0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "table-index %d", &table_index))
+ table_index_set = 1;
+ else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
+ vnet_get_main(), &sw_if_index))
+ ;
+ else
+ break;
+ }
+
+ if (table_index_set == 0)
+ return clib_error_return (0, "classify table-index must be specified");
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "interface / subif must be specified");
+
+ rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_MATCHING_INTERFACE:
+ return clib_error_return (0, "No such interface");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "No such classifier table");
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ip_classify_command, static) = {
+ .path = "set ip classify",
+ .short_help =
+ "set ip classify intfc <int> table-index <index>",
+ .function = set_ip_classify_command_fn,
+};
+
diff --git a/vnet/vnet/ip/ip4_hop_by_hop.c b/vnet/vnet/ip/ip4_hop_by_hop.c
new file mode 100644
index 00000000000..ee2bcc0ae75
--- /dev/null
+++ b/vnet/vnet/ip/ip4_hop_by_hop.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vppinfra/elog.h>
+
+typedef struct {
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} ip4_hop_by_hop_main_t;
+
+ip4_hop_by_hop_main_t ip4_hop_by_hop_main;
+
+vlib_node_registration_t ip4_hop_by_hop_node;
+
+typedef struct {
+ u32 next_index;
+} ip4_hop_by_hop_trace_t;
+
+/* packet trace format function */
+static u8 * format_ip4_hop_by_hop_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip4_hop_by_hop_trace_t * t = va_arg (*args, ip4_hop_by_hop_trace_t *);
+
+ s = format (s, "IP4_HOP_BY_HOP: next index %d",
+ t->next_index);
+ return s;
+}
+
+
+#define foreach_ip4_hop_by_hop_error \
+_(PROCESSED, "Pkts with ip4 hop-by-hop options")
+
+typedef enum {
+#define _(sym,str) IP4_HOP_BY_HOP_ERROR_##sym,
+ foreach_ip4_hop_by_hop_error
+#undef _
+ IP4_HOP_BY_HOP_N_ERROR,
+} ip4_hop_by_hop_error_t;
+
+static char * ip4_hop_by_hop_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip4_hop_by_hop_error
+#undef _
+};
+
+static uword
+ip4_hop_by_hop_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 n_left_from, * from, * to_next;
+ ip_lookup_next_t next_index;
+ u32 processed = 0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = IP4_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = IP4_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8 tmp0[6], tmp1[6];
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* $$$$$ Dual loop: process 2 x packets here $$$$$ */
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+ ip0 = vlib_buffer_get_current (b0);
+ ip1 = vlib_buffer_get_current (b1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* $$$$$ End of processing 2 x packets $$$$$ */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip4_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip4_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 adj_index0;
+ ip_adjacency_t * adj0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ /* $$$$$$$$$$$$ process one (or more) hop-by-hop header(s) here */
+
+
+ /* $$$$$$$$$$$$ */
+
+ /* Send the packet e.g. to ip4_rewrite */
+ next0 = adj0->lookup_next_index;
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ip4_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ }
+
+ processed++;
+
+ /* $$$$$ Done processing 1 packet here $$$$$ */
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ip4_hop_by_hop_node.index,
+ IP4_HOP_BY_HOP_ERROR_PROCESSED, processed);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip4_hop_by_hop_node) = {
+ .function = ip4_hop_by_hop_node_fn,
+ .name = "ip4-hop-by-hop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip4_hop_by_hop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ip4_hop_by_hop_error_strings),
+ .error_strings = ip4_hop_by_hop_error_strings,
+
+ /* See ip/lookup.h */
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", /* probably not */
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip4_add_hop_by_hop_node) = {
+ .function = ip4_hop_by_hop_node_fn,
+ .name = "ip4-add-hop-by-hop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip4_hop_by_hop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ip4_hop_by_hop_error_strings),
+ .error_strings = ip4_hop_by_hop_error_strings,
+
+ /* See ip/lookup.h */
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", /* probably not */
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip4_pop_hop_by_hop_node) = {
+ .function = ip4_hop_by_hop_node_fn,
+ .name = "ip4-pop-hop-by-hop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip4_hop_by_hop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ip4_hop_by_hop_error_strings),
+ .error_strings = ip4_hop_by_hop_error_strings,
+
+ /* See ip/lookup.h */
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", /* probably not */
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+ },
+};
+
+static clib_error_t *
+ip4_hop_by_hop_init (vlib_main_t * vm)
+{
+ ip4_hop_by_hop_main_t * hm = &ip4_hop_by_hop_main;
+
+ hm->vlib_main = vm;
+ hm->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip4_hop_by_hop_init);
diff --git a/vnet/vnet/ip/ip4_input.c b/vnet/vnet/ip/ip4_input.c
new file mode 100644
index 00000000000..68edc0fa918
--- /dev/null
+++ b/vnet/vnet/ip/ip4_input.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_input.c: IP v4 input node
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ppp/ppp.h>
+#include <vnet/hdlc/hdlc.h>
+
+typedef struct {
+ u8 packet_data[64];
+} ip4_input_trace_t;
+
+static u8 * format_ip4_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ip4_input_trace_t * t = va_arg (*va, ip4_input_trace_t *);
+
+ s = format (s, "%U",
+ format_ip4_header,
+ t->packet_data, sizeof (t->packet_data));
+
+ return s;
+}
+
+typedef enum {
+ IP4_INPUT_NEXT_DROP,
+ IP4_INPUT_NEXT_PUNT,
+ IP4_INPUT_NEXT_LOOKUP,
+ IP4_INPUT_NEXT_LOOKUP_MULTICAST,
+ IP4_INPUT_NEXT_TTL_EXPIRE,
+ IP4_INPUT_N_NEXT,
+} ip4_input_next_t;
+
+/* Validate IP v4 packets and pass them either to forwarding code
+ or drop/punt exception packets. */
+always_inline uword
+ip4_input_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ int verify_checksum)
+{
+ ip4_main_t * im = &ip4_main;
+ vnet_main_t * vnm = vnet_get_main();
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 n_left_from, * from, * to_next;
+ ip4_input_next_t next_index;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+ vlib_simple_counter_main_t * cm;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (ip4_input_trace_t));
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_IP4);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ ip_config_main_t * cm0, * cm1;
+ u32 sw_if_index0, pi0, ip_len0, cur_len0, next0;
+ u32 sw_if_index1, pi1, ip_len1, cur_len1, next1;
+ i32 len_diff0, len_diff1;
+ u8 error0, error1, cast0, cast1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD);
+ }
+
+ to_next[0] = pi0 = from[0];
+ to_next[1] = pi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX];
+
+ cast0 = ip4_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+ cast1 = ip4_address_is_multicast (&ip1->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+
+ cm0 = lm->rx_config_mains + cast0;
+ cm1 = lm->rx_config_mains + cast1;
+
+ vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0);
+ vnet_buffer (p1)->ip.current_config_index = vec_elt (cm1->config_index_by_sw_if_index, sw_if_index1);
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
+ vnet_buffer (p1)->ip.adj_index[VLIB_RX] = ~0;
+
+ vnet_get_config_data (&cm0->config_main,
+ &vnet_buffer (p0)->ip.current_config_index,
+ &next0,
+ /* # bytes of config data */ 0);
+ vnet_get_config_data (&cm1->config_main,
+ &vnet_buffer (p1)->ip.current_config_index,
+ &next1,
+ /* # bytes of config data */ 0);
+
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+
+ error0 = error1 = IP4_ERROR_NONE;
+
+ /* Punt packets with options. */
+ error0 = (ip0->ip_version_and_header_length & 0xf) != 5 ? IP4_ERROR_OPTIONS : error0;
+ error1 = (ip1->ip_version_and_header_length & 0xf) != 5 ? IP4_ERROR_OPTIONS : error1;
+
+ /* Version != 4? Drop it. */
+ error0 = (ip0->ip_version_and_header_length >> 4) != 4 ? IP4_ERROR_VERSION : error0;
+ error1 = (ip1->ip_version_and_header_length >> 4) != 4 ? IP4_ERROR_VERSION : error1;
+
+ /* Verify header checksum. */
+ if (verify_checksum)
+ {
+ ip_csum_t sum0, sum1;
+
+ ip4_partial_header_checksum_x1 (ip0, sum0);
+ ip4_partial_header_checksum_x1 (ip1, sum1);
+
+ error0 = 0xffff != ip_csum_fold (sum0) ? IP4_ERROR_BAD_CHECKSUM : error0;
+ error1 = 0xffff != ip_csum_fold (sum1) ? IP4_ERROR_BAD_CHECKSUM : error1;
+ }
+
+ /* Drop fragmentation offset 1 packets. */
+ error0 = ip4_get_fragment_offset (ip0) == 1 ? IP4_ERROR_FRAGMENT_OFFSET_ONE : error0;
+ error1 = ip4_get_fragment_offset (ip1) == 1 ? IP4_ERROR_FRAGMENT_OFFSET_ONE : error1;
+
+ /* TTL <= 1? Drop it. */
+ error0 = (ip0->ttl <= 1 && cast0 == VNET_UNICAST) ? IP4_ERROR_TIME_EXPIRED : error0;
+ error1 = (ip1->ttl <= 1 && cast1 == VNET_UNICAST) ? IP4_ERROR_TIME_EXPIRED : error1;
+
+ /* Verify lengths. */
+ ip_len0 = clib_net_to_host_u16 (ip0->length);
+ ip_len1 = clib_net_to_host_u16 (ip1->length);
+
+ /* IP length must be at least minimal IP header. */
+ error0 = ip_len0 < sizeof (ip0[0]) ? IP4_ERROR_TOO_SHORT : error0;
+ error1 = ip_len1 < sizeof (ip1[0]) ? IP4_ERROR_TOO_SHORT : error1;
+
+ cur_len0 = vlib_buffer_length_in_chain (vm, p0);
+ cur_len1 = vlib_buffer_length_in_chain (vm, p1);
+
+ len_diff0 = cur_len0 - ip_len0;
+ len_diff1 = cur_len1 - ip_len1;
+
+ error0 = len_diff0 < 0 ? IP4_ERROR_BAD_LENGTH : error0;
+ error1 = len_diff1 < 0 ? IP4_ERROR_BAD_LENGTH : error1;
+
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+
+ if (PREDICT_FALSE(error0 != IP4_ERROR_NONE))
+ {
+ next0 = (error0 != IP4_ERROR_OPTIONS
+ ? (error0 == IP4_ERROR_TIME_EXPIRED
+ ? IP4_INPUT_NEXT_TTL_EXPIRE
+ : IP4_INPUT_NEXT_DROP)
+ : IP4_INPUT_NEXT_PUNT);
+ }
+ if (PREDICT_FALSE(error1 != IP4_ERROR_NONE))
+ {
+ next1 = (error1 != IP4_ERROR_OPTIONS
+ ? (error1 == IP4_ERROR_TIME_EXPIRED
+ ? IP4_INPUT_NEXT_TTL_EXPIRE
+ : IP4_INPUT_NEXT_DROP)
+ : IP4_INPUT_NEXT_PUNT);
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, pi1, next0, next1);
+ }
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ ip_config_main_t * cm0;
+ u32 sw_if_index0, pi0, ip_len0, cur_len0, next0;
+ i32 len_diff0;
+ u8 error0, cast0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ ip0 = vlib_buffer_get_current (p0);
+
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+
+ cast0 = ip4_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+ cm0 = lm->rx_config_mains + cast0;
+ vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0);
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
+ vnet_get_config_data (&cm0->config_main,
+ &vnet_buffer (p0)->ip.current_config_index,
+ &next0,
+ /* # bytes of config data */ 0);
+
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+
+ error0 = IP4_ERROR_NONE;
+
+ /* Punt packets with options. */
+ error0 = (ip0->ip_version_and_header_length & 0xf) != 5 ? IP4_ERROR_OPTIONS : error0;
+
+ /* Version != 4? Drop it. */
+ error0 = (ip0->ip_version_and_header_length >> 4) != 4 ? IP4_ERROR_VERSION : error0;
+
+ /* Verify header checksum. */
+ if (verify_checksum)
+ {
+ ip_csum_t sum0;
+
+ ip4_partial_header_checksum_x1 (ip0, sum0);
+ error0 = 0xffff != ip_csum_fold (sum0) ? IP4_ERROR_BAD_CHECKSUM : error0;
+ }
+
+ /* Drop fragmentation offset 1 packets. */
+ error0 = ip4_get_fragment_offset (ip0) == 1 ? IP4_ERROR_FRAGMENT_OFFSET_ONE : error0;
+
+ /* TTL <= 1? Drop it. */
+ error0 = (ip0->ttl <= 1 && cast0 == VNET_UNICAST) ? IP4_ERROR_TIME_EXPIRED : error0;
+
+ /* Verify lengths. */
+ ip_len0 = clib_net_to_host_u16 (ip0->length);
+
+ /* IP length must be at least minimal IP header. */
+ error0 = ip_len0 < sizeof (ip0[0]) ? IP4_ERROR_TOO_SHORT : error0;
+
+ cur_len0 = vlib_buffer_length_in_chain (vm, p0);
+ len_diff0 = cur_len0 - ip_len0;
+ error0 = len_diff0 < 0 ? IP4_ERROR_BAD_LENGTH : error0;
+
+ p0->error = error_node->errors[error0];
+ if (PREDICT_FALSE(error0 != IP4_ERROR_NONE))
+ {
+ next0 = (error0 != IP4_ERROR_OPTIONS
+ ? (error0 == IP4_ERROR_TIME_EXPIRED
+ ? IP4_INPUT_NEXT_TTL_EXPIRE
+ : IP4_INPUT_NEXT_DROP)
+ : IP4_INPUT_NEXT_PUNT);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
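+
+/*
+ * Stand-alone distillation of the checks above. Note the node lets a
+ * later failing check overwrite error0, so the last failure wins;
+ * this sketch collapses that into early returns for brevity.
+ * Simplified fields, illustration only.
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+enum { V_OK, V_PUNT_OPTIONS, V_DROP_VERSION, V_DROP_TTL, V_DROP_LENGTH };
+
+static int validate (uint8_t ver_ihl, uint8_t ttl, uint16_t ip_len,
+                     uint32_t bytes_in_buffer)
+{
+  if ((ver_ihl & 0xf) != 5)     return V_PUNT_OPTIONS; /* options present */
+  if ((ver_ihl >> 4) != 4)      return V_DROP_VERSION;
+  if (ttl <= 1)                 return V_DROP_TTL;     /* unicast case */
+  if (ip_len < 20)              return V_DROP_LENGTH;  /* < minimal header */
+  if (bytes_in_buffer < ip_len) return V_DROP_LENGTH;  /* truncated */
+  return V_OK;
+}
+
+int main (void)
+{
+  printf ("%d %d %d\n",
+          validate (0x45, 64, 40, 40), /* 0: forwarded */
+          validate (0x46, 64, 44, 44), /* 1: options -> punt */
+          validate (0x45, 1, 40, 40)); /* 3: ttl expired */
+  return 0;
+}
+#endif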
+
+static uword
+ip4_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip4_input_inline (vm, node, frame, /* verify_checksum */ 1);
+}
+
+static uword
+ip4_input_no_checksum (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip4_input_inline (vm, node, frame, /* verify_checksum */ 0);
+}
+
+static char * ip4_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip4_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (ip4_input_node) = {
+ .function = ip4_input,
+ .name = "ip4-input",
+ .vector_size = sizeof (u32),
+
+ .n_errors = IP4_N_ERROR,
+ .error_strings = ip4_error_strings,
+
+ .n_next_nodes = IP4_INPUT_N_NEXT,
+ .next_nodes = {
+ [IP4_INPUT_NEXT_DROP] = "error-drop",
+ [IP4_INPUT_NEXT_PUNT] = "error-punt",
+ [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
+ [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-lookup-multicast",
+ [IP4_INPUT_NEXT_TTL_EXPIRE] = "ip4-icmp-ttl-expire",
+ },
+
+ .format_buffer = format_ip4_header,
+ .format_trace = format_ip4_input_trace,
+};
+
+VLIB_REGISTER_NODE (ip4_input_no_checksum_node,static) = {
+ .function = ip4_input_no_checksum,
+ .name = "ip4-input-no-checksum",
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = IP4_INPUT_N_NEXT,
+ .next_nodes = {
+ [IP4_INPUT_NEXT_DROP] = "error-drop",
+ [IP4_INPUT_NEXT_PUNT] = "error-punt",
+ [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
+ [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-lookup-multicast",
+ [IP4_INPUT_NEXT_TTL_EXPIRE] = "ip4-icmp-ttl-expire",
+ },
+
+ .format_buffer = format_ip4_header,
+ .format_trace = format_ip4_input_trace,
+};
+
+static clib_error_t * ip4_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+
+ ethernet_register_input_type (vm, ETHERNET_TYPE_IP4,
+ ip4_input_node.index);
+ ppp_register_input_protocol (vm, PPP_PROTOCOL_ip4,
+ ip4_input_node.index);
+ hdlc_register_input_protocol (vm, HDLC_PROTOCOL_ip4,
+ ip4_input_node.index);
+
+ {
+ pg_node_t * pn;
+ pn = pg_get_node (ip4_input_node.index);
+ pn->unformat_edit = unformat_pg_ip4_header;
+ pn = pg_get_node (ip4_input_no_checksum_node.index);
+ pn->unformat_edit = unformat_pg_ip4_header;
+ }
+
+ if ((error = vlib_call_init_function (vm, ip4_cli_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip4_source_check_init)))
+ return error;
+
+ /* Set flow hash to something non-zero. */
+ ip4_main.flow_hash_seed = 0xdeadbeef;
+
+ /* Default TTL for packets we generate. */
+ ip4_main.host_config.ttl = 64;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (ip4_init);
diff --git a/vnet/vnet/ip/ip4_mtrie.c b/vnet/vnet/ip/ip4_mtrie.c
new file mode 100644
index 00000000000..ed4a0d9f44f
--- /dev/null
+++ b/vnet/vnet/ip/ip4_mtrie.c
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_fib.h: ip4 mtrie fib
+ *
+ * Copyright (c) 2012 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+static void
+ply_init (ip4_fib_mtrie_ply_t * p, ip4_fib_mtrie_leaf_t init, uword prefix_len)
+{
+ p->n_non_empty_leafs = ip4_fib_mtrie_leaf_is_empty (init) ? 0 : ARRAY_LEN (p->leaves);
+ memset (p->dst_address_bits_of_leaves, prefix_len, sizeof (p->dst_address_bits_of_leaves));
+
+ /* Initialize leaves. */
+#ifdef CLIB_HAVE_VEC128
+ {
+ u32x4 * l, init_x4;
+
+#ifndef __ALTIVEC__
+ init_x4 = u32x4_splat (init);
+#else
+ {
+ u32x4_union_t y;
+ y.as_u32[0] = init;
+ y.as_u32[1] = init;
+ y.as_u32[2] = init;
+ y.as_u32[3] = init;
+ init_x4 = y.as_u32x4;
+ }
+#endif
+
+ for (l = p->leaves_as_u32x4; l < p->leaves_as_u32x4 + ARRAY_LEN (p->leaves_as_u32x4); l += 4)
+ {
+ l[0] = init_x4;
+ l[1] = init_x4;
+ l[2] = init_x4;
+ l[3] = init_x4;
+ }
+ }
+#else
+ {
+ u32 * l;
+
+ for (l = p->leaves; l < p->leaves + ARRAY_LEN (p->leaves); l += 4)
+ {
+ l[0] = init;
+ l[1] = init;
+ l[2] = init;
+ l[3] = init;
+ }
+ }
+#endif
+}
+
+static ip4_fib_mtrie_leaf_t
+ply_create (ip4_fib_mtrie_t * m, ip4_fib_mtrie_leaf_t init_leaf, uword prefix_len)
+{
+ ip4_fib_mtrie_ply_t * p;
+
+ /* Get cache aligned ply. */
+ pool_get_aligned (m->ply_pool, p, sizeof (p[0]));
+
+ ply_init (p, init_leaf, prefix_len);
+ return ip4_fib_mtrie_leaf_set_next_ply_index (p - m->ply_pool);
+}
+
+always_inline ip4_fib_mtrie_ply_t *
+get_next_ply_for_leaf (ip4_fib_mtrie_t * m, ip4_fib_mtrie_leaf_t l)
+{
+ uword n = ip4_fib_mtrie_leaf_get_next_ply_index (l);
+ /* It better not be the root ply. */
+ ASSERT (n != 0);
+ return pool_elt_at_index (m->ply_pool, n);
+}
+
+static void
+ply_free (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p)
+{
+ uword i, is_root;
+
+ is_root = p - m->ply_pool == 0;
+
+ for (i = 0 ; i < ARRAY_LEN (p->leaves); i++)
+ {
+ ip4_fib_mtrie_leaf_t l = p->leaves[i];
+ if (ip4_fib_mtrie_leaf_is_next_ply (l))
+ ply_free (m, get_next_ply_for_leaf (m, l));
+ }
+
+ if (is_root)
+ ply_init (p, IP4_FIB_MTRIE_LEAF_EMPTY, /* prefix_len */ 0);
+ else
+ pool_put (m->ply_pool, p);
+}
+
+void ip4_fib_free (ip4_fib_mtrie_t * m)
+{
+ ip4_fib_mtrie_ply_t * root_ply = pool_elt_at_index (m->ply_pool, 0);
+ ply_free (m, root_ply);
+}
+
+u32 ip4_mtrie_lookup_address (ip4_fib_mtrie_t * m, ip4_address_t dst)
+{
+ ip4_fib_mtrie_ply_t * p = pool_elt_at_index (m->ply_pool, 0);
+ ip4_fib_mtrie_leaf_t l;
+
+ l = p->leaves[dst.as_u8[0]];
+ if (ip4_fib_mtrie_leaf_is_terminal (l))
+ return ip4_fib_mtrie_leaf_get_adj_index (l);
+
+ p = get_next_ply_for_leaf (m, l);
+ l = p->leaves[dst.as_u8[1]];
+ if (ip4_fib_mtrie_leaf_is_terminal (l))
+ return ip4_fib_mtrie_leaf_get_adj_index (l);
+
+ p = get_next_ply_for_leaf (m, l);
+ l = p->leaves[dst.as_u8[2]];
+ if (ip4_fib_mtrie_leaf_is_terminal (l))
+ return ip4_fib_mtrie_leaf_get_adj_index (l);
+
+ p = get_next_ply_for_leaf (m, l);
+ l = p->leaves[dst.as_u8[3]];
+
+ ASSERT (ip4_fib_mtrie_leaf_is_terminal (l));
+ return ip4_fib_mtrie_leaf_get_adj_index (l);
+}
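+
+/*
+ * Toy two-level, 8-bit-stride trie mirroring the walk above: a leaf is
+ * either terminal (it carries a result and the walk stops early) or it
+ * names a deeper 256-way ply indexed by the next address byte. Toy
+ * leaf encoding, illustration only.
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+#define TERMINAL 0x80000000u /* toy encoding: high bit set = terminal */
+
+static uint32_t ply1[256]; /* root ply, indexed by the first byte */
+static uint32_t ply2[256]; /* one second-level ply, second byte */
+
+static uint32_t lookup (uint8_t b0, uint8_t b1)
+{
+  uint32_t l = ply1[b0];
+  if (l & TERMINAL)
+    return l & ~TERMINAL; /* a covering prefix ends at this ply */
+  return ply2[b1] & ~TERMINAL; /* descend; terminal by construction here */
+}
+
+int main (void)
+{
+  int i;
+
+  for (i = 0; i < 256; i++)
+    {
+      ply1[i] = TERMINAL | 0; /* default route -> result 0 */
+      ply2[i] = TERMINAL | 1; /* 10.0.0.0/8 -> result 1 */
+    }
+  ply1[10] = 0;            /* 10.x.y.z: non-terminal, go to ply2 */
+  ply2[1] = TERMINAL | 42; /* 10.1.0.0/16 -> result 42 */
+
+  printf ("%u %u %u\n", lookup (10, 1), lookup (10, 2), lookup (192, 0));
+  /* prints: 42 1 0 */
+  return 0;
+}
+#endif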
+
+typedef struct {
+ ip4_address_t dst_address;
+ u32 dst_address_length;
+ u32 adj_index;
+} ip4_fib_mtrie_set_unset_leaf_args_t;
+
+static void
+set_ply_with_more_specific_leaf (ip4_fib_mtrie_t * m,
+ ip4_fib_mtrie_ply_t * ply,
+ ip4_fib_mtrie_leaf_t new_leaf,
+ uword new_leaf_dst_address_bits)
+{
+ ip4_fib_mtrie_leaf_t old_leaf;
+ uword i;
+
+ ASSERT (ip4_fib_mtrie_leaf_is_terminal (new_leaf));
+ ASSERT (! ip4_fib_mtrie_leaf_is_empty (new_leaf));
+
+ for (i = 0; i < ARRAY_LEN (ply->leaves); i++)
+ {
+ old_leaf = ply->leaves[i];
+
+ /* Recurse into sub plies. */
+ if (! ip4_fib_mtrie_leaf_is_terminal (old_leaf))
+ {
+ ip4_fib_mtrie_ply_t * sub_ply = get_next_ply_for_leaf (m, old_leaf);
+ set_ply_with_more_specific_leaf (m, sub_ply, new_leaf, new_leaf_dst_address_bits);
+ }
+
+ /* Replace less specific terminal leaves with new leaf. */
+ else if (new_leaf_dst_address_bits >= ply->dst_address_bits_of_leaves[i])
+ {
+ ply->leaves[i] = new_leaf;
+ ply->dst_address_bits_of_leaves[i] = new_leaf_dst_address_bits;
+ ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_empty (old_leaf);
+ }
+ }
+}
+
+static void
+set_leaf (ip4_fib_mtrie_t * m,
+ ip4_fib_mtrie_set_unset_leaf_args_t * a,
+ u32 old_ply_index,
+ u32 dst_address_byte_index)
+{
+ ip4_fib_mtrie_leaf_t old_leaf, new_leaf;
+ i32 n_dst_bits_next_plies;
+ u8 dst_byte;
+
+ ASSERT (a->dst_address_length > 0 && a->dst_address_length <= 32);
+ ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8));
+
+ n_dst_bits_next_plies = a->dst_address_length - BITS (u8) * (dst_address_byte_index + 1);
+
+ dst_byte = a->dst_address.as_u8[dst_address_byte_index];
+
+ /* No bits remain for deeper plies (n_dst_bits_next_plies <= 0):
+    the prefix ends in this ply, so install terminal leaves here. */
+ if (n_dst_bits_next_plies <= 0)
+ {
+ uword i, n_dst_bits_this_ply, old_leaf_is_terminal;
+
+ n_dst_bits_this_ply = -n_dst_bits_next_plies;
+ ASSERT ((a->dst_address.as_u8[dst_address_byte_index] & pow2_mask (n_dst_bits_this_ply)) == 0);
+
+ for (i = dst_byte; i < dst_byte + (1 << n_dst_bits_this_ply); i++)
+ {
+ ip4_fib_mtrie_ply_t * old_ply, * new_ply;
+
+ old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
+
+ old_leaf = old_ply->leaves[i];
+ old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf);
+
+ /* Is leaf to be inserted more specific? */
+ if (a->dst_address_length >= old_ply->dst_address_bits_of_leaves[i])
+ {
+ new_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index);
+
+ if (old_leaf_is_terminal)
+ {
+ old_ply->dst_address_bits_of_leaves[i] = a->dst_address_length;
+ old_ply->leaves[i] = new_leaf;
+ old_ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_empty (old_leaf);
+ ASSERT (old_ply->n_non_empty_leafs <= ARRAY_LEN (old_ply->leaves));
+ }
+ else
+ {
+ /* Existing leaf points to another ply. We need to place new_leaf into all
+ more specific slots. */
+ new_ply = get_next_ply_for_leaf (m, old_leaf);
+ set_ply_with_more_specific_leaf (m, new_ply, new_leaf, a->dst_address_length);
+ }
+ }
+
+ else if (! old_leaf_is_terminal)
+ {
+ new_ply = get_next_ply_for_leaf (m, old_leaf);
+ set_leaf (m, a, new_ply - m->ply_pool, dst_address_byte_index + 1);
+ }
+ }
+ }
+ else
+ {
+ ip4_fib_mtrie_ply_t * old_ply, * new_ply;
+
+ old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
+ old_leaf = old_ply->leaves[dst_byte];
+ if (ip4_fib_mtrie_leaf_is_terminal (old_leaf))
+ {
+ new_leaf = ply_create (m, old_leaf, old_ply->dst_address_bits_of_leaves[dst_byte]);
+ new_ply = get_next_ply_for_leaf (m, new_leaf);
+
+ /* Refetch since ply_create may move pool. */
+ old_ply = pool_elt_at_index (m->ply_pool, old_ply_index);
+
+ old_ply->leaves[dst_byte] = new_leaf;
+ old_ply->dst_address_bits_of_leaves[dst_byte] = 0;
+
+ old_ply->n_non_empty_leafs -= ip4_fib_mtrie_leaf_is_non_empty (old_leaf);
+ ASSERT (old_ply->n_non_empty_leafs >= 0);
+
+ /* Account for the ply we just created. */
+ old_ply->n_non_empty_leafs += 1;
+ }
+ else
+ new_ply = get_next_ply_for_leaf (m, old_leaf);
+
+ set_leaf (m, a, new_ply - m->ply_pool, dst_address_byte_index + 1);
+ }
+}
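+
+/* Worked example (illustration only): inserting 10.1.0.0/16 -> adj A.
+   At byte 0, n_dst_bits_next_plies = 16 - 8 = 8 > 0, so we take the
+   else branch and descend through (or create) the ply behind root
+   slot 10.  At byte 1, 16 - 16 = 0 <= 0, so exactly one slot (slot 1,
+   since 1 << 0 == 1) of that ply receives the terminal leaf for A.
+   A shorter prefix such as 10.4.0.0/14 would instead fill slots 4..7
+   (1 << 2 == 4 slots) of the second ply. */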
+
+static uword
+unset_leaf (ip4_fib_mtrie_t * m,
+ ip4_fib_mtrie_set_unset_leaf_args_t * a,
+ ip4_fib_mtrie_ply_t * old_ply,
+ u32 dst_address_byte_index)
+{
+ ip4_fib_mtrie_leaf_t old_leaf, del_leaf;
+ i32 n_dst_bits_next_plies;
+ uword i, n_dst_bits_this_ply, old_leaf_is_terminal;
+ u8 dst_byte;
+
+ ASSERT (a->dst_address_length > 0 && a->dst_address_length <= 32);
+ ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8));
+
+ n_dst_bits_next_plies = a->dst_address_length - BITS (u8) * (dst_address_byte_index + 1);
+
+ dst_byte = a->dst_address.as_u8[dst_address_byte_index];
+ if (n_dst_bits_next_plies < 0)
+ dst_byte &= ~pow2_mask (-n_dst_bits_next_plies);
+
+ n_dst_bits_this_ply = n_dst_bits_next_plies <= 0 ? -n_dst_bits_next_plies : 0;
+ n_dst_bits_this_ply = clib_min (8, n_dst_bits_this_ply);
+
+ del_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index);
+
+ for (i = dst_byte; i < dst_byte + (1 << n_dst_bits_this_ply); i++)
+ {
+ old_leaf = old_ply->leaves[i];
+ old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf);
+
+ if (old_leaf == del_leaf
+ || (! old_leaf_is_terminal
+ && unset_leaf (m, a, get_next_ply_for_leaf (m, old_leaf), dst_address_byte_index + 1)))
+ {
+ old_ply->leaves[i] = IP4_FIB_MTRIE_LEAF_EMPTY;
+ old_ply->dst_address_bits_of_leaves[i] = 0;
+
+ /* No matter what, we just deleted a non-empty leaf. */
+ ASSERT (! ip4_fib_mtrie_leaf_is_empty (old_leaf));
+ old_ply->n_non_empty_leafs -= 1;
+
+ ASSERT (old_ply->n_non_empty_leafs >= 0);
+ if (old_ply->n_non_empty_leafs == 0 && dst_address_byte_index > 0)
+ {
+ pool_put (m->ply_pool, old_ply);
+ /* Old ply was deleted. */
+ return 1;
+ }
+ }
+ }
+
+ /* Old ply was not deleted. */
+ return 0;
+}
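+
+/* Continuing the example above: deleting 10.4.0.0/14 -> adj A clears
+   slots 4..7 of the second ply (they compare equal to del_leaf).  If
+   that leaves the ply with no non-empty leaves, it is returned to the
+   pool and we report 1 so the caller empties the root slot that
+   pointed at it. */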
+
+void ip4_mtrie_init (ip4_fib_mtrie_t * m)
+{
+ ip4_fib_mtrie_leaf_t root;
+ memset (m, 0, sizeof (m[0]));
+ m->default_leaf = IP4_FIB_MTRIE_LEAF_EMPTY;
+ root = ply_create (m, IP4_FIB_MTRIE_LEAF_EMPTY, /* dst_address_bits_of_leaves */ 0);
+ ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (root) == 0);
+}
+
+void
+ip4_fib_mtrie_add_del_route (ip4_fib_t * fib,
+ ip4_address_t dst_address,
+ u32 dst_address_length,
+ u32 adj_index,
+ u32 is_del)
+{
+ ip4_fib_mtrie_t * m = &fib->mtrie;
+ ip4_fib_mtrie_ply_t * root_ply;
+ ip4_fib_mtrie_set_unset_leaf_args_t a;
+ ip4_main_t * im = &ip4_main;
+
+ ASSERT(m->ply_pool != 0);
+
+ root_ply = pool_elt_at_index (m->ply_pool, 0);
+
+ /* Honor dst_address_length. Fib masks are in network byte order. */
+ dst_address.as_u32 &= im->fib_masks[dst_address_length];
+ a.dst_address = dst_address;
+ a.dst_address_length = dst_address_length;
+ a.adj_index = adj_index;
+
+ if (! is_del)
+ {
+ if (dst_address_length == 0)
+ m->default_leaf = ip4_fib_mtrie_leaf_set_adj_index (adj_index);
+ else
+ set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0);
+ }
+ else
+ {
+ if (dst_address_length == 0)
+ m->default_leaf = IP4_FIB_MTRIE_LEAF_EMPTY;
+
+ else
+ {
+ uword i;
+
+ unset_leaf (m, &a, root_ply, 0);
+
+ /* Find next less specific route and insert into mtrie. */
+ for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= 1; i--)
+ {
+ uword * p;
+ ip4_address_t key;
+
+ if (! fib->adj_index_by_dst_address[i])
+ continue;
+
+ key.as_u32 = dst_address.as_u32 & im->fib_masks[i];
+ p = hash_get (fib->adj_index_by_dst_address[i], key.as_u32);
+ if (p)
+ {
+ a.dst_address = key;
+ a.dst_address_length = i;
+ a.adj_index = p[0];
+ set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0);
+ break;
+ }
+ }
+ }
+ }
+}
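+
+/* Usage sketch (illustrative values; fib and adj_index are assumed to
+   come from the usual ip4 add/del route path):
+
+     ip4_address_t a;
+     a.as_u32 = clib_host_to_net_u32 (0x0a000100);            .. 10.0.1.0
+     ip4_fib_mtrie_add_del_route (fib, a, 24, adj_index, 0);  .. add
+     ip4_fib_mtrie_add_del_route (fib, a, 24, adj_index, 1);  .. del
+
+   On delete, the loop above re-installs the next less specific
+   covering route (if any) so subsequent lookups fall back to it
+   rather than to the miss adjacency. */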
+
+always_inline uword
+maybe_remap_leaf (ip_lookup_main_t * lm, ip4_fib_mtrie_leaf_t * p)
+{
+ ip4_fib_mtrie_leaf_t l = p[0];
+ uword was_remapped_to_empty_leaf = 0;
+ if (ip4_fib_mtrie_leaf_is_terminal (l))
+ {
+ u32 adj_index = ip4_fib_mtrie_leaf_get_adj_index (l);
+ u32 m = vec_elt (lm->adjacency_remap_table, adj_index);
+ if (m)
+ {
+ /* Remap table encoding: 0 => unchanged, ~0 => adjacency deleted,
+    otherwise the adjacency moved to index m - 1.  (The previous
+    inner guard made the non-empty remap arm unreachable.) */
+ was_remapped_to_empty_leaf = m == ~0;
+ p[0] = (was_remapped_to_empty_leaf
+ ? IP4_FIB_MTRIE_LEAF_EMPTY
+ : ip4_fib_mtrie_leaf_set_adj_index (m - 1));
+ }
+ }
+ return was_remapped_to_empty_leaf;
+}
+
+static void maybe_remap_ply (ip_lookup_main_t * lm, ip4_fib_mtrie_ply_t * ply)
+{
+ u32 n_remapped_to_empty = 0;
+ u32 i;
+ for (i = 0; i < ARRAY_LEN (ply->leaves); i++)
+ n_remapped_to_empty += maybe_remap_leaf (lm, &ply->leaves[i]);
+ if (n_remapped_to_empty > 0)
+ {
+ ASSERT (n_remapped_to_empty <= ply->n_non_empty_leafs);
+ ply->n_non_empty_leafs -= n_remapped_to_empty;
+ if (ply->n_non_empty_leafs == 0)
+ os_panic ();
+ }
+}
+
+void ip4_mtrie_maybe_remap_adjacencies (ip_lookup_main_t * lm, ip4_fib_mtrie_t * m)
+{
+ ip4_fib_mtrie_ply_t * ply;
+ pool_foreach (ply, m->ply_pool, maybe_remap_ply (lm, ply));
+ maybe_remap_leaf (lm, &m->default_leaf);
+}
+
+/* Returns number of bytes of memory used by mtrie. */
+static uword mtrie_memory_usage (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p)
+{
+ uword bytes, i;
+
+ if (! p)
+ {
+ if (pool_is_free_index (m->ply_pool, 0))
+ return 0;
+ p = pool_elt_at_index (m->ply_pool, 0);
+ }
+
+ bytes = sizeof (p[0]);
+ for (i = 0 ; i < ARRAY_LEN (p->leaves); i++)
+ {
+ ip4_fib_mtrie_leaf_t l = p->leaves[i];
+ if (ip4_fib_mtrie_leaf_is_next_ply (l))
+ bytes += mtrie_memory_usage (m, get_next_ply_for_leaf (m, l));
+ }
+
+ return bytes;
+}
+
+static u8 * format_ip4_fib_mtrie_leaf (u8 * s, va_list * va)
+{
+ ip4_fib_mtrie_leaf_t l = va_arg (*va, ip4_fib_mtrie_leaf_t);
+
+ if (ip4_fib_mtrie_leaf_is_empty (l))
+ s = format (s, "miss");
+ else if (ip4_fib_mtrie_leaf_is_terminal (l))
+ s = format (s, "adj %d", ip4_fib_mtrie_leaf_get_adj_index (l));
+ else
+ s = format (s, "next ply %d", ip4_fib_mtrie_leaf_get_next_ply_index (l));
+ return s;
+}
+
+static u8 * format_ip4_fib_mtrie_ply (u8 * s, va_list * va)
+{
+ ip4_fib_mtrie_t * m = va_arg (*va, ip4_fib_mtrie_t *);
+ u32 base_address = va_arg (*va, u32);
+ u32 ply_index = va_arg (*va, u32);
+ u32 dst_address_byte_index = va_arg (*va, u32);
+ ip4_fib_mtrie_ply_t * p;
+ uword i, indent;
+
+ p = pool_elt_at_index (m->ply_pool, ply_index);
+ indent = format_get_indent (s);
+ s = format (s, "ply index %d, %d non-empty leaves", ply_index, p->n_non_empty_leafs);
+ for (i = 0; i < ARRAY_LEN (p->leaves); i++)
+ {
+ ip4_fib_mtrie_leaf_t l = p->leaves[i];
+
+ if (! ip4_fib_mtrie_leaf_is_empty (l))
+ {
+ u32 a, ia_length;
+ ip4_address_t ia;
+
+ a = base_address + (i << (24 - 8*dst_address_byte_index));
+ ia.as_u32 = clib_host_to_net_u32 (a);
+ if (ip4_fib_mtrie_leaf_is_terminal (l))
+ ia_length = p->dst_address_bits_of_leaves[i];
+ else
+ ia_length = 8*(1 + dst_address_byte_index);
+ s = format (s, "\n%U%20U %U",
+ format_white_space, indent + 2,
+ format_ip4_address_and_length, &ia, ia_length,
+ format_ip4_fib_mtrie_leaf, l);
+
+ if (ip4_fib_mtrie_leaf_is_next_ply (l))
+ s = format (s, "\n%U%U",
+ format_white_space, indent + 2,
+ format_ip4_fib_mtrie_ply, m, a,
+ ip4_fib_mtrie_leaf_get_next_ply_index (l),
+ dst_address_byte_index + 1);
+ }
+ }
+
+ return s;
+}
+
+u8 * format_ip4_fib_mtrie (u8 * s, va_list * va)
+{
+ ip4_fib_mtrie_t * m = va_arg (*va, ip4_fib_mtrie_t *);
+
+ s = format (s, "%d plies, memory usage %U",
+ pool_elts (m->ply_pool),
+ format_memory_size, mtrie_memory_usage (m, 0));
+
+ if (pool_elts (m->ply_pool) > 0)
+ {
+ ip4_address_t base_address;
+ base_address.as_u32 = 0;
+ s = format (s, "\n %U", format_ip4_fib_mtrie_ply, m, base_address, 0, 0);
+ }
+
+ return s;
+}
diff --git a/vnet/vnet/ip/ip4_mtrie.h b/vnet/vnet/ip/ip4_mtrie.h
new file mode 100644
index 00000000000..31de41e14fa
--- /dev/null
+++ b/vnet/vnet/ip/ip4_mtrie.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_mtrie.h: ip4 mtrie fib
+ *
+ * Copyright (c) 2012 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_ip4_mtrie_h
+#define included_ip_ip4_mtrie_h
+
+#include <vppinfra/cache.h>
+#include <vppinfra/vector.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/ip/ip4_packet.h> /* for ip4_address_t */
+
+/* ip4 fib leafs: 4 ply 8-8-8-8 mtrie.
+ 1 + 2*adj_index for terminal leaves.
+ 0 + 2*next_ply_index for non-terminals.
+ 1 => empty (adjacency index of zero is special miss adjacency). */
+typedef u32 ip4_fib_mtrie_leaf_t;
+
+#define IP4_FIB_MTRIE_LEAF_EMPTY (1 + 2*IP_LOOKUP_MISS_ADJ_INDEX)
+#define IP4_FIB_MTRIE_LEAF_ROOT (0 + 2*0)
+
+always_inline u32 ip4_fib_mtrie_leaf_is_empty (ip4_fib_mtrie_leaf_t n)
+{ return n == IP4_FIB_MTRIE_LEAF_EMPTY; }
+
+always_inline u32 ip4_fib_mtrie_leaf_is_non_empty (ip4_fib_mtrie_leaf_t n)
+{ return n != IP4_FIB_MTRIE_LEAF_EMPTY; }
+
+always_inline u32 ip4_fib_mtrie_leaf_is_terminal (ip4_fib_mtrie_leaf_t n)
+{ return n & 1; }
+
+always_inline u32 ip4_fib_mtrie_leaf_get_adj_index (ip4_fib_mtrie_leaf_t n)
+{
+ ASSERT (ip4_fib_mtrie_leaf_is_terminal (n));
+ return n >> 1;
+}
+
+always_inline ip4_fib_mtrie_leaf_t ip4_fib_mtrie_leaf_set_adj_index (u32 adj_index)
+{
+ ip4_fib_mtrie_leaf_t l;
+ l = 1 + 2*adj_index;
+ ASSERT (ip4_fib_mtrie_leaf_get_adj_index (l) == adj_index);
+ return l;
+}
+
+always_inline u32 ip4_fib_mtrie_leaf_is_next_ply (ip4_fib_mtrie_leaf_t n)
+{ return (n & 1) == 0; }
+
+always_inline u32 ip4_fib_mtrie_leaf_get_next_ply_index (ip4_fib_mtrie_leaf_t n)
+{
+ ASSERT (ip4_fib_mtrie_leaf_is_next_ply (n));
+ return n >> 1;
+}
+
+always_inline ip4_fib_mtrie_leaf_t ip4_fib_mtrie_leaf_set_next_ply_index (u32 i)
+{
+ ip4_fib_mtrie_leaf_t l;
+ l = 0 + 2*i;
+ ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (l) == i);
+ return l;
+}
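+
+/* Encoding examples (the low bit distinguishes the two leaf kinds):
+     adj_index 5      -> leaf 11  (1 + 2*5, terminal)
+     next_ply_index 3 -> leaf  6  (0 + 2*3, non-terminal)
+   IP4_FIB_MTRIE_LEAF_EMPTY is thus 1: a terminal leaf pointing at the
+   special miss adjacency (index zero). */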
+
+/* One ply of the 4 ply mtrie fib. */
+typedef struct {
+ union {
+ ip4_fib_mtrie_leaf_t leaves[256];
+
+#ifdef CLIB_HAVE_VEC128
+ u32x4 leaves_as_u32x4[256 / 4];
+#endif
+ };
+
+ /* Prefix length for terminal leaves. */
+ u8 dst_address_bits_of_leaves[256];
+
+ /* Number of non-empty leafs (whether terminal or not). */
+ i32 n_non_empty_leafs;
+
+ /* Pad to cache line boundary. */
+ u8 pad[CLIB_CACHE_LINE_BYTES
+ - 1 * sizeof (i32)];
+} ip4_fib_mtrie_ply_t;
+
+typedef struct {
+ /* Pool of plies. Index zero is root ply. */
+ ip4_fib_mtrie_ply_t * ply_pool;
+
+ /* Special case leaf for default route 0.0.0.0/0. */
+ ip4_fib_mtrie_leaf_t default_leaf;
+} ip4_fib_mtrie_t;
+
+void ip4_mtrie_init (ip4_fib_mtrie_t * m);
+
+struct ip4_fib_t;
+
+void ip4_fib_mtrie_add_del_route (struct ip4_fib_t * f,
+ ip4_address_t dst_address,
+ u32 dst_address_length,
+ u32 adj_index,
+ u32 is_del);
+
+/* Returns adjacency index. */
+u32 ip4_mtrie_lookup_address (ip4_fib_mtrie_t * m, ip4_address_t dst);
+
+void ip4_mtrie_maybe_remap_adjacencies (ip_lookup_main_t * lm, ip4_fib_mtrie_t * m);
+
+format_function_t format_ip4_fib_mtrie;
+
+/* Lookup step. Processes 1 byte of 4 byte ip4 address. */
+always_inline ip4_fib_mtrie_leaf_t
+ip4_fib_mtrie_lookup_step (ip4_fib_mtrie_t * m,
+ ip4_fib_mtrie_leaf_t current_leaf,
+ ip4_address_t * dst_address,
+ u32 dst_address_byte_index)
+{
+ ip4_fib_mtrie_leaf_t next_leaf;
+ ip4_fib_mtrie_ply_t * ply;
+ uword current_is_terminal = ip4_fib_mtrie_leaf_is_terminal (current_leaf);
+
+ ply = m->ply_pool + (current_is_terminal ? 0 : (current_leaf >> 1));
+ next_leaf = ply->leaves[dst_address->as_u8[dst_address_byte_index]];
+ next_leaf = current_is_terminal ? current_leaf : next_leaf;
+
+ return next_leaf;
+}
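+
+/* Callers start from IP4_FIB_MTRIE_LEAF_ROOT and feed the four
+   destination address bytes in order; once a terminal leaf is reached
+   the remaining steps return it unchanged, which is what makes the
+   fully unrolled form used by the lookup nodes safe. */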
+
+#endif /* included_ip_ip4_mtrie_h */
diff --git a/vnet/vnet/ip/ip4_packet.h b/vnet/vnet/ip/ip4_packet.h
new file mode 100644
index 00000000000..69467eb4e03
--- /dev/null
+++ b/vnet/vnet/ip/ip4_packet.h
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip4/packet.h: ip4 packet format
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip4_packet_h
+#define included_ip4_packet_h
+
+#include <vnet/ip/ip_packet.h> /* for ip_csum_t */
+#include <vnet/ip/tcp_packet.h> /* for tcp_header_t */
+#include <vppinfra/byte_order.h> /* for clib_net_to_host_u16 */
+
+/* IP4 address which can be accessed either as 4 bytes
+ or as a 32-bit number. */
+typedef union {
+ u8 data[4];
+ u32 data_u32;
+ /* Aliases. */
+ u8 as_u8[4];
+ u32 as_u32;
+} ip4_address_t;
+
+typedef struct {
+ /* IP address must be first for ip_interface_address_get_address() to work */
+ ip4_address_t ip4_addr;
+ u32 fib_index;
+} ip4_address_fib_t;
+
+always_inline void
+ip4_addr_fib_init (ip4_address_fib_t * addr_fib, ip4_address_t * address,
+ u32 fib_index)
+{
+ memcpy (&addr_fib->ip4_addr, address, sizeof (addr_fib->ip4_addr));
+ addr_fib->fib_index = fib_index;
+}
+
+/* (src,dst) pair of addresses as found in packet header. */
+typedef struct {
+ ip4_address_t src, dst;
+} ip4_address_pair_t;
+
+/* If address is a valid netmask, return the mask length, else ~0. */
+always_inline uword
+ip4_address_netmask_length (ip4_address_t * a)
+{
+ uword result = 0;
+ uword i;
+ for (i = 0; i < ARRAY_LEN (a->as_u8); i++)
+ {
+ switch (a->as_u8[i])
+ {
+ case 0xff: result += 8; break;
+ case 0xfe: result += 7; goto done;
+ case 0xfc: result += 6; goto done;
+ case 0xf8: result += 5; goto done;
+ case 0xf0: result += 4; goto done;
+ case 0xe0: result += 3; goto done;
+ case 0xc0: result += 2; goto done;
+ case 0x80: result += 1; goto done;
+ case 0x00: result += 0; goto done;
+ default:
+ /* Not a valid netmask. */
+ return ~0;
+ }
+ }
+ done:
+ /* For a contiguous mask, all remaining bytes must be zero. */
+ for (i++; i < ARRAY_LEN (a->as_u8); i++)
+ if (a->as_u8[i] != 0)
+ return ~0;
+ return result;
+}
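+
+/* E.g. 255.255.254.0 -> 23; 255.255.255.1 -> ~0, since 0x01 cannot
+   appear in a contiguous mask. */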
+
+typedef union {
+ struct {
+ /* 4 bit header length (in 32 bit units) and 4 bit version: VVVVLLLL.
+ e.g. for packets w/ no options ip_version_and_header_length == 0x45. */
+ u8 ip_version_and_header_length;
+
+ /* Type of service. */
+ u8 tos;
+
+ /* Total layer 3 packet length including this header. */
+ u16 length;
+
+ /* Fragmentation ID. */
+ u16 fragment_id;
+
+ /* 3 bits of flags and 13 bits of fragment offset (in units
+ of 8 byte quantities). */
+ u16 flags_and_fragment_offset;
+#define IP4_HEADER_FLAG_MORE_FRAGMENTS (1 << 13)
+#define IP4_HEADER_FLAG_DONT_FRAGMENT (1 << 14)
+#define IP4_HEADER_FLAG_CONGESTION (1 << 15)
+
+ /* Time to live decremented by router at each hop. */
+ u8 ttl;
+
+ /* Next level protocol packet. */
+ u8 protocol;
+
+ /* Checksum. */
+ u16 checksum;
+
+ /* Source and destination address. */
+ union {
+ struct {
+ ip4_address_t src_address, dst_address;
+ };
+ ip4_address_pair_t address_pair;
+ };
+ };
+
+ /* For checksumming we'll want to access IP header in word sized chunks. */
+ /* For 64 bit machines. */
+ CLIB_PACKED (struct {
+ u64 checksum_data_64[2];
+ u32 checksum_data_64_32[1];
+ });
+
+ /* For 32 bit machines. */
+ CLIB_PACKED (struct {
+ u32 checksum_data_32[5];
+ });
+} ip4_header_t;
+
+/* Value of ip_version_and_header_length for packets w/o options. */
+#define IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS \
+ ((4 << 4) | (sizeof (ip4_header_t) / sizeof (u32)))
+
+always_inline int
+ip4_get_fragment_offset (ip4_header_t * i)
+{ return clib_net_to_host_u16 (i->flags_and_fragment_offset) & 0x1fff; }
+
+always_inline int
+ip4_get_fragment_more (ip4_header_t * i)
+{ return clib_net_to_host_u16 (i->flags_and_fragment_offset) & IP4_HEADER_FLAG_MORE_FRAGMENTS; }
+
+always_inline int
+ip4_is_fragment (ip4_header_t * i)
+{ return (i->flags_and_fragment_offset &
+ clib_net_to_host_u16 (0x1fff | IP4_HEADER_FLAG_MORE_FRAGMENTS)); }
+
+always_inline int
+ip4_is_first_fragment (ip4_header_t * i)
+{ return (i->flags_and_fragment_offset &
+ clib_net_to_host_u16 (0x1fff | IP4_HEADER_FLAG_MORE_FRAGMENTS)) ==
+ clib_net_to_host_u16 (IP4_HEADER_FLAG_MORE_FRAGMENTS); }
+
+/* Fragment offset in bytes. */
+always_inline int
+ip4_get_fragment_offset_bytes (ip4_header_t * i)
+{ return 8 * ip4_get_fragment_offset (i); }
+
+always_inline int
+ip4_header_bytes (ip4_header_t * i)
+{ return sizeof (u32) * (i->ip_version_and_header_length & 0xf); }
+
+always_inline void *
+ip4_next_header (ip4_header_t * i)
+{ return (void *) i + ip4_header_bytes (i); }
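+
+/* E.g. ip_version_and_header_length == 0x45 gives 4 * 5 == 20 bytes,
+   the minimum (option-free) header size; ip4_next_header then points
+   just past the header, options included. */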
+
+always_inline u16
+ip4_header_checksum (ip4_header_t * i)
+{
+ u16 save, csum;
+ ip_csum_t sum;
+
+ save = i->checksum;
+ i->checksum = 0;
+ sum = ip_incremental_checksum (0, i, ip4_header_bytes (i));
+ csum = ~ip_csum_fold (sum);
+
+ i->checksum = save;
+
+ /* Make checksum agree for special case where either
+ 0 or 0xffff would give same 1s complement sum. */
+ if (csum == 0 && save == 0xffff)
+ csum = save;
+
+ return csum;
+}
+
+always_inline uword
+ip4_header_checksum_is_valid (ip4_header_t * i)
+{ return i->checksum == ip4_header_checksum (i); }
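+
+/* Note: in one's complement arithmetic 0x0000 and 0xffff both
+   represent zero, so a checksum field transmitted as 0xffff can
+   recompute to 0x0000; the special case in ip4_header_checksum maps
+   the recomputed value back so the equality test above still holds. */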
+
+#define ip4_partial_header_checksum_x1(ip0,sum0) \
+do { \
+ if (BITS (ip_csum_t) > 32) \
+ { \
+ sum0 = ip0->checksum_data_64[0]; \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64[1]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64_32[0]); \
+ } \
+ else \
+ { \
+ sum0 = ip0->checksum_data_32[0]; \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[1]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[2]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[3]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[4]); \
+ } \
+} while (0)
+
+#define ip4_partial_header_checksum_x2(ip0,ip1,sum0,sum1) \
+do { \
+ if (BITS (ip_csum_t) > 32) \
+ { \
+ sum0 = ip0->checksum_data_64[0]; \
+ sum1 = ip1->checksum_data_64[0]; \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64[1]); \
+ sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_64[1]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64_32[0]); \
+ sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_64_32[0]); \
+ } \
+ else \
+ { \
+ sum0 = ip0->checksum_data_32[0]; \
+ sum1 = ip1->checksum_data_32[0]; \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[1]); \
+ sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[1]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[2]); \
+ sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[2]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[3]); \
+ sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[3]); \
+ sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[4]); \
+ sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[4]); \
+ } \
+} while (0)
+
+always_inline uword
+ip4_address_is_multicast (ip4_address_t * a)
+{ return (a->data[0] & 0xf0) == 0xe0; }
+
+always_inline void
+ip4_multicast_address_set_for_group (ip4_address_t * a, ip_multicast_group_t g)
+{
+ ASSERT (g < (1 << 28));
+ a->as_u32 = clib_host_to_net_u32 ((0xe << 28) + g);
+}
+
+always_inline void
+ip4_tcp_reply_x1 (ip4_header_t * ip0, tcp_header_t * tcp0)
+{
+ u32 src0, dst0;
+
+ src0 = ip0->src_address.data_u32;
+ dst0 = ip0->dst_address.data_u32;
+ ip0->src_address.data_u32 = dst0;
+ ip0->dst_address.data_u32 = src0;
+
+ src0 = tcp0->ports.src;
+ dst0 = tcp0->ports.dst;
+ tcp0->ports.src = dst0;
+ tcp0->ports.dst = src0;
+}
+
+always_inline void
+ip4_tcp_reply_x2 (ip4_header_t * ip0, ip4_header_t * ip1,
+ tcp_header_t * tcp0, tcp_header_t * tcp1)
+{
+ u32 src0, dst0, src1, dst1;
+
+ src0 = ip0->src_address.data_u32;
+ src1 = ip1->src_address.data_u32;
+ dst0 = ip0->dst_address.data_u32;
+ dst1 = ip1->dst_address.data_u32;
+ ip0->src_address.data_u32 = dst0;
+ ip1->src_address.data_u32 = dst1;
+ ip0->dst_address.data_u32 = src0;
+ ip1->dst_address.data_u32 = src1;
+
+ src0 = tcp0->ports.src;
+ src1 = tcp1->ports.src;
+ dst0 = tcp0->ports.dst;
+ dst1 = tcp1->ports.dst;
+ tcp0->ports.src = dst0;
+ tcp1->ports.src = dst1;
+ tcp0->ports.dst = src0;
+ tcp1->ports.dst = src1;
+}
+
+#endif /* included_ip4_packet_h */
diff --git a/vnet/vnet/ip/ip4_pg.c b/vnet/vnet/ip/ip4_pg.c
new file mode 100644
index 00000000000..9710d8d4c5a
--- /dev/null
+++ b/vnet/vnet/ip/ip4_pg.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_pg: IP v4 packet-generator interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+
+#define IP4_PG_EDIT_CHECKSUM (1 << 0)
+#define IP4_PG_EDIT_LENGTH (1 << 1)
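+
+/* These flag bits are or-ed into g->edit_function_opaque at the end of
+   unformat_pg_ip4_header, recording which fields were left unspecified
+   and therefore must be computed per packet at generation time. */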
+
+static_always_inline void
+compute_length_and_or_checksum (vlib_main_t * vm,
+ u32 * packets,
+ u32 n_packets,
+ u32 ip_header_offset,
+ u32 flags)
+{
+ ASSERT (flags != 0);
+
+ while (n_packets >= 2)
+ {
+ u32 pi0, pi1;
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ ip_csum_t sum0, sum1;
+
+ pi0 = packets[0];
+ pi1 = packets[1];
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+ n_packets -= 2;
+ packets += 2;
+
+ ip0 = (void *) (p0->data + ip_header_offset);
+ ip1 = (void *) (p1->data + ip_header_offset);
+
+ if (flags & IP4_PG_EDIT_LENGTH)
+ {
+ ip0->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset);
+ ip1->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p1) - ip_header_offset);
+ }
+
+ if (flags & IP4_PG_EDIT_CHECKSUM)
+ {
+ ASSERT (ip4_header_bytes (ip0) == sizeof (ip0[0]));
+ ASSERT (ip4_header_bytes (ip1) == sizeof (ip1[0]));
+
+ ip0->checksum = 0;
+ ip1->checksum = 0;
+
+ ip4_partial_header_checksum_x2 (ip0, ip1, sum0, sum1);
+ ip0->checksum = ~ ip_csum_fold (sum0);
+ ip1->checksum = ~ ip_csum_fold (sum1);
+
+ ASSERT (ip0->checksum == ip4_header_checksum (ip0));
+ ASSERT (ip1->checksum == ip4_header_checksum (ip1));
+ }
+ }
+
+ while (n_packets >= 1)
+ {
+ u32 pi0;
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ ip_csum_t sum0;
+
+ pi0 = packets[0];
+ p0 = vlib_get_buffer (vm, pi0);
+ n_packets -= 1;
+ packets += 1;
+
+ ip0 = (void *) (p0->data + ip_header_offset);
+
+ if (flags & IP4_PG_EDIT_LENGTH)
+ ip0->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset);
+
+ if (flags & IP4_PG_EDIT_CHECKSUM)
+ {
+ ASSERT (ip4_header_bytes (ip0) == sizeof (ip0[0]));
+
+ ip0->checksum = 0;
+
+ ip4_partial_header_checksum_x1 (ip0, sum0);
+ ip0->checksum = ~ ip_csum_fold (sum0);
+
+ ASSERT (ip0->checksum == ip4_header_checksum (ip0));
+ }
+ }
+}
+
+static void
+ip4_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 ip_offset;
+
+ ip_offset = g->start_byte_offset;
+
+ switch (g->edit_function_opaque)
+ {
+ case IP4_PG_EDIT_LENGTH:
+ compute_length_and_or_checksum (vm, packets, n_packets, ip_offset,
+ IP4_PG_EDIT_LENGTH);
+ break;
+
+ case IP4_PG_EDIT_CHECKSUM:
+ compute_length_and_or_checksum (vm, packets, n_packets, ip_offset,
+ IP4_PG_EDIT_CHECKSUM);
+ break;
+
+ case IP4_PG_EDIT_LENGTH | IP4_PG_EDIT_CHECKSUM:
+ compute_length_and_or_checksum (vm, packets, n_packets, ip_offset,
+ IP4_PG_EDIT_LENGTH
+ | IP4_PG_EDIT_CHECKSUM);
+ break;
+
+ default:
+ ASSERT (0);
+ break;
+ }
+}
+
+typedef struct {
+ pg_edit_t ip_version, header_length;
+ pg_edit_t tos;
+ pg_edit_t length;
+
+ pg_edit_t fragment_id, fragment_offset;
+
+ /* Flags together with fragment offset. */
+ pg_edit_t mf_flag, df_flag, ce_flag;
+
+ pg_edit_t ttl;
+
+ pg_edit_t protocol;
+
+ pg_edit_t checksum;
+
+ pg_edit_t src_address, dst_address;
+} pg_ip4_header_t;
+
+static inline void
+pg_ip4_header_init (pg_ip4_header_t * p)
+{
+ /* Initialize fields that are not bit fields in the IP header. */
+#define _(f) pg_edit_init (&p->f, ip4_header_t, f);
+ _ (tos);
+ _ (length);
+ _ (fragment_id);
+ _ (ttl);
+ _ (protocol);
+ _ (checksum);
+ _ (src_address);
+ _ (dst_address);
+#undef _
+
+ /* Initialize bit fields. */
+ pg_edit_init_bitfield (&p->header_length, ip4_header_t,
+ ip_version_and_header_length,
+ 0, 4);
+ pg_edit_init_bitfield (&p->ip_version, ip4_header_t,
+ ip_version_and_header_length,
+ 4, 4);
+
+ pg_edit_init_bitfield (&p->fragment_offset, ip4_header_t,
+ flags_and_fragment_offset,
+ 0, 13);
+ pg_edit_init_bitfield (&p->mf_flag, ip4_header_t,
+ flags_and_fragment_offset,
+ 13, 1);
+ pg_edit_init_bitfield (&p->df_flag, ip4_header_t,
+ flags_and_fragment_offset,
+ 14, 1);
+ pg_edit_init_bitfield (&p->ce_flag, ip4_header_t,
+ flags_and_fragment_offset,
+ 15, 1);
+}
+
+uword
+unformat_pg_ip4_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_ip4_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (ip4_header_t),
+ &group_index);
+ pg_ip4_header_init (p);
+
+ /* Defaults. */
+ pg_edit_set_fixed (&p->ip_version, 4);
+ pg_edit_set_fixed (&p->header_length,
+ sizeof (ip4_header_t) / sizeof (u32));
+
+ pg_edit_set_fixed (&p->tos, 0);
+ pg_edit_set_fixed (&p->ttl, 64);
+
+ pg_edit_set_fixed (&p->fragment_id, 0);
+ pg_edit_set_fixed (&p->fragment_offset, 0);
+ pg_edit_set_fixed (&p->mf_flag, 0);
+ pg_edit_set_fixed (&p->df_flag, 0);
+ pg_edit_set_fixed (&p->ce_flag, 0);
+
+ p->length.type = PG_EDIT_UNSPECIFIED;
+ p->checksum.type = PG_EDIT_UNSPECIFIED;
+
+ if (unformat (input, "%U: %U -> %U",
+ unformat_pg_edit,
+ unformat_ip_protocol, &p->protocol,
+ unformat_pg_edit,
+ unformat_ip4_address, &p->src_address,
+ unformat_pg_edit,
+ unformat_ip4_address, &p->dst_address))
+ goto found;
+
+ if (! unformat (input, "%U:",
+ unformat_pg_edit,
+ unformat_ip_protocol, &p->protocol))
+ goto error;
+
+found:
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "version %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->ip_version))
+ ;
+
+ else if (unformat (input, "header-length %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->header_length))
+ ;
+
+ else if (unformat (input, "tos %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->tos))
+ ;
+
+ else if (unformat (input, "length %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->length))
+ ;
+
+ else if (unformat (input, "checksum %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->checksum))
+ ;
+
+ else if (unformat (input, "ttl %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->ttl))
+ ;
+
+ else if (unformat (input, "fragment id %U offset %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->fragment_id,
+ unformat_pg_edit,
+ unformat_pg_number, &p->fragment_offset))
+ {
+ int i;
+ for (i = 0; i < ARRAY_LEN (p->fragment_offset.values); i++)
+ pg_edit_set_value (&p->fragment_offset, i,
+ pg_edit_get_value (&p->fragment_offset, i) / 8);
+
+ }
+
+ /* Flags. */
+ else if (unformat (input, "mf") || unformat (input, "MF"))
+ pg_edit_set_fixed (&p->mf_flag, 1);
+
+ else if (unformat (input, "df") || unformat (input, "DF"))
+ pg_edit_set_fixed (&p->df_flag, 1);
+
+ else if (unformat (input, "ce") || unformat (input, "CE"))
+ pg_edit_set_fixed (&p->ce_flag, 1);
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ {
+ ip_main_t * im = &ip_main;
+ ip_protocol_t protocol;
+ ip_protocol_info_t * pi;
+
+ pi = 0;
+ if (p->protocol.type == PG_EDIT_FIXED)
+ {
+ protocol = pg_edit_get_value (&p->protocol, PG_EDIT_LO);
+ pi = ip_get_protocol_info (im, protocol);
+ }
+
+ if (pi && pi->unformat_pg_edit
+ && unformat_user (input, pi->unformat_pg_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ if (p->length.type == PG_EDIT_UNSPECIFIED
+ && s->min_packet_bytes == s->max_packet_bytes
+ && group_index + 1 < vec_len (s->edit_groups))
+ {
+ pg_edit_set_fixed (&p->length,
+ pg_edit_group_n_bytes (s, group_index));
+ }
+
+ /* Compute IP header checksum if all edits are fixed. */
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ {
+ ip4_header_t fixed_header, fixed_mask, cmp_mask;
+
+ /* See if header is all fixed and specified except for
+ checksum field. */
+ memset (&cmp_mask, ~0, sizeof (cmp_mask));
+ cmp_mask.checksum = 0;
+
+ pg_edit_group_get_fixed_packet_data (s, group_index,
+ &fixed_header, &fixed_mask);
+ if (! memcmp (&fixed_mask, &cmp_mask, sizeof (cmp_mask)))
+ pg_edit_set_fixed (&p->checksum,
+ clib_net_to_host_u16 (ip4_header_checksum (&fixed_header)));
+ }
+
+ p = pg_get_edit_group (s, group_index);
+ if (p->length.type == PG_EDIT_UNSPECIFIED
+ || p->checksum.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ g->edit_function = ip4_pg_edit_function;
+ g->edit_function_opaque = 0;
+ if (p->length.type == PG_EDIT_UNSPECIFIED)
+ g->edit_function_opaque |= IP4_PG_EDIT_LENGTH;
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ g->edit_function_opaque |= IP4_PG_EDIT_CHECKSUM;
+ }
+
+ return 1;
+ }
+
+ error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
+
diff --git a/vnet/vnet/ip/ip4_source_check.c b/vnet/vnet/ip/ip4_source_check.c
new file mode 100644
index 00000000000..47e22f2392e
--- /dev/null
+++ b/vnet/vnet/ip/ip4_source_check.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip4_source_check.c: IP v4 check source address (unicast RPF check)
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+typedef struct {
+ u8 packet_data[64];
+} ip4_source_check_trace_t;
+
+static u8 * format_ip4_source_check_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ip4_source_check_trace_t * t = va_arg (*va, ip4_source_check_trace_t *);
+
+ s = format (s, "%U",
+ format_ip4_header,
+ t->packet_data, sizeof (t->packet_data));
+
+ return s;
+}
+
+typedef enum {
+ IP4_SOURCE_CHECK_NEXT_DROP,
+ IP4_SOURCE_CHECK_N_NEXT,
+} ip4_source_check_next_t;
+
+typedef enum {
+ IP4_SOURCE_CHECK_REACHABLE_VIA_RX,
+ IP4_SOURCE_CHECK_REACHABLE_VIA_ANY,
+} ip4_source_check_type_t;
+
+typedef union {
+ struct {
+ u32 no_default_route : 1;
+ u32 fib_index : 31;
+ };
+ u32 as_u32[1];
+} ip4_source_check_config_t;
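+
+/* The flag and fib index are packed into a single u32 so the whole
+   config fits in one word of vnet config data; see the
+   vnet_get_config_data (..., sizeof (c0[0])) calls below. */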
+
+always_inline uword
+ip4_source_check_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ ip4_source_check_type_t source_check_type)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_config_main_t * cm = &lm->rx_config_mains[VNET_UNICAST];
+ u32 n_left_from, * from, * to_next;
+ u32 next_index;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (ip4_source_check_trace_t));
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip4_header_t * ip0, * ip1;
+ ip4_fib_mtrie_t * mtrie0, * mtrie1;
+ ip4_fib_mtrie_leaf_t leaf0, leaf1;
+ ip4_source_check_config_t * c0, * c1;
+ ip_adjacency_t * adj0, * adj1;
+ u32 pi0, next0, pass0, adj_index0;
+ u32 pi1, next1, pass1, adj_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ c0 = vnet_get_config_data (&cm->config_main,
+ &vnet_buffer (p0)->ip.current_config_index,
+ &next0,
+ sizeof (c0[0]));
+ c1 = vnet_get_config_data (&cm->config_main,
+ &vnet_buffer (p1)->ip.current_config_index,
+ &next1,
+ sizeof (c1[0]));
+
+ mtrie0 = &vec_elt_at_index (im->fibs, c0->fib_index)->mtrie;
+ mtrie1 = &vec_elt_at_index (im->fibs, c1->fib_index)->mtrie;
+
+ leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
+ leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3);
+
+ adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+ adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1);
+
+ ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, c0->fib_index,
+ &ip0->src_address,
+ c0->no_default_route));
+ ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, c1->fib_index,
+ &ip1->src_address,
+ c1->no_default_route));
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+ /* Pass multicast. */
+ pass0 = ip4_address_is_multicast (&ip0->src_address) || ip0->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF);
+ pass1 = ip4_address_is_multicast (&ip1->src_address) || ip1->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF);
+
+ pass0 |= (adj0->lookup_next_index == IP_LOOKUP_NEXT_REWRITE
+ && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY
+ || vnet_buffer (p0)->sw_if_index[VLIB_RX] == adj0->rewrite_header.sw_if_index));
+ pass1 |= (adj1->lookup_next_index == IP_LOOKUP_NEXT_REWRITE
+ && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY
+ || vnet_buffer (p1)->sw_if_index[VLIB_RX] == adj1->rewrite_header.sw_if_index));
+
+ next0 = (pass0 ? next0 : IP4_SOURCE_CHECK_NEXT_DROP);
+ next1 = (pass1 ? next1 : IP4_SOURCE_CHECK_NEXT_DROP);
+
+ p0->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS];
+ p1->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS];
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, pi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ ip4_fib_mtrie_t * mtrie0;
+ ip4_fib_mtrie_leaf_t leaf0;
+ ip4_source_check_config_t * c0;
+ ip_adjacency_t * adj0;
+ u32 pi0, next0, pass0, adj_index0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ ip0 = vlib_buffer_get_current (p0);
+
+ c0 = vnet_get_config_data (&cm->config_main,
+ &vnet_buffer (p0)->ip.current_config_index,
+ &next0,
+ sizeof (c0[0]));
+
+ mtrie0 = &vec_elt_at_index (im->fibs, c0->fib_index)->mtrie;
+
+ leaf0 = IP4_FIB_MTRIE_LEAF_ROOT;
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2);
+
+ leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3);
+
+ adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0);
+
+ ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, c0->fib_index,
+ &ip0->src_address,
+ c0->no_default_route));
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ /* Pass multicast. */
+ pass0 = ip4_address_is_multicast (&ip0->src_address) || ip0->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF);
+
+ pass0 |= (adj0->lookup_next_index == IP_LOOKUP_NEXT_REWRITE
+ && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY
+ || vnet_buffer (p0)->sw_if_index[VLIB_RX] == adj0->rewrite_header.sw_if_index));
+
+ next0 = (pass0 ? next0 : IP4_SOURCE_CHECK_NEXT_DROP);
+ p0->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_source_check_reachable_via_any (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip4_source_check_inline (vm, node, frame, IP4_SOURCE_CHECK_REACHABLE_VIA_ANY);
+}
+
+static uword
+ip4_source_check_reachable_via_rx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip4_source_check_inline (vm, node, frame, IP4_SOURCE_CHECK_REACHABLE_VIA_RX);
+}
+
+VLIB_REGISTER_NODE (ip4_check_source_reachable_via_any) = {
+ .function = ip4_source_check_reachable_via_any,
+ .name = "ip4-source-check-via-any",
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = IP4_SOURCE_CHECK_N_NEXT,
+ .next_nodes = {
+ [IP4_SOURCE_CHECK_NEXT_DROP] = "error-drop",
+ },
+
+ .format_buffer = format_ip4_header,
+ .format_trace = format_ip4_source_check_trace,
+};
+
+VLIB_REGISTER_NODE (ip4_check_source_reachable_via_rx) = {
+ .function = ip4_source_check_reachable_via_rx,
+ .name = "ip4-source-check-via-rx",
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = IP4_SOURCE_CHECK_N_NEXT,
+ .next_nodes = {
+ [IP4_SOURCE_CHECK_NEXT_DROP] = "error-drop",
+ },
+
+ .format_buffer = format_ip4_header,
+ .format_trace = format_ip4_source_check_trace,
+};
+
+static clib_error_t *
+set_ip_source_check (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_config_main_t * rx_cm = &lm->rx_config_mains[VNET_UNICAST];
+ clib_error_t * error = 0;
+ u32 sw_if_index, is_del, ci;
+ ip4_source_check_config_t config;
+ ip4_rx_feature_type_t type;
+
+ sw_if_index = ~0;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ is_del = 0;
+ config.no_default_route = 0;
+ config.fib_index = im->fib_index_by_sw_if_index[sw_if_index];
+ type = IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX;
+ if (unformat (input, "del"))
+ is_del = 1;
+
+ ci = rx_cm->config_index_by_sw_if_index[sw_if_index];
+ ci = (is_del
+ ? vnet_config_del_feature
+ : vnet_config_add_feature)
+ (vm, &rx_cm->config_main,
+ ci,
+ type,
+ &config,
+ sizeof (config));
+ rx_cm->config_index_by_sw_if_index[sw_if_index] = ci;
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_source_check_command, static) = {
+ .path = "set interface ip source-check",
+ .function = set_ip_source_check,
+ .short_help = "Set IP4/IP6 interface unicast source check",
+};
+
+/* Dummy init function to get us linked in. */
+clib_error_t * ip4_source_check_init (vlib_main_t * vm)
+{ return 0; }
+
+VLIB_INIT_FUNCTION (ip4_source_check_init);
diff --git a/vnet/vnet/ip/ip4_test.c b/vnet/vnet/ip/ip4_test.c
new file mode 100644
index 00000000000..ff088e78f3e
--- /dev/null
+++ b/vnet/vnet/ip/ip4_test.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+/*
+ * ip4 FIB tester. Add, probe, delete a bunch of
+ * random routes / masks and make sure that the mtrie agrees with
+ * the hash-table FIB.
+ *
+ * Manipulate the FIB by means of the debug CLI commands, to minimize
+ * the chances of doing something idiotic.
+ */
+
+/*
+ * These routines need to be redeclared non-static elsewhere.
+ *
+ * Also: rename ip_route() -> vnet_ip_route_cmd() and add the usual
+ * test_route_init() call to main.c
+ */
+clib_error_t *
+vnet_ip_route_cmd (vlib_main_t * vm,
+ unformat_input_t * input, vlib_cli_command_t * cmd_arg);
+
+int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0);
+
+ip4_fib_t *
+find_ip4_fib_by_table_index_or_id (ip4_main_t * im, u32 table_index_or_id,
+ u32 flags);
+
+/* Routes to insert/delete/probe in FIB */
+typedef struct {
+ ip4_address_t address;
+ u32 mask_width;
+ u32 interface_id; /* not an xx_if_index */
+} test_route_t;
+
+typedef struct {
+ /* Test routes in use */
+ test_route_t *route_pool;
+
+ /* Number of fake ethernets created */
+ u32 test_interfaces_created;
+} test_main_t;
+
+test_main_t test_main;
+
+/* fake ethernet device class, distinct from "fake-ethX" */
+static u8 * format_test_interface_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "test-eth%d", dev_instance);
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (test_interface_device_class,static) = {
+ .name = "Test interface",
+ .format_device_name = format_test_interface_name,
+ .tx_function = dummy_interface_tx,
+};
+
+static clib_error_t *
+thrash (vlib_main_t * vm,
+ unformat_input_t * main_input, vlib_cli_command_t * cmd_arg)
+{
+ u32 seed = 0xdeaddabe;
+ u32 niter = 10;
+ u32 nroutes = 10;
+ u32 ninterfaces = 4;
+ f64 min_mask_bits = 7.0;
+ f64 max_mask_bits = 32.0;
+ u32 table_id = 11; /* my amp goes to 11 (use fib 11) */
+ u32 table_index;
+ int iter, i;
+ u8 * cmd;
+ test_route_t *tr;
+ test_main_t *tm = &test_main;
+ ip4_main_t * im = &ip4_main;
+ vnet_main_t * vnm = vnet_get_main();
+ unformat_input_t cmd_input;
+ f64 rf;
+ u32 *masks = 0;
+ u32 tmp;
+ u32 hw_if_index;
+ clib_error_t * error = 0;
+ uword *p;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 hw_address[6];
+ ip4_fib_t * fib;
+ int verbose = 0;
+
+ /* Precompute mask width -> mask vector */
+ tmp = (u32)~0;
+ vec_validate (masks, 32);
+ for (i = 32; i > 0; i--)
+ {
+ masks [i] = tmp;
+ tmp <<= 1;
+ }
+
+ if (unformat_user (main_input, unformat_line_input, line_input))
+ {
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "seed %d", &seed))
+ ;
+ else if (unformat (line_input, "niter %d", &niter))
+ ;
+ else if (unformat (line_input, "nroutes %d", &nroutes))
+ ;
+ else if (unformat (line_input, "ninterfaces %d", &ninterfaces))
+ ;
+ else if (unformat (line_input, "min-mask-bits %d", &tmp))
+ min_mask_bits = (f64) tmp;
+ else if (unformat (line_input, "max-mask-bits %d", &tmp))
+ max_mask_bits = (f64) tmp;
+ else if (unformat (line_input, "verbose"))
+ verbose = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, line_input);
+ }
+ }
+
+ /* Find or create FIB table 11 */
+ fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
+
+ for (i = tm->test_interfaces_created; i < ninterfaces; i++)
+ {
+ vnet_hw_interface_t * hw;
+ memset (hw_address, 0, sizeof (hw_address));
+ hw_address[0] = 0xd0;
+ hw_address[1] = 0x0f;
+ hw_address[5] = i;
+
+ error = ethernet_register_interface
+ (vnm,
+ test_interface_device_class.index,
+ i /* instance */,
+ hw_address,
+ &hw_if_index,
+ /* flag change */ 0);
+
+ /* Fake interfaces use FIB table 11 */
+ hw = vnet_get_hw_interface (vnm, hw_if_index);
+ vec_validate (im->fib_index_by_sw_if_index, hw->sw_if_index);
+ im->fib_index_by_sw_if_index[hw->sw_if_index] = fib->index;
+ }
+
+ tm->test_interfaces_created = ninterfaces;
+
+ /* Find fib index corresponding to FIB id 11 */
+ p = hash_get (im->fib_index_by_table_id, table_id);
+ if (p == 0)
+ {
+ vlib_cli_output (vm, "Couldn't map fib id %d to fib index\n",
+ table_id);
+ return 0;
+ }
+ table_index = p[0];
+
+ for (iter = 0; iter < niter; iter++)
+ {
+ /* Pick random routes to install */
+ for (i = 0; i < nroutes; i++)
+ {
+ int j;
+
+ pool_get (tm->route_pool, tr);
+ memset (tr, 0, sizeof (*tr));
+
+ again:
+ rf = random_f64 (&seed);
+ tr->mask_width = (u32) (min_mask_bits
+ + rf * (max_mask_bits - min_mask_bits));
+ tmp = random_u32 (&seed);
+ tmp &= masks[tr->mask_width];
+ tr->address.as_u32 = clib_host_to_net_u32(tmp);
+
+ /* We can't add the same address/mask twice... */
+ for (j = 0; j < i; j++)
+ {
+ test_route_t *prev;
+ prev = pool_elt_at_index (tm->route_pool, j);
+ if ((prev->address.as_u32 == tr->address.as_u32)
+ && (prev->mask_width == tr->mask_width))
+ goto again;
+ }
+
+ rf = random_f64 (&seed);
+ tr->interface_id = (u32) (rf * ninterfaces);
+ }
+
+ /* Add them */
+ for (i = 0; i < nroutes; i++)
+ {
+ tr = pool_elt_at_index (tm->route_pool, i);
+ cmd = format (0, "add table %d %U/%d via test-eth%d",
+ table_id,
+ format_ip4_address, &tr->address,
+ tr->mask_width, tr->interface_id);
+ vec_add1(cmd,0);
+ if (verbose)
+ fformat(stderr, "ip route %s\n", cmd);
+ unformat_init_string (&cmd_input, (char *) cmd, vec_len(cmd)-1);
+ error = vnet_ip_route_cmd (vm, &cmd_input, cmd_arg);
+ if (error)
+ clib_error_report(error);
+ unformat_free (&cmd_input);
+ vec_free(cmd);
+ }
+ /* Probe them */
+ for (i = 0; i < nroutes; i++)
+ {
+ tr = pool_elt_at_index (tm->route_pool, i);
+ if (!ip4_lookup_validate (&tr->address, table_index))
+ {
+ if (verbose)
+ fformat (stderr, "test lookup table %d %U\n",
+ table_index, format_ip4_address, &tr->address);
+
+ fformat (stderr, "FAIL-after-insert: %U/%d\n",
+ format_ip4_address, &tr->address,
+ tr->mask_width);
+ }
+ }
+
+ /* Delete them */
+ for (i = 0; i < nroutes; i++)
+ {
+ int j;
+ tr = pool_elt_at_index (tm->route_pool, i);
+ if (0)
+ cmd = format (0, "del table %d %U/%d via test-eth%d",
+ table_id,
+ format_ip4_address, &tr->address,
+ tr->mask_width, tr->interface_id);
+ else
+ cmd = format (0, "del table %d %U/%d",
+ table_id,
+ format_ip4_address, &tr->address,
+ tr->mask_width);
+ vec_add1(cmd,0);
+ if (verbose)
+ fformat(stderr, "ip route %s\n", cmd);
+ unformat_init_string (&cmd_input, (char *) cmd, vec_len(cmd)-1);
+ error = vnet_ip_route_cmd (vm, &cmd_input, cmd_arg);
+ if (error)
+ clib_error_report(error);
+ unformat_free (&cmd_input);
+ vec_free(cmd);
+
+ /* Make sure all undeleted routes still work */
+ for (j = i+1; j < nroutes; j++)
+ {
+ test_route_t *rr; /* remaining route */
+ rr = pool_elt_at_index (tm->route_pool, j);
+ if (!ip4_lookup_validate (&rr->address, table_index))
+ {
+ if (verbose)
+ fformat (stderr, "test lookup table %d %U\n",
+ table_index, format_ip4_address, &rr->address);
+
+ fformat (stderr, "FAIL: %U/%d AWOL\n",
+ format_ip4_address, &rr->address,
+ rr->mask_width);
+ fformat (stderr, " iter %d after %d of %d deletes\n",
+ iter, i, nroutes);
+ fformat (stderr, " last route deleted %U/%d\n",
+ format_ip4_address, &tr->address,
+ tr->mask_width);
+ }
+ }
+ }
+
+ pool_free (tm->route_pool);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_route_command, static) = {
+ .path = "test route",
+ .short_help = "test route",
+ .function = thrash,
+};
+
+clib_error_t *test_route_init (vlib_main_t *vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (test_route_init);
diff --git a/vnet/vnet/ip/ip6.h b/vnet/vnet/ip/ip6.h
new file mode 100644
index 00000000000..a5c322a2fa5
--- /dev/null
+++ b/vnet/vnet/ip/ip6.h
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip6.h: ip6 main include file
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_ip6_h
+#define included_ip_ip6_h
+
+#include <vlib/mc.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vnet/ip/lookup.h>
+
+#include <vppinfra/bihash_24_8.h>
+#include <vppinfra/bihash_template.h>
+
+/*
+ * Default size of the ip6 fib hash table
+ */
+#define IP6_FIB_DEFAULT_HASH_NUM_BUCKETS (64 * 1024)
+#define IP6_FIB_DEFAULT_HASH_MEMORY_SIZE (32<<20)
+
+typedef struct {
+ ip6_address_t addr;
+ u32 dst_address_length;
+ u32 vrf_index;
+} ip6_fib_key_t;
+
+typedef struct {
+ /* Table ID (hash key) for this FIB. */
+ u32 table_id;
+
+ /* Index into FIB vector. */
+ u32 index;
+
+ /* flow hash configuration */
+ u32 flow_hash_config;
+} ip6_fib_t;
+
+struct ip6_main_t;
+
+typedef void (ip6_add_del_route_function_t)
+ (struct ip6_main_t * im,
+ uword opaque,
+ ip6_fib_t * fib,
+ u32 flags,
+ ip6_address_t * address,
+ u32 address_length,
+ void * old_result,
+ void * new_result);
+
+typedef struct {
+ ip6_add_del_route_function_t * function;
+ uword required_flags;
+ uword function_opaque;
+} ip6_add_del_route_callback_t;
+
+typedef void (ip6_add_del_interface_address_function_t)
+ (struct ip6_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip6_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_del);
+
+typedef struct {
+ ip6_add_del_interface_address_function_t * function;
+ uword function_opaque;
+} ip6_add_del_interface_address_callback_t;
+
+typedef enum {
+ /* First check access list to either permit or deny this
+ packet based on classification. */
+ IP6_RX_FEATURE_CHECK_ACCESS,
+
+ /* RPF check: verify that source address is reachable via
+ RX interface or via any interface. */
+ IP6_RX_FEATURE_CHECK_SOURCE_REACHABLE_VIA_RX,
+ IP6_RX_FEATURE_CHECK_SOURCE_REACHABLE_VIA_ANY,
+
+ /* IPSec */
+ IP6_RX_FEATURE_IPSEC,
+
+ /* Intercept and decap L2TPv3 packets. */
+ IP6_RX_FEATURE_L2TPV3,
+
+  /* vPath forwarding: does not return control to the next feature,
+     so any feature that must run before vPath forwarding has to be
+     placed prior to this entry */
+ IP6_RX_FEATURE_VPATH,
+
+ /* Must be last: perform forwarding lookup. */
+ IP6_RX_FEATURE_LOOKUP,
+
+ IP6_N_RX_FEATURE,
+} ip6_rx_feature_type_t;
+
+typedef struct ip6_main_t {
+ BVT(clib_bihash) ip6_lookup_table;
+
+ ip_lookup_main_t lookup_main;
+
+ /* bitmap / refcounts / vector of mask widths to search */
+ uword * non_empty_dst_address_length_bitmap;
+ u8 * prefix_lengths_in_search_order;
+ i32 dst_address_length_refcounts[129];
+
+ /* Vector of FIBs. */
+ ip6_fib_t * fibs;
+
+ ip6_address_t fib_masks[129];
+
+ /* Table index indexed by software interface. */
+ u32 * fib_index_by_sw_if_index;
+
+ /* Hash table mapping table id to fib index.
+ ID space is not necessarily dense; index space is dense. */
+ uword * fib_index_by_table_id;
+
+ /* Vector of functions to call when routes are added/deleted. */
+ ip6_add_del_route_callback_t * add_del_route_callbacks;
+
+ /* Hash table mapping interface rewrite adjacency index by sw if index. */
+ uword * interface_route_adj_index_by_sw_if_index;
+
+ /* Functions to call when interface address changes. */
+ ip6_add_del_interface_address_callback_t * add_del_interface_address_callbacks;
+
+ /* Template used to generate IP6 neighbor solicitation packets. */
+ vlib_packet_template_t discover_neighbor_packet_template;
+
+ u32 * discover_neighbor_next_index_by_hw_if_index;
+
+ /* ip6 lookup table config parameters */
+ u32 lookup_table_nbuckets;
+ uword lookup_table_size;
+
+ /* Seed for Jenkins hash used to compute ip6 flow hash. */
+ u32 flow_hash_seed;
+
+ struct {
+ /* TTL to use for host generated packets. */
+ u8 ttl;
+
+ u8 pad[3];
+ } host_config;
+} ip6_main_t;
+
+/* Global ip6 main structure. */
+extern ip6_main_t ip6_main;
+
+/* Global ip6 input node. Errors get attached to ip6 input node. */
+extern vlib_node_registration_t ip6_input_node;
+extern vlib_node_registration_t ip6_rewrite_node;
+extern vlib_node_registration_t ip6_discover_neighbor_node;
+
+extern vlib_node_registration_t ip6_icmp_neighbor_discovery_event_node;
+
+/* ipv6 neighbor discovery - timer/event types */
+typedef enum {
+ ICMP6_ND_EVENT_INIT,
+} ip6_icmp_neighbor_discovery_event_type_t;
+
+typedef union {
+ u32 add_del_swindex;
+ struct {
+ u32 up_down_swindex;
+ u32 fib_index;
+ } up_down_event;
+} ip6_icmp_neighbor_discovery_event_data_t;
+
+u32 ip6_fib_lookup (ip6_main_t * im, u32 sw_if_index, ip6_address_t * dst);
+u32 ip6_fib_lookup_with_table (ip6_main_t * im, u32 fib_index,
+ ip6_address_t * dst);
+ip6_fib_t * find_ip6_fib_by_table_index_or_id (ip6_main_t * im,
+ u32 table_index_or_id,
+ u32 flags);
+
+always_inline uword
+ip6_destination_matches_route (ip6_main_t * im,
+ ip6_address_t * key,
+ ip6_address_t * dest,
+ uword dest_length)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (key->as_uword); i++)
+ {
+ if ((key->as_uword[i] ^ dest->as_uword[i]) & im->fib_masks[dest_length].as_uword[i])
+ return 0;
+ }
+ return 1;
+}
+
+always_inline uword
+ip6_destination_matches_interface (ip6_main_t * im,
+ ip6_address_t * key,
+ ip_interface_address_t * ia)
+{
+ ip6_address_t * a = ip_interface_address_get_address (&im->lookup_main, ia);
+ return ip6_destination_matches_route (im, key, a, ia->address_length);
+}
+
+/* As above, but handles unaligned destinations (e.g., works directly on the IP header of a packet). */
+always_inline uword
+ip6_unaligned_destination_matches_route (ip6_main_t * im,
+ ip6_address_t * key,
+ ip6_address_t * dest,
+ uword dest_length)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (key->as_uword); i++)
+ {
+ if ((clib_mem_unaligned (&key->as_uword[i], uword) ^ dest->as_uword[i]) & im->fib_masks[dest_length].as_uword[i])
+ return 0;
+ }
+ return 1;
+}
+
+always_inline void
+ip6_src_address_for_packet (ip6_main_t * im, vlib_buffer_t * p, ip6_address_t * src, u32 sw_if_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = ip_interface_address_for_packet (lm, p, sw_if_index);
+ ip6_address_t * a = ip_interface_address_get_address (lm, ia);
+ *src = a[0];
+}
+
+always_inline u32
+ip6_src_lookup_for_packet (ip6_main_t * im, vlib_buffer_t * b, ip6_header_t * i)
+{
+ if (vnet_buffer (b)->ip.adj_index[VLIB_RX] == ~0)
+ vnet_buffer (b)->ip.adj_index[VLIB_RX]
+ = ip6_fib_lookup (im, vnet_buffer (b)->sw_if_index[VLIB_RX],
+ &i->src_address);
+ return vnet_buffer (b)->ip.adj_index[VLIB_RX];
+}
+
+/* Find interface address which matches destination. */
+always_inline ip6_address_t *
+ip6_interface_address_matching_destination (ip6_main_t * im, ip6_address_t * dst, u32 sw_if_index,
+ ip_interface_address_t ** result_ia)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia;
+ ip6_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip6_address_t * a = ip_interface_address_get_address (lm, ia);
+ if (ip6_destination_matches_route (im, dst, a, ia->address_length))
+ {
+ result = a;
+ break;
+ }
+ }));
+ if (result_ia)
+ *result_ia = result ? ia : 0;
+ return result;
+}
+
+clib_error_t *
+ip6_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
+ ip6_address_t * address, u32 address_length,
+ u32 is_del);
+
+int ip6_address_compare (ip6_address_t * a1, ip6_address_t * a2);
+
+/* Add/del a route to the FIB. */
+
+#define IP6_ROUTE_FLAG_ADD (0 << 0)
+#define IP6_ROUTE_FLAG_DEL (1 << 0)
+#define IP6_ROUTE_FLAG_TABLE_ID (0 << 1)
+#define IP6_ROUTE_FLAG_FIB_INDEX (1 << 1)
+#define IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY (1 << 2)
+#define IP6_ROUTE_FLAG_NO_REDISTRIBUTE (1 << 3)
+#define IP6_ROUTE_FLAG_NOT_LAST_IN_GROUP (1 << 4)
+/* Dynamic route created via neighbor discovery. */
+#define IP6_ROUTE_FLAG_NEIGHBOR (1 << 5)
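+
+/* Flags are combined when calling ip6_add_del_route(); e.g.
+   vnet_ip6_fib_init() installs its local routes with
+   IP6_ROUTE_FLAG_ADD | IP6_ROUTE_FLAG_FIB_INDEX
+   | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY | IP6_ROUTE_FLAG_NO_REDISTRIBUTE. */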
+
+typedef struct {
+ /* IP6_ROUTE_FLAG_* */
+ u32 flags;
+
+ /* Either index of fib or table_id to hash and get fib.
+ IP6_ROUTE_FLAG_FIB_INDEX specifies index; otherwise table_id is assumed. */
+ u32 table_index_or_table_id;
+
+ /* Destination address (prefix) and length. */
+ ip6_address_t dst_address;
+ u32 dst_address_length;
+
+ /* Adjacency to use for this destination. */
+ u32 adj_index;
+
+  /* If specified, adjacencies to add and then
+     use for this destination.  add_adj/n_add_adj
+     override adj_index if specified. */
+ ip_adjacency_t * add_adj;
+ u32 n_add_adj;
+} ip6_add_del_route_args_t;
+
+void ip6_add_del_route (ip6_main_t * im, ip6_add_del_route_args_t * args);
+
+void ip6_add_del_route_next_hop (ip6_main_t * im,
+ u32 flags,
+ ip6_address_t * dst_address,
+ u32 dst_address_length,
+ ip6_address_t * next_hop,
+ u32 next_hop_sw_if_index,
+ u32 next_hop_weight, u32 adj_index,
+ u32 explicit_fib_index);
+u32
+ip6_get_route (ip6_main_t * im,
+ u32 fib_index_or_table_id,
+ u32 flags,
+ ip6_address_t * address,
+ u32 address_length);
+
+void
+ip6_foreach_matching_route (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip6_address_t * address,
+ u32 address_length,
+ ip6_address_t ** results,
+			    u8 ** result_lengths);
+
+void ip6_delete_matching_routes (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip6_address_t * address,
+ u32 address_length);
+
+void ip6_maybe_remap_adjacencies (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags);
+
+void ip6_adjacency_set_interface_route (vnet_main_t * vnm,
+ ip_adjacency_t * adj,
+ u32 sw_if_index,
+ u32 if_address_index);
+
+clib_error_t *
+ip6_probe_neighbor (vlib_main_t * vm, ip6_address_t * dst, u32 sw_if_index);
+
+clib_error_t *
+ip6_set_neighbor_limit (u32 neighbor_limit);
+
+uword
+ip6_tcp_register_listener (vlib_main_t * vm,
+ u16 dst_port,
+ u32 next_node_index);
+uword
+ip6_udp_register_listener (vlib_main_t * vm,
+ u16 dst_port,
+ u32 next_node_index);
+
+u16 ip6_tcp_udp_icmp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, ip6_header_t * ip0, int *bogus_lengthp);
+
+void ip6_register_protocol (u32 protocol, u32 node_index);
+
+serialize_function_t serialize_vnet_ip6_main, unserialize_vnet_ip6_main;
+
+int
+vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * a,
+ u8 * link_layer_address,
+ uword n_bytes_link_layer_address);
+int
+vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * a,
+ u8 * link_layer_address,
+ uword n_bytes_link_layer_address);
+void
+vnet_ip6_fib_init (ip6_main_t * im, u32 fib_index);
+
+void
+ip6_link_local_address_from_ethernet_mac_address (ip6_address_t *ip,
+ u8 *mac);
+
+void
+ip6_ethernet_mac_address_from_link_local_address (u8 *mac,
+ ip6_address_t *ip);
+
+int vnet_set_ip6_flow_hash (u32 table_id, u32 flow_hash_config);
+
+int
+ip6_neighbor_ra_config(vlib_main_t * vm, u32 sw_if_index,
+ u8 surpress, u8 managed, u8 other,
+ u8 ll_option, u8 send_unicast, u8 cease,
+ u8 use_lifetime, u32 lifetime,
+ u32 initial_count, u32 initial_interval,
+ u32 max_interval, u32 min_interval,
+ u8 is_no);
+
+int
+ip6_neighbor_ra_prefix(vlib_main_t * vm, u32 sw_if_index,
+ ip6_address_t *prefix_addr, u8 prefix_len,
+ u8 use_default, u32 val_lifetime, u32 pref_lifetime,
+ u8 no_advertise, u8 off_link, u8 no_autoconfig, u8 no_onlink,
+ u8 is_no);
+
+
+clib_error_t *
+enable_ip6_interface(vlib_main_t * vm,
+ u32 sw_if_index);
+
+clib_error_t *
+disable_ip6_interface(vlib_main_t * vm,
+ u32 sw_if_index);
+
+int
+ip6_interface_enabled(vlib_main_t * vm,
+ u32 sw_if_index);
+
+clib_error_t *
+set_ip6_link_local_address(vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t *address,
+ u8 address_length);
+
+void vnet_register_ip6_neighbor_resolution_event(vnet_main_t * vnm,
+ void * address_arg,
+ uword node_index,
+ uword type_opaque,
+ uword data);
+
+int vnet_set_ip6_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
+ u32 table_index);
+extern vlib_node_registration_t ip6_lookup_node;
+
+/* Compute flow hash.  We'll use it to select a path among multipath
+   adjacencies for this flow.  And other things. */
+always_inline u32
+ip6_compute_flow_hash (ip6_header_t * ip, u32 flow_hash_config)
+{
+ tcp_header_t * tcp = (void *) (ip + 1);
+ u64 a, b, c;
+ u64 t1, t2;
+ uword is_tcp_udp = (ip->protocol == IP_PROTOCOL_TCP
+ || ip->protocol == IP_PROTOCOL_UDP);
+
+ t1 = (ip->src_address.as_u64[0] ^ ip->src_address.as_u64[1]);
+ t1 = (flow_hash_config & IP_FLOW_HASH_SRC_ADDR) ? t1 : 0;
+
+ t2 = (ip->dst_address.as_u64[0] ^ ip->dst_address.as_u64[1]);
+ t2 = (flow_hash_config & IP_FLOW_HASH_DST_ADDR) ? t2 : 0;
+
+ a = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t2 : t1;
+ b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2;
+ b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0;
+
+ t1 = is_tcp_udp ? tcp->ports.src : 0;
+ t2 = is_tcp_udp ? tcp->ports.dst : 0;
+
+ t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0;
+ t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0;
+
+ c = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ?
+ ((t1<<16) | t2) : ((t2<<16) | t1);
+
+ hash_mix64 (a, b, c);
+ return (u32) c;
+}
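+
+/* Illustrative use (see ip6_lookup in ip6_forward.c): with a power-of-2
+   multipath adjacency block, the hash selects a bucket so that all
+   packets of a flow follow the same path:
+
+     vnet_buffer (p0)->ip.flow_hash = ip6_compute_flow_hash (ip0, cfg);
+     adj_index0 += vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1);
+*/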
+
+#endif /* included_ip_ip6_h */
diff --git a/vnet/vnet/ip/ip6_error.h b/vnet/vnet/ip/ip6_error.h
new file mode 100644
index 00000000000..93754a10fcc
--- /dev/null
+++ b/vnet/vnet/ip/ip6_error.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip6_error.h: ip6 fast path errors
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_ip6_error_h
+#define included_ip_ip6_error_h
+
+#define foreach_ip6_error \
+ /* Must be first. */ \
+ _ (NONE, "valid ip6 packets") \
+ \
+ /* Errors signalled by ip6-input */ \
+ _ (TOO_SHORT, "ip6 length < 40 bytes") \
+ _ (BAD_LENGTH, "ip6 length > l2 length") \
+ _ (VERSION, "ip6 version != 6") \
+ _ (TIME_EXPIRED, "ip6 ttl <= 1") \
+ \
+ /* Errors signalled by ip6-rewrite. */ \
+ _ (MTU_EXCEEDED, "ip6 MTU exceeded") \
+ _ (DST_LOOKUP_MISS, "ip6 destination lookup miss") \
+ _ (SRC_LOOKUP_MISS, "ip6 source lookup miss") \
+ _ (ADJACENCY_DROP, "ip6 adjacency drop") \
+ _ (ADJACENCY_PUNT, "ip6 adjacency punt") \
+ \
+ /* Errors signalled by ip6-local. */ \
+ _ (UNKNOWN_PROTOCOL, "unknown ip protocol") \
+ _ (UDP_CHECKSUM, "bad udp checksum") \
+ _ (TCP_CHECKSUM, "bad tcp checksum") \
+ _ (ICMP_CHECKSUM, "bad icmp checksum") \
+ _ (UDP_LENGTH, "inconsistent udp/ip lengths") \
+ \
+ /* Errors signalled by {tcp6,udp6}-lookup. */ \
+ _ (UNKNOWN_UDP_PORT, "no listener for udp port") \
+ _ (UNKNOWN_TCP_PORT, "no listener for tcp port") \
+ \
+ /* Spoofed packets in ip6-rewrite-local */ \
+ _ (SPOOFED_LOCAL_PACKETS, "ip6 spoofed local-address packet drops") \
+ \
+ /* Errors signalled by ip6-inacl */ \
+ _ (INACL_TABLE_MISS, "input ACL table-miss drops") \
+ _ (INACL_SESSION_DENY, "input ACL session deny drops")
+
+typedef enum {
+#define _(sym,str) IP6_ERROR_##sym,
+ foreach_ip6_error
+#undef _
+ IP6_N_ERROR,
+} ip6_error_t;
+
+#endif /* included_ip_ip6_error_h */
diff --git a/vnet/vnet/ip/ip6_format.c b/vnet/vnet/ip/ip6_format.c
new file mode 100644
index 00000000000..1a2810e16ec
--- /dev/null
+++ b/vnet/vnet/ip/ip6_format.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip6_format.c: ip6 formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+/* Format an IP6 address. */
+u8 * format_ip6_address (u8 * s, va_list * args)
+{
+ ip6_address_t * a = va_arg (*args, ip6_address_t *);
+ u32 max_zero_run = 0, this_zero_run = 0;
+ int max_zero_run_index = -1, this_zero_run_index=0;
+ int in_zero_run = 0, i;
+ int last_double_colon = 0;
+
+ /* Ugh, this is a pain. Scan forward looking for runs of 0's */
+ for (i = 0; i < ARRAY_LEN (a->as_u16); i++)
+ {
+ if (a->as_u16[i] == 0)
+ {
+ if (in_zero_run)
+ this_zero_run++;
+ else
+ {
+ in_zero_run = 1;
+ this_zero_run = 1;
+ this_zero_run_index = i;
+ }
+ }
+ else
+ {
+ if (in_zero_run)
+ {
+ /* offer to compress the biggest run of > 1 zero */
+ if (this_zero_run > max_zero_run && this_zero_run > 1)
+ {
+ max_zero_run_index = this_zero_run_index;
+ max_zero_run = this_zero_run;
+ }
+ }
+ in_zero_run = 0;
+ this_zero_run = 0;
+ }
+ }
+
+ if (in_zero_run)
+ {
+ if (this_zero_run > max_zero_run && this_zero_run > 1)
+ {
+ max_zero_run_index = this_zero_run_index;
+ max_zero_run = this_zero_run;
+ }
+ }
+
+ for (i = 0; i < ARRAY_LEN (a->as_u16); i++)
+ {
+ if (i == max_zero_run_index)
+ {
+ s = format (s, "::");
+ i += max_zero_run - 1;
+ last_double_colon = 1;
+ }
+ else
+ {
+ s = format (s, "%s%x",
+ (last_double_colon || i == 0) ? "" : ":",
+ clib_net_to_host_u16 (a->as_u16[i]));
+ last_double_colon = 0;
+ }
+ }
+
+ return s;
+}
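+
+/* Example: 2001:db8:0:0:0:0:0:1 renders as "2001:db8::1" -- the longest
+   run of two or more zero hex quads is compressed to "::"; a lone zero
+   quad is printed as-is. */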
+
+/* Format an IP6 route destination and length. */
+u8 * format_ip6_address_and_length (u8 * s, va_list * args)
+{
+ ip6_address_t * a = va_arg (*args, ip6_address_t *);
+ u8 l = va_arg (*args, u32);
+ return format (s, "%U/%d", format_ip6_address, a, l);
+}
+
+/* Parse an IP6 address. */
+uword unformat_ip6_address (unformat_input_t * input, va_list * args)
+{
+ ip6_address_t * result = va_arg (*args, ip6_address_t *);
+ u16 hex_quads[8];
+ uword hex_quad, n_hex_quads, hex_digit, n_hex_digits;
+ uword c, n_colon, double_colon_index;
+
+ n_hex_quads = hex_quad = n_hex_digits = n_colon = 0;
+ double_colon_index = ARRAY_LEN (hex_quads);
+ while ((c = unformat_get_input (input)) != UNFORMAT_END_OF_INPUT)
+ {
+ hex_digit = 16;
+ if (c >= '0' && c <= '9')
+ hex_digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ hex_digit = c + 10 - 'a';
+ else if (c >= 'A' && c <= 'F')
+ hex_digit = c + 10 - 'A';
+ else if (c == ':' && n_colon < 2)
+ n_colon++;
+ else
+ {
+ unformat_put_input (input);
+ break;
+ }
+
+ /* Too many hex quads. */
+ if (n_hex_quads >= ARRAY_LEN (hex_quads))
+ return 0;
+
+ if (hex_digit < 16)
+ {
+ hex_quad = (hex_quad << 4) | hex_digit;
+
+ /* Hex quad must fit in 16 bits. */
+ if (n_hex_digits >= 4)
+ return 0;
+
+ n_colon = 0;
+ n_hex_digits++;
+ }
+
+ /* Save position of :: */
+ if (n_colon == 2)
+ {
+ /* More than one :: ? */
+ if (double_colon_index < ARRAY_LEN (hex_quads))
+ return 0;
+ double_colon_index = n_hex_quads;
+ }
+
+ if (n_colon > 0 && n_hex_digits > 0)
+ {
+ hex_quads[n_hex_quads++] = hex_quad;
+ hex_quad = 0;
+ n_hex_digits = 0;
+ }
+ }
+
+ if (n_hex_digits > 0)
+ hex_quads[n_hex_quads++] = hex_quad;
+
+ {
+ word i;
+
+ /* Expand :: to appropriate number of zero hex quads. */
+ if (double_colon_index < ARRAY_LEN (hex_quads))
+ {
+ word n_zero = ARRAY_LEN (hex_quads) - n_hex_quads;
+
+ for (i = n_hex_quads - 1; i >= (signed) double_colon_index; i--)
+ hex_quads[n_zero + i] = hex_quads[i];
+
+ for (i = 0; i < n_zero; i++)
+ {
+ ASSERT ((double_colon_index + i) < ARRAY_LEN (hex_quads));
+ hex_quads[double_colon_index + i] = 0;
+ }
+
+ n_hex_quads = ARRAY_LEN (hex_quads);
+ }
+
+ /* Too few hex quads given. */
+ if (n_hex_quads < ARRAY_LEN (hex_quads))
+ return 0;
+
+ for (i = 0; i < ARRAY_LEN (hex_quads); i++)
+ result->as_u16[i] = clib_host_to_net_u16 (hex_quads[i]);
+
+ return 1;
+ }
+}
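+
+/* Example: "::1" parses as one explicit hex quad plus a double colon;
+   the "::" expands to the seven missing zero quads, producing
+   0:0:0:0:0:0:0:1. */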
+
+/* Format an IP6 header. */
+u8 * format_ip6_header (u8 * s, va_list * args)
+{
+ ip6_header_t * ip = va_arg (*args, ip6_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 i, ip_version, traffic_class, flow_label;
+ uword indent;
+
+ /* Nothing to do. */
+ if (max_header_bytes < sizeof (ip[0]))
+ return format (s, "IP header truncated");
+
+ indent = format_get_indent (s);
+ indent += 2;
+
+ s = format (s, "%U: %U -> %U",
+ format_ip_protocol, ip->protocol,
+ format_ip6_address, &ip->src_address,
+ format_ip6_address, &ip->dst_address);
+
+ i = clib_net_to_host_u32 (ip->ip_version_traffic_class_and_flow_label);
+ ip_version = (i >> 28);
+ traffic_class = (i >> 20) & 0xff;
+ flow_label = i & pow2_mask (20);
+
+ if (ip_version != 6)
+ s = format (s, "\n%Uversion %d",
+ format_white_space, indent, ip_version);
+
+ s = format (s, "\n%Utos 0x%02x, flow label 0x%x, hop limit %d, payload length %d",
+ format_white_space, indent,
+ traffic_class, flow_label, ip->hop_limit,
+ clib_net_to_host_u16 (ip->payload_length));
+
+ /* Recurse into next protocol layer. */
+ if (max_header_bytes != 0 && sizeof (ip[0]) < max_header_bytes)
+ {
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol);
+
+ if (pi && pi->format_header)
+ s = format (s, "\n%U%U",
+ format_white_space, indent - 2,
+ pi->format_header,
+ /* next protocol header */ (void*) (ip + 1),
+ max_header_bytes - sizeof (ip[0]));
+ }
+
+ return s;
+}
+
+/* Parse an IP6 header. */
+uword unformat_ip6_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ ip6_header_t * ip;
+ int old_length;
+
+ /* Allocate space for IP header. */
+ {
+ void * p;
+
+ old_length = vec_len (*result);
+ vec_add2 (*result, p, sizeof (ip[0]));
+ ip = p;
+ }
+
+ memset (ip, 0, sizeof (ip[0]));
+ ip->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (6 << 28);
+
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_ip_protocol, &ip->protocol,
+ unformat_ip6_address, &ip->src_address,
+ unformat_ip6_address, &ip->dst_address))
+ return 0;
+
+ /* Parse options. */
+ while (1)
+ {
+ int i;
+
+ if (unformat (input, "tos %U", unformat_vlib_number, &i))
+ ip->ip_version_traffic_class_and_flow_label |= clib_host_to_net_u32 ((i & 0xff) << 20);
+
+ else if (unformat (input, "hop-limit %U", unformat_vlib_number, &i))
+ ip->hop_limit = i;
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ /* Recurse into next protocol layer. */
+ {
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol);
+
+ if (pi && pi->unformat_header)
+ {
+ if (! unformat_user (input, pi->unformat_header, result))
+ return 0;
+
+ /* Result may have moved. */
+ ip = (void *) *result + old_length;
+ }
+ }
+
+ ip->payload_length = clib_host_to_net_u16 (vec_len (*result) - (old_length + sizeof (ip[0])));
+
+ return 1;
+}
diff --git a/vnet/vnet/ip/ip6_forward.c b/vnet/vnet/ip/ip6_forward.c
new file mode 100644
index 00000000000..f0065e969f8
--- /dev/null
+++ b/vnet/vnet/ip/ip6_forward.c
@@ -0,0 +1,2724 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip6_forward.c: IP v6 forwarding
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h> /* for ethernet_header_t */
+#include <vnet/srp/srp.h> /* for srp_hw_interface_class */
+#include <vppinfra/cache.h>
+
+#include <vppinfra/bihash_template.c>
+
+static void compute_prefix_lengths_in_search_order (ip6_main_t * im)
+{
+ int i;
+ vec_reset_length (im->prefix_lengths_in_search_order);
+ /* Note: bitmap reversed so this is in fact a longest prefix match */
+ clib_bitmap_foreach (i, im->non_empty_dst_address_length_bitmap,
+ ({
+ int dst_address_length = 128 - i;
+ vec_add1 (im->prefix_lengths_in_search_order, dst_address_length);
+ }));
+}
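+
+/* Worked example: with only /64 and /128 routes installed, bits 64 and 0
+   of the (reversed) bitmap are set, the foreach visits i = 0 then i = 64,
+   and the search order becomes { 128, 64 }: longest prefix first. */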
+
+u32
+ip6_fib_lookup_with_table (ip6_main_t * im, u32 fib_index, ip6_address_t * dst)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ int i, len;
+ int rv;
+ BVT(clib_bihash_kv) kv, value;
+
+ len = vec_len (im->prefix_lengths_in_search_order);
+
+ for (i = 0; i < len; i++)
+ {
+ int dst_address_length = im->prefix_lengths_in_search_order[i];
+ ip6_address_t * mask = &im->fib_masks[dst_address_length];
+
+ ASSERT(dst_address_length >= 0 && dst_address_length <= 128);
+
+ kv.key[0] = dst->as_u64[0] & mask->as_u64[0];
+ kv.key[1] = dst->as_u64[1] & mask->as_u64[1];
+ kv.key[2] = ((u64)((fib_index))<<32) | dst_address_length;
+
+ rv = BV(clib_bihash_search_inline_2)(&im->ip6_lookup_table, &kv, &value);
+ if (rv == 0)
+ return value.value;
+ }
+
+ return lm->miss_adj_index;
+}
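+
+/* Bihash key layout used above (and by ip6_add_del_route below):
+   key[0..1] hold the masked destination address; key[2] packs the FIB
+   index in the upper 32 bits and the prefix length in the lower 32 bits.
+   The lookup thus probes one exact-match key per non-empty prefix length,
+   longest first, falling back to the miss adjacency. */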
+
+u32 ip6_fib_lookup (ip6_main_t * im, u32 sw_if_index, ip6_address_t * dst)
+{
+ u32 fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
+ return ip6_fib_lookup_with_table (im, fib_index, dst);
+}
+
+void
+vnet_ip6_fib_init (ip6_main_t * im, u32 fib_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip6_add_del_route_args_t a;
+ ip_adjacency_t * adj;
+
+ memset(&a, 0x0, sizeof(ip6_add_del_route_args_t));
+
+ a.table_index_or_table_id = fib_index;
+ a.flags = (IP6_ROUTE_FLAG_ADD
+ | IP6_ROUTE_FLAG_FIB_INDEX
+ | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY
+ | IP6_ROUTE_FLAG_NO_REDISTRIBUTE);
+
+ /* Add ff02::1:ff00:0/104 via local route for all tables.
+ This is required for neighbor discovery to work. */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a.adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+ adj->if_address_index = ~0;
+ adj->rewrite_header.data_bytes = 0;
+
+ ip6_set_solicited_node_multicast_address (&a.dst_address, 0);
+
+ a.dst_address_length = 104;
+ ip6_add_del_route (im, &a);
+
+ /* Add all-routers multicast address via local route for all tables */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a.adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+ adj->if_address_index = ~0;
+ adj->rewrite_header.data_bytes = 0;
+
+ ip6_set_reserved_multicast_address (&a.dst_address,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_all_routers);
+
+ a.dst_address_length = 128;
+ ip6_add_del_route (im, &a);
+
+ /* Add all-nodes multicast address via local route for all tables */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a.adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+ adj->if_address_index = ~0;
+ adj->rewrite_header.data_bytes = 0;
+
+ ip6_set_reserved_multicast_address (&a.dst_address,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_all_hosts);
+
+ a.dst_address_length = 128;
+ ip6_add_del_route (im, &a);
+
+ /* Add all-mldv2 multicast address via local route for all tables */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a.adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+ adj->if_address_index = ~0;
+ adj->rewrite_header.data_bytes = 0;
+
+ ip6_set_reserved_multicast_address (&a.dst_address,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_mldv2_routers);
+
+ a.dst_address_length = 128;
+ ip6_add_del_route (im, &a);
+}
+
+static ip6_fib_t *
+create_fib_with_table_id (ip6_main_t * im, u32 table_id)
+{
+ ip6_fib_t * fib;
+ hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs));
+ vec_add2 (im->fibs, fib, 1);
+ fib->table_id = table_id;
+ fib->index = fib - im->fibs;
+ fib->flow_hash_config = IP_FLOW_HASH_DEFAULT;
+ vnet_ip6_fib_init (im, fib->index);
+ return fib;
+}
+
+ip6_fib_t *
+find_ip6_fib_by_table_index_or_id (ip6_main_t * im, u32 table_index_or_id, u32 flags)
+{
+ uword * p, fib_index;
+
+ fib_index = table_index_or_id;
+ if (! (flags & IP6_ROUTE_FLAG_FIB_INDEX))
+ {
+ p = hash_get (im->fib_index_by_table_id, table_index_or_id);
+ if (! p)
+ return create_fib_with_table_id (im, table_index_or_id);
+ fib_index = p[0];
+ }
+ return vec_elt_at_index (im->fibs, fib_index);
+}
+
+void ip6_add_del_route (ip6_main_t * im, ip6_add_del_route_args_t * a)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip6_fib_t * fib;
+ ip6_address_t dst_address;
+ u32 dst_address_length, adj_index;
+ uword is_del;
+ u32 old_adj_index = ~0;
+ BVT(clib_bihash_kv) kv, value;
+
+ vlib_smp_unsafe_warning();
+
+ is_del = (a->flags & IP6_ROUTE_FLAG_DEL) != 0;
+
+ /* Either create new adjacency or use given one depending on arguments. */
+ if (a->n_add_adj > 0)
+ {
+ ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index);
+ ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
+ }
+ else
+ adj_index = a->adj_index;
+
+ dst_address = a->dst_address;
+ dst_address_length = a->dst_address_length;
+ fib = find_ip6_fib_by_table_index_or_id (im, a->table_index_or_table_id,
+ a->flags);
+
+ ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
+ ip6_address_mask (&dst_address, &im->fib_masks[dst_address_length]);
+
+ /* refcount accounting */
+ if (is_del)
+ {
+ ASSERT (im->dst_address_length_refcounts[dst_address_length] > 0);
+ if (--im->dst_address_length_refcounts[dst_address_length] == 0)
+ {
+ im->non_empty_dst_address_length_bitmap =
+ clib_bitmap_set (im->non_empty_dst_address_length_bitmap,
+ 128 - dst_address_length, 0);
+ compute_prefix_lengths_in_search_order (im);
+ }
+ }
+ else
+ {
+ im->dst_address_length_refcounts[dst_address_length]++;
+
+ im->non_empty_dst_address_length_bitmap =
+ clib_bitmap_set (im->non_empty_dst_address_length_bitmap,
+ 128 - dst_address_length, 1);
+ compute_prefix_lengths_in_search_order (im);
+ }
+
+ kv.key[0] = dst_address.as_u64[0];
+ kv.key[1] = dst_address.as_u64[1];
+ kv.key[2] = ((u64)((fib - im->fibs))<<32) | dst_address_length;
+
+ if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) == 0)
+ old_adj_index = value.value;
+
+ if (is_del)
+ BV(clib_bihash_add_del) (&im->ip6_lookup_table, &kv, 0 /* is_add */);
+ else
+ {
+ /* Make sure adj index is valid. */
+ if (CLIB_DEBUG > 0)
+ (void) ip_get_adjacency (lm, adj_index);
+
+ kv.value = adj_index;
+
+ BV(clib_bihash_add_del) (&im->ip6_lookup_table, &kv, 1 /* is_add */);
+ }
+
+ /* Delete old adjacency index if present and changed. */
+ {
+ if (! (a->flags & IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY)
+ && old_adj_index != ~0
+ && old_adj_index != adj_index)
+ ip_del_adjacency (lm, old_adj_index);
+ }
+}
+
+void
+ip6_add_del_route_next_hop (ip6_main_t * im,
+ u32 flags,
+ ip6_address_t * dst_address,
+ u32 dst_address_length,
+ ip6_address_t * next_hop,
+ u32 next_hop_sw_if_index,
+ u32 next_hop_weight, u32 adj_index,
+ u32 explicit_fib_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 fib_index;
+ ip6_fib_t * fib;
+ ip6_address_t masked_dst_address;
+ u32 old_mp_adj_index, new_mp_adj_index;
+ u32 dst_adj_index, nh_adj_index;
+ int rv;
+ ip_adjacency_t * dst_adj;
+ ip_multipath_adjacency_t * old_mp, * new_mp;
+ int is_del = (flags & IP6_ROUTE_FLAG_DEL) != 0;
+ int is_interface_next_hop;
+ clib_error_t * error = 0;
+ uword * nh_result;
+ BVT(clib_bihash_kv) kv, value;
+
+ vlib_smp_unsafe_warning();
+
+ if (explicit_fib_index == (u32)~0)
+ fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index);
+ else
+ fib_index = explicit_fib_index;
+
+ fib = vec_elt_at_index (im->fibs, fib_index);
+
+ /* Lookup next hop to be added or deleted. */
+ is_interface_next_hop = ip6_address_is_zero (next_hop);
+ if (adj_index == (u32)~0)
+ {
+ if (is_interface_next_hop)
+ {
+ nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index,
+ next_hop_sw_if_index);
+ if (nh_result)
+ nh_adj_index = *nh_result;
+ else
+ {
+ ip_adjacency_t * adj;
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &nh_adj_index);
+ ip6_adjacency_set_interface_route (vnm, adj,
+ next_hop_sw_if_index, ~0);
+ ip_call_add_del_adjacency_callbacks
+ (lm, nh_adj_index, /* is_del */ 0);
+ hash_set (im->interface_route_adj_index_by_sw_if_index,
+ next_hop_sw_if_index, nh_adj_index);
+ }
+ }
+ else
+ {
+ /* Look for the interface /128 route */
+ kv.key[0] = next_hop->as_u64[0];
+ kv.key[1] = next_hop->as_u64[1];
+ kv.key[2] = ((u64)((fib - im->fibs))<<32) | 128;
+
+ if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) < 0)
+ {
+ vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
+ error = clib_error_return (0, "next-hop %U/128 not in FIB",
+ format_ip6_address, next_hop);
+ goto done;
+ }
+
+ nh_adj_index = value.value;
+ }
+ }
+ else
+ {
+ /* Look for the interface /128 route */
+ kv.key[0] = next_hop->as_u64[0];
+ kv.key[1] = next_hop->as_u64[1];
+ kv.key[2] = ((u64)((fib - im->fibs))<<32) | 128;
+
+ if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) < 0)
+ {
+ vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
+ error = clib_error_return (0, "next-hop %U/128 not in FIB",
+ format_ip6_address, next_hop);
+ goto done;
+ }
+
+ nh_adj_index = value.value;
+ }
+
+ ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks));
+ masked_dst_address = dst_address[0];
+ ip6_address_mask (&masked_dst_address, &im->fib_masks[dst_address_length]);
+
+ kv.key[0] = masked_dst_address.as_u64[0];
+ kv.key[1] = masked_dst_address.as_u64[1];
+ kv.key[2] = ((u64)((fib - im->fibs))<<32) | dst_address_length;
+
+ rv = BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value);
+
+ if (rv == 0)
+ {
+ dst_adj_index = value.value;
+ dst_adj = ip_get_adjacency (lm, dst_adj_index);
+ }
+ else
+ {
+ /* For deletes destination must be known. */
+ if (is_del)
+ {
+ vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION;
+ error = clib_error_return (0, "unknown destination %U/%d",
+ format_ip6_address, dst_address,
+ dst_address_length);
+ goto done;
+ }
+
+ dst_adj_index = ~0;
+ dst_adj = 0;
+ }
+
+ /* Ignore adds of X/128 with next hop of X. */
+ if (! is_del
+ && dst_address_length == 128
+ && ip6_address_is_equal (dst_address, next_hop))
+ {
+ vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP;
+ error = clib_error_return (0, "prefix matches next hop %U/%d",
+ format_ip6_address, dst_address,
+ dst_address_length);
+ goto done;
+ }
+
+ old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0;
+
+ if (! ip_multipath_adjacency_add_del_next_hop
+ (lm, is_del,
+ dst_adj ? dst_adj->heap_handle : ~0,
+ nh_adj_index,
+ next_hop_weight,
+ &new_mp_adj_index))
+ {
+ vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP;
+ error = clib_error_return
+ (0, "requested deleting next-hop %U not found in multi-path",
+ format_ip6_address, next_hop);
+ goto done;
+ }
+
+ old_mp = new_mp = 0;
+ if (old_mp_adj_index != ~0)
+ old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
+ if (new_mp_adj_index != ~0)
+ new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index);
+
+ if (old_mp != new_mp)
+ {
+ ip6_add_del_route_args_t a;
+ a.table_index_or_table_id = fib_index;
+ a.flags = ((is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD)
+ | IP6_ROUTE_FLAG_FIB_INDEX
+ | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY
+ | (flags & IP6_ROUTE_FLAG_NO_REDISTRIBUTE));
+ a.dst_address = dst_address[0];
+ a.dst_address_length = dst_address_length;
+ a.adj_index = new_mp ? new_mp->adj_index : dst_adj_index;
+ a.add_adj = 0;
+ a.n_add_adj = 0;
+
+ ip6_add_del_route (im, &a);
+ }
+
+ done:
+ if (error)
+ clib_error_report (error);
+}
+
+u32
+ip6_get_route (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip6_address_t * address,
+ u32 address_length)
+{
+ ip6_fib_t * fib = find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
+ ip6_address_t masked_address;
+ BVT(clib_bihash_kv) kv, value;
+
+ ASSERT (address_length < ARRAY_LEN (im->fib_masks));
+ memcpy (&masked_address, address, sizeof (masked_address));
+ ip6_address_mask (&masked_address, &im->fib_masks[address_length]);
+
+ kv.key[0] = masked_address.as_u64[0];
+ kv.key[1] = masked_address.as_u64[1];
+ kv.key[2] = ((u64)((fib - im->fibs))<<32) | address_length;
+
+ if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) == 0)
+ return (value.value);
+ return 0;
+}
+
+void
+ip6_foreach_matching_route (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip6_address_t * dst_address,
+ u32 address_length,
+ ip6_address_t ** results,
+ u8 ** result_lengths)
+{
+ ip6_fib_t * fib =
+ find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
+ BVT(clib_bihash) * h = &im->ip6_lookup_table;
+ BVT(clib_bihash_value) * v;
+ clib_bihash_bucket_t * b;
+ int i, j, k;
+
+ if (*results)
+ _vec_len (*results) = 0;
+ if (*result_lengths)
+ _vec_len (*result_lengths) = 0;
+
+ /* Walk the table looking for routes which match the supplied address */
+ for (i = 0; i < h->nbuckets; i++)
+ {
+ b = &h->buckets [i];
+ if (b->offset == 0)
+ continue;
+
+ v = BV(clib_bihash_get_value) (h, b->offset);
+ for (j = 0; j < (1<<b->log2_pages); j++)
+ {
+ for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
+ {
+ if (BV(clib_bihash_is_free)(&v->kvp[k]))
+ continue;
+
+ if ((v->kvp[k].key[2]
+ == (((u64)((fib - im->fibs))<<32) | address_length))
+ && ip6_destination_matches_route
+ (im, dst_address, (ip6_address_t *) &v->kvp[k],
+ address_length))
+ {
+ ip6_address_t * a;
+
+ a = (ip6_address_t *)(&v->kvp[k]);
+
+ vec_add1 (*results, a[0]);
+ vec_add1 (*result_lengths, address_length);
+ }
+ }
+ v++;
+ }
+ }
+}
+
+void ip6_maybe_remap_adjacencies (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags)
+{
+#if SOONE
+ ip6_fib_t * fib
+ = find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags);
+#endif
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ if (lm->n_adjacency_remaps == 0)
+ return;
+
+ clib_warning ("unimplemented, please report to vpp-dev@cisco.com");
+
+ /* All remaps have been performed. */
+ lm->n_adjacency_remaps = 0;
+}
+
+void ip6_delete_matching_routes (ip6_main_t * im,
+ u32 table_index_or_table_id,
+ u32 flags,
+ ip6_address_t * address,
+ u32 address_length)
+{
+ /* $$$$ static may be OK - this should happen only on thread 0 */
+ static ip6_address_t * matching_addresses;
+ static u8 * matching_address_lengths;
+ u32 l, i;
+ ip6_add_del_route_args_t a;
+
+ vlib_smp_unsafe_warning();
+
+ a.flags = IP6_ROUTE_FLAG_DEL | IP6_ROUTE_FLAG_NO_REDISTRIBUTE | flags;
+ a.table_index_or_table_id = table_index_or_table_id;
+ a.adj_index = ~0;
+ a.add_adj = 0;
+ a.n_add_adj = 0;
+
+ for (l = address_length + 1; l <= 128; l++)
+ {
+ ip6_foreach_matching_route (im, table_index_or_table_id, flags,
+ address,
+ l,
+ &matching_addresses,
+ &matching_address_lengths);
+ for (i = 0; i < vec_len (matching_addresses); i++)
+ {
+ a.dst_address = matching_addresses[i];
+ a.dst_address_length = matching_address_lengths[i];
+ ip6_add_del_route (im, &a);
+ }
+ }
+
+ ip6_maybe_remap_adjacencies (im, table_index_or_table_id, flags);
+}
+
+static uword
+ip6_lookup (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+ u32 n_left_from, n_left_to_next, * from, * to_next;
+ ip_lookup_next_t next;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ u32 pi0, pi1, adj_index0, adj_index1, wrong_next;
+ ip_lookup_next_t next0, next1;
+ ip6_header_t * ip0, * ip1;
+ ip_adjacency_t * adj0, * adj1;
+ u32 fib_index0, fib_index1;
+ u32 flow_hash_config0, flow_hash_config1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
+
+ fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+ fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
+
+ adj_index0 = ip6_fib_lookup_with_table (im, fib_index0,
+ &ip0->dst_address);
+ adj_index1 = ip6_fib_lookup_with_table (im, fib_index1,
+ &ip1->dst_address);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+ if (PREDICT_FALSE (adj0->explicit_fib_index != ~0))
+ {
+ adj_index0 = ip6_fib_lookup_with_table
+ (im, adj0->explicit_fib_index, &ip0->dst_address);
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ }
+ if (PREDICT_FALSE (adj1->explicit_fib_index != ~0))
+ {
+ adj_index1 = ip6_fib_lookup_with_table
+ (im, adj1->explicit_fib_index, &ip1->dst_address);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+ }
+
+ next0 = adj0->lookup_next_index;
+ next1 = adj1->lookup_next_index;
+
+ /* Process hop-by-hop options if present */
+ next0 = (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) ?
+ IP_LOOKUP_NEXT_HOP_BY_HOP : next0;
+ next1 = (ip1->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) ?
+ IP_LOOKUP_NEXT_HOP_BY_HOP : next1;
+
+ vnet_buffer (p0)->ip.flow_hash =
+ vnet_buffer(p1)->ip.flow_hash = 0;
+
+ if (PREDICT_FALSE(adj0->n_adj > 1))
+ {
+ flow_hash_config0 =
+ vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config;
+ vnet_buffer (p0)->ip.flow_hash =
+ ip6_compute_flow_hash (ip0, flow_hash_config0);
+ }
+
+ if (PREDICT_FALSE(adj1->n_adj > 1))
+ {
+ flow_hash_config1 =
+ vec_elt_at_index (im->fibs,fib_index1)->flow_hash_config;
+
+ vnet_buffer (p1)->ip.flow_hash =
+ ip6_compute_flow_hash (ip1, flow_hash_config1);
+ }
+
+ ASSERT (adj0->n_adj > 0);
+ ASSERT (adj1->n_adj > 0);
+ ASSERT (is_pow2 (adj0->n_adj));
+ ASSERT (is_pow2 (adj1->n_adj));
+ adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+ adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1));
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+ vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index0, 1,
+ vlib_buffer_length_in_chain (vm, p0));
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index1, 1,
+ vlib_buffer_length_in_chain (vm, p1));
+
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ wrong_next = (next0 != next) + 2*(next1 != next);
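+ /* Speculative enqueue: both packets were queued to the cached next
+    index; wrong_next records mismatches (bit 0: p0, bit 1: p1) and
+    the switch below repairs the frame. */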
+ if (PREDICT_FALSE (wrong_next != 0))
+ {
+ switch (wrong_next)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = pi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ break;
+
+ case 3:
+ /* A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, pi0);
+ vlib_set_next_frame_buffer (vm, node, next1, pi1);
+ if (next0 == next1)
+ {
+ /* A B B */
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ next = next1;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ u32 pi0, adj_index0;
+ ip_lookup_next_t next0;
+ ip_adjacency_t * adj0;
+ u32 fib_index0, flow_hash_config0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+ fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+
+ adj_index0 = ip6_fib_lookup_with_table (im, fib_index0,
+ &ip0->dst_address);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ if (PREDICT_FALSE (adj0->explicit_fib_index != ~0))
+ {
+ adj_index0 = ip6_fib_lookup_with_table
+ (im, adj0->explicit_fib_index, &ip0->dst_address);
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ }
+
+ next0 = adj0->lookup_next_index;
+ next0 = (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) ?
+ IP_LOOKUP_NEXT_HOP_BY_HOP : next0;
+
+ vnet_buffer (p0)->ip.flow_hash = 0;
+
+ if (PREDICT_FALSE(adj0->n_adj > 1))
+ {
+ flow_hash_config0 =
+ vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config;
+ vnet_buffer (p0)->ip.flow_hash =
+ ip6_compute_flow_hash (ip0, flow_hash_config0);
+ }
+
+ ASSERT (adj0->n_adj > 0);
+ ASSERT (is_pow2 (adj0->n_adj));
+ adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+ vlib_increment_combined_counter
+ (cm, cpu_index, adj_index0, 1,
+ vlib_buffer_length_in_chain (vm, p0));
+
+ from += 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_left_from -= 1;
+
+ if (PREDICT_FALSE (next0 != next))
+ {
+ n_left_to_next += 1;
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ next = next0;
+ vlib_get_next_frame (vm, node, next,
+ to_next, n_left_to_next);
+ to_next[0] = pi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+void ip6_adjacency_set_interface_route (vnet_main_t * vnm,
+ ip_adjacency_t * adj,
+ u32 sw_if_index,
+ u32 if_address_index)
+{
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ ip_lookup_next_t n;
+ u32 node_index;
+
+ if (hw->hw_class_index == ethernet_hw_interface_class.index
+ || hw->hw_class_index == srp_hw_interface_class.index)
+ {
+ n = IP_LOOKUP_NEXT_ARP;
+ node_index = ip6_discover_neighbor_node.index;
+ adj->if_address_index = if_address_index;
+ }
+ else
+ {
+ n = IP_LOOKUP_NEXT_REWRITE;
+ node_index = ip6_rewrite_node.index;
+ }
+
+ adj->lookup_next_index = n;
+ adj->explicit_fib_index = ~0;
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_IP6,
+ sw_if_index,
+ node_index,
+ VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST,
+ &adj->rewrite_header,
+ sizeof (adj->rewrite_data));
+}
+
+static void
+ip6_add_interface_routes (vnet_main_t * vnm, u32 sw_if_index,
+ ip6_main_t * im, u32 fib_index,
+ ip_interface_address_t * a)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_adjacency_t * adj;
+ ip6_address_t * address = ip_interface_address_get_address (lm, a);
+ ip6_add_del_route_args_t x;
+ vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ u32 classify_table_index;
+
+ /* Add e.g. 2001:db8::/64 as interface route (neighbor discovery for Ethernet). */
+ x.table_index_or_table_id = fib_index;
+ x.flags = (IP6_ROUTE_FLAG_ADD
+ | IP6_ROUTE_FLAG_FIB_INDEX
+ | IP6_ROUTE_FLAG_NO_REDISTRIBUTE);
+ x.dst_address = address[0];
+ x.dst_address_length = a->address_length;
+ x.n_add_adj = 0;
+ x.add_adj = 0;
+
+ a->neighbor_probe_adj_index = ~0;
+ if (a->address_length < 128)
+ {
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &x.adj_index);
+ ip6_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool);
+ ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
+ ip6_add_del_route (im, &x);
+ a->neighbor_probe_adj_index = x.adj_index;
+ }
+
+ /* Add e.g. ::1/128 as local to this host. */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &x.adj_index);
+
+ classify_table_index = ~0;
+ if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index))
+ classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index];
+ if (classify_table_index != (u32) ~0)
+ {
+ adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
+ adj->classify_table_index = classify_table_index;
+ }
+ else
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+
+ adj->if_address_index = a - lm->if_address_pool;
+ adj->rewrite_header.sw_if_index = sw_if_index;
+ adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX];
+ adj->rewrite_header.data_bytes = 0;
+ ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0);
+ x.dst_address_length = 128;
+ ip6_add_del_route (im, &x);
+}
+
+static void
+ip6_del_interface_routes (ip6_main_t * im, u32 fib_index,
+ ip6_address_t * address, u32 address_length)
+{
+ ip6_add_del_route_args_t x;
+
+ /* Delete e.g. 2001:db8::/64 interface route. */
+ x.table_index_or_table_id = fib_index;
+ x.flags = (IP6_ROUTE_FLAG_DEL
+ | IP6_ROUTE_FLAG_FIB_INDEX
+ | IP6_ROUTE_FLAG_NO_REDISTRIBUTE);
+ x.dst_address = address[0];
+ x.dst_address_length = address_length;
+ x.adj_index = ~0;
+ x.n_add_adj = 0;
+ x.add_adj = 0;
+
+ if (address_length < 128)
+ {
+ /* Don't wipe out fe80::0/64 */
+ if (address_length != 64 ||
+ address[0].as_u64[0] != clib_net_to_host_u64(0xfe80000000000000ULL))
+ ip6_add_del_route (im, &x);
+ }
+
+ x.dst_address_length = 128;
+ ip6_add_del_route (im, &x);
+
+ ip6_delete_matching_routes (im,
+ fib_index,
+ IP6_ROUTE_FLAG_FIB_INDEX,
+ address,
+ address_length);
+}
+
+typedef struct {
+ u32 sw_if_index;
+ ip6_address_t address;
+ u32 length;
+} ip6_interface_address_t;
+
+static clib_error_t *
+ip6_add_del_interface_address_internal (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * new_address,
+ u32 new_length,
+ u32 redistribute,
+ u32 insert_routes,
+ u32 is_del);
+
+static clib_error_t *
+ip6_add_del_interface_address_internal (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * address,
+ u32 address_length,
+ u32 redistribute,
+ u32 insert_routes,
+ u32 is_del)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ clib_error_t * error;
+ u32 if_address_index;
+ ip6_address_fib_t ip6_af, * addr_fib = 0;
+
+ vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+ ip6_addr_fib_init (&ip6_af, address,
+ vec_elt (im->fib_index_by_sw_if_index, sw_if_index));
+ vec_add1 (addr_fib, ip6_af);
+
+ {
+ uword elts_before = pool_elts (lm->if_address_pool);
+
+ error = ip_interface_address_add_del
+ (lm,
+ sw_if_index,
+ addr_fib,
+ address_length,
+ is_del,
+ &if_address_index);
+ if (error)
+ goto done;
+
+ /* Pool did not grow: attempted to add a duplicate address; nothing to do. */
+ if (elts_before == pool_elts (lm->if_address_pool))
+ goto done;
+ }
+
+ if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes)
+ {
+ if (is_del)
+ ip6_del_interface_routes (im, ip6_af.fib_index, address,
+ address_length);
+
+ else
+ ip6_add_interface_routes (vnm, sw_if_index,
+ im, ip6_af.fib_index,
+ pool_elt_at_index (lm->if_address_pool, if_address_index));
+ }
+
+ {
+ ip6_add_del_interface_address_callback_t * cb;
+ vec_foreach (cb, im->add_del_interface_address_callbacks)
+ cb->function (im, cb->function_opaque, sw_if_index,
+ address, address_length,
+ if_address_index,
+ is_del);
+ }
+
+ done:
+ vec_free (addr_fib);
+ return error;
+}
+
+clib_error_t *
+ip6_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index,
+ ip6_address_t * address, u32 address_length,
+ u32 is_del)
+{
+ return ip6_add_del_interface_address_internal
+ (vm, sw_if_index, address, address_length,
+ /* redistribute */ 1,
+ /* insert_routes */ 1,
+ is_del);
+}
+
+clib_error_t *
+ip6_sw_interface_admin_up_down (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 flags)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_interface_address_t * ia;
+ ip6_address_t * a;
+ u32 is_admin_up, fib_index;
+
+ /* Fill in lookup tables with default table (0). */
+ vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+
+ vec_validate_init_empty (im->lookup_main.if_address_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+
+ fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index);
+
+ foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index,
+ 0 /* honor unnumbered */,
+ ({
+ a = ip_interface_address_get_address (&im->lookup_main, ia);
+ if (is_admin_up)
+ ip6_add_interface_routes (vnm, sw_if_index,
+ im, fib_index,
+ ia);
+ else
+ ip6_del_interface_routes (im, fib_index,
+ a, ia->address_length);
+ }));
+
+ return 0;
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip6_sw_interface_admin_up_down);
+
+clib_error_t *
+ip6_sw_interface_add_del (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 is_add)
+{
+ vlib_main_t * vm = vnm->vlib_main;
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 ci, cast;
+
+ for (cast = 0; cast < VNET_N_CAST; cast++)
+ {
+ ip_config_main_t * cm = &lm->rx_config_mains[cast];
+ vnet_config_main_t * vcm = &cm->config_main;
+
+ /* FIXME multicast. */
+ if (! vcm->node_index_by_feature_index)
+ {
+ char * start_nodes[] = { "ip6-input", };
+ char * feature_nodes[] = {
+ [IP6_RX_FEATURE_CHECK_ACCESS] = "ip6-inacl",
+ [IP6_RX_FEATURE_IPSEC] = "ipsec-input-ip6",
+ [IP6_RX_FEATURE_L2TPV3] = "l2tp-decap",
+ [IP6_RX_FEATURE_VPATH] = "vpath-input-ip6",
+ [IP6_RX_FEATURE_LOOKUP] = "ip6-lookup",
+ };
+ vnet_config_init (vm, vcm,
+ start_nodes, ARRAY_LEN (start_nodes),
+ feature_nodes, ARRAY_LEN (feature_nodes));
+ }
+
+ vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0);
+ ci = cm->config_index_by_sw_if_index[sw_if_index];
+
+ if (is_add)
+ ci = vnet_config_add_feature (vm, vcm,
+ ci,
+ IP6_RX_FEATURE_LOOKUP,
+ /* config data */ 0,
+ /* # bytes of config data */ 0);
+ else
+ ci = vnet_config_del_feature (vm, vcm,
+ ci,
+ IP6_RX_FEATURE_LOOKUP,
+ /* config data */ 0,
+ /* # bytes of config data */ 0);
+
+ cm->config_index_by_sw_if_index[sw_if_index] = ci;
+ }
+ return /* no error */ 0;
+}
+
+VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip6_sw_interface_add_del);
+
+VLIB_REGISTER_NODE (ip6_lookup_node) = {
+ .function = ip6_lookup,
+ .name = "ip6-lookup",
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip6-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip6-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip6-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip6-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip6-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd",
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop",
+ },
+};
+
+typedef struct {
+ /* Adjacency taken. */
+ u32 adj_index;
+ u32 flow_hash;
+
+ /* Packet data, possibly *after* rewrite. */
+ u8 packet_data[64 - 1*sizeof(u32)];
+} ip6_forward_next_trace_t;
+
+static u8 * format_ip6_forward_next_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip6_forward_next_trace_t * t = va_arg (*args, ip6_forward_next_trace_t *);
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip_adjacency_t * adj;
+ uword indent = format_get_indent (s);
+
+ adj = ip_get_adjacency (&im->lookup_main, t->adj_index);
+ s = format (s, "adjacency: %U flow hash: 0x%08x",
+ format_ip_adjacency,
+ vnm, &im->lookup_main, t->adj_index, t->flow_hash);
+ switch (adj->lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_REWRITE:
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_ip_adjacency_packet_data,
+ vnm, &im->lookup_main, t->adj_index,
+ t->packet_data, sizeof (t->packet_data));
+ break;
+
+ default:
+ break;
+ }
+
+ return s;
+}
+
+/* Common trace function for all ip6-forward next nodes. */
+void
+ip6_forward_next_trace (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ vlib_rx_or_tx_t which_adj_index)
+{
+ u32 * from, n_left;
+
+ n_left = frame->n_vectors;
+ from = vlib_frame_vector_args (frame);
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ ip6_forward_next_trace_t * t0, * t1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+ vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+ bi0 = from[0];
+ bi1 = from[1];
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
+ t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
+ memcpy (t0->packet_data,
+ vlib_buffer_get_current (b0),
+ sizeof (t0->packet_data));
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+ t1->adj_index = vnet_buffer (b1)->ip.adj_index[which_adj_index];
+ t1->flow_hash = vnet_buffer (b1)->ip.flow_hash;
+ memcpy (t1->packet_data,
+ vlib_buffer_get_current (b1),
+ sizeof (t1->packet_data));
+ }
+ from += 2;
+ n_left -= 2;
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip6_forward_next_trace_t * t0;
+
+ bi0 = from[0];
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index];
+ t0->flow_hash = vnet_buffer (b0)->ip.flow_hash;
+ memcpy (t0->packet_data,
+ vlib_buffer_get_current (b0),
+ sizeof (t0->packet_data));
+ }
+ from += 1;
+ n_left -= 1;
+ }
+}
+
+static uword
+ip6_drop_or_punt (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ ip6_error_t error_code)
+{
+ u32 * buffers = vlib_frame_vector_args (frame);
+ uword n_packets = frame->n_vectors;
+
+ vlib_error_drop_buffers (vm, node,
+ buffers,
+ /* stride */ 1,
+ n_packets,
+ /* next */ 0,
+ ip6_input_node.index,
+ error_code);
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip6_forward_next_trace (vm, node, frame, VLIB_TX);
+
+ return n_packets;
+}
+
+static uword
+ip6_drop (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_ADJACENCY_DROP); }
+
+static uword
+ip6_punt (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_ADJACENCY_PUNT); }
+
+static uword
+ip6_miss (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_DST_LOOKUP_MISS); }
+
+VLIB_REGISTER_NODE (ip6_drop_node,static) = {
+ .function = ip6_drop,
+ .name = "ip6-drop",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_punt_node,static) = {
+ .function = ip6_punt,
+ .name = "ip6-punt",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-punt",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_miss_node,static) = {
+ .function = ip6_miss,
+ .name = "ip6-miss",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_multicast_node,static) = {
+ .function = ip6_drop,
+ .name = "ip6-multicast",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+/* Compute TCP/UDP/ICMP6 checksum in software. */
+u16 ip6_tcp_udp_icmp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, ip6_header_t * ip0, int *bogus_lengthp)
+{
+ ip_csum_t sum0;
+ u16 sum16, payload_length_host_byte_order;
+ u32 i, n_this_buffer, n_bytes_left;
+ u32 headers_size = sizeof(ip0[0]);
+ void * data_this_buffer;
+
+ ASSERT(bogus_lengthp);
+ *bogus_lengthp = 0;
+
+ /* Initialize checksum with ip header. */
+ sum0 = ip0->payload_length + clib_host_to_net_u16 (ip0->protocol);
+ payload_length_host_byte_order = clib_net_to_host_u16 (ip0->payload_length);
+ data_this_buffer = (void *) (ip0 + 1);
+
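+  /* Fold in the rest of the IPv6 pseudo-header (RFC 2460 section 8.1):
+     payload length and next-header went in above; now add the source
+     and destination addresses, read as native machine words. */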
+ for (i = 0; i < ARRAY_LEN (ip0->src_address.as_uword); i++)
+ {
+ sum0 = ip_csum_with_carry (sum0,
+ clib_mem_unaligned (&ip0->src_address.as_uword[i], uword));
+ sum0 = ip_csum_with_carry (sum0,
+ clib_mem_unaligned (&ip0->dst_address.as_uword[i], uword));
+ }
+
+ /* some icmp packets may come with a "router alert" hop-by-hop extension header (e.g., mldv2 packets) */
+ if (PREDICT_FALSE (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS))
+ {
+ u32 skip_bytes;
+ ip6_hop_by_hop_ext_t *ext_hdr = (ip6_hop_by_hop_ext_t *)data_this_buffer;
+
+ /* validate really icmp6 next */
+ ASSERT(ext_hdr->next_hdr == IP_PROTOCOL_ICMP6);
+
+      skip_bytes = 8 * (1 + ext_hdr->n_data_u64s);
+ data_this_buffer = (void *)((u8 *)data_this_buffer + skip_bytes);
+
+ payload_length_host_byte_order -= skip_bytes;
+ headers_size += skip_bytes;
+ }
+
+ n_bytes_left = n_this_buffer = payload_length_host_byte_order;
+#if DPDK > 0
+ if (p0)
+ {
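+      /* In DPDK builds the rte_mbuf header immediately precedes the
+         vlib_buffer_t, so back up one mbuf and walk the segment chain. */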
+ struct rte_mbuf *mb = ((struct rte_mbuf *)p0)-1;
+ u8 nb_segs = mb->nb_segs;
+
+ n_this_buffer = (p0->current_length > headers_size ?
+ p0->current_length - headers_size : 0);
+ while (n_bytes_left)
+ {
+ sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
+ n_bytes_left -= n_this_buffer;
+
+ mb = mb->next;
+ nb_segs--;
+ if ((nb_segs == 0) || (mb == 0))
+ break;
+
+ data_this_buffer = rte_ctrlmbuf_data(mb);
+ n_this_buffer = mb->data_len;
+ }
+ if (n_bytes_left || nb_segs)
+ {
+ *bogus_lengthp = 1;
+ return 0xfefe;
+ }
+ }
+ else sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
+#else
+ if (p0 && n_this_buffer + headers_size > p0->current_length)
+ n_this_buffer = p0->current_length > headers_size ? p0->current_length - headers_size : 0;
+ while (1)
+ {
+ sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer);
+ n_bytes_left -= n_this_buffer;
+ if (n_bytes_left == 0)
+ break;
+
+ if (!(p0->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ *bogus_lengthp = 1;
+ return 0xfefe;
+ }
+ p0 = vlib_get_buffer (vm, p0->next_buffer);
+ data_this_buffer = vlib_buffer_get_current (p0);
+ n_this_buffer = p0->current_length;
+ }
+#endif /* DPDK */
+
+ sum16 = ~ ip_csum_fold (sum0);
+
+ return sum16;
+}
+
+u32 ip6_tcp_udp_icmp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0)
+{
+ ip6_header_t * ip0 = vlib_buffer_get_current (p0);
+ udp_header_t * udp0;
+ u16 sum16;
+ int bogus_length;
+
+ /* some icmp packets may come with a "router alert" hop-by-hop extension header (e.g., mldv2 packets) */
+ ASSERT (ip0->protocol == IP_PROTOCOL_TCP
+ || ip0->protocol == IP_PROTOCOL_ICMP6
+ || ip0->protocol == IP_PROTOCOL_UDP
+ || ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS);
+
+ udp0 = (void *) (ip0 + 1);
+ if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0)
+ {
+ p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
+ | IP_BUFFER_L4_CHECKSUM_CORRECT);
+ return p0->flags;
+ }
+
+ sum16 = ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0, &bogus_length);
+
+ p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED
+ | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT));
+
+ return p0->flags;
+}
+
+static uword
+ip6_local (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_local_next_t next_index;
+ u32 * from, * to_next, n_left_from, n_left_to_next;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_input_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip6_forward_next_trace (vm, node, frame, VLIB_TX);
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip6_header_t * ip0, * ip1;
+ udp_header_t * udp0, * udp1;
+ u32 pi0, ip_len0, udp_len0, flags0, next0;
+ u32 pi1, ip_len1, udp_len1, flags1, next1;
+ i32 len_diff0, len_diff1;
+ u8 error0, type0, good_l4_checksum0;
+ u8 error1, type1, good_l4_checksum1;
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol];
+ type1 = lm->builtin_protocol_by_ip_protocol[ip1->protocol];
+
+ next0 = lm->local_next_by_ip_protocol[ip0->protocol];
+ next1 = lm->local_next_by_ip_protocol[ip1->protocol];
+
+ flags0 = p0->flags;
+ flags1 = p1->flags;
+
+ good_l4_checksum0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ good_l4_checksum1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+
+ udp0 = ip6_next_header (ip0);
+ udp1 = ip6_next_header (ip1);
+
+ /* Don't verify UDP checksum for packets with explicit zero checksum. */
+ good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP && udp0->checksum == 0;
+ good_l4_checksum1 |= type1 == IP_BUILTIN_PROTOCOL_UDP && udp1->checksum == 0;
+
+ good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN;
+ good_l4_checksum1 |= type1 == IP_BUILTIN_PROTOCOL_UNKNOWN;
+
+ /* Verify UDP length. */
+ ip_len0 = clib_net_to_host_u16 (ip0->payload_length);
+ ip_len1 = clib_net_to_host_u16 (ip1->payload_length);
+ udp_len0 = clib_net_to_host_u16 (udp0->length);
+ udp_len1 = clib_net_to_host_u16 (udp1->length);
+
+ len_diff0 = ip_len0 - udp_len0;
+ len_diff1 = ip_len1 - udp_len1;
+
+ len_diff0 = type0 == IP_BUILTIN_PROTOCOL_UDP ? len_diff0 : 0;
+ len_diff1 = type1 == IP_BUILTIN_PROTOCOL_UDP ? len_diff1 : 0;
+
+ if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN
+ && ! good_l4_checksum0
+ && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED)))
+ {
+ flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0);
+ good_l4_checksum0 =
+ (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ }
+ if (PREDICT_FALSE (type1 != IP_BUILTIN_PROTOCOL_UNKNOWN
+ && ! good_l4_checksum1
+ && ! (flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED)))
+ {
+ flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, p1);
+ good_l4_checksum1 =
+ (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ }
+
+ error0 = error1 = IP6_ERROR_UNKNOWN_PROTOCOL;
+
+ error0 = len_diff0 < 0 ? IP6_ERROR_UDP_LENGTH : error0;
+ error1 = len_diff1 < 0 ? IP6_ERROR_UDP_LENGTH : error1;
+
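+	  /* The per-protocol checksum errors are laid out contiguously
+	   * (UDP, TCP, ICMP), so IP6_ERROR_UDP_CHECKSUM + type selects
+	   * the matching error; the asserts below pin that layout. */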
+ ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_UDP == IP6_ERROR_UDP_CHECKSUM);
+ ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_TCP == IP6_ERROR_TCP_CHECKSUM);
+ ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_ICMP == IP6_ERROR_ICMP_CHECKSUM);
+ error0 = (! good_l4_checksum0
+ ? IP6_ERROR_UDP_CHECKSUM + type0
+ : error0);
+ error1 = (! good_l4_checksum1
+ ? IP6_ERROR_UDP_CHECKSUM + type1
+ : error1);
+
+	  /* Drop packets from unroutable hosts, but skip the source RPF
+	     check for ICMP (e.g., neighbor solicitations). */
+ if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP)
+ {
+ u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
+ error0 = (lm->miss_adj_index == src_adj_index0
+ ? IP6_ERROR_SRC_LOOKUP_MISS
+ : error0);
+ }
+ if (error1 == IP6_ERROR_UNKNOWN_PROTOCOL && type1 != IP_BUILTIN_PROTOCOL_ICMP)
+ {
+ u32 src_adj_index1 = ip6_src_lookup_for_packet (im, p1, ip1);
+ error1 = (lm->miss_adj_index == src_adj_index1
+ ? IP6_ERROR_SRC_LOOKUP_MISS
+ : error1);
+ }
+
+ next0 = error0 != IP6_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
+ next1 = error1 != IP6_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1;
+
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, pi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ udp_header_t * udp0;
+ u32 pi0, ip_len0, udp_len0, flags0, next0;
+ i32 len_diff0;
+ u8 error0, type0, good_l4_checksum0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol];
+ next0 = lm->local_next_by_ip_protocol[ip0->protocol];
+
+ flags0 = p0->flags;
+
+ good_l4_checksum0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+
+ udp0 = ip6_next_header (ip0);
+
+ /* Don't verify UDP checksum for packets with explicit zero checksum. */
+ good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP && udp0->checksum == 0;
+
+ good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN;
+
+ /* Verify UDP length. */
+ ip_len0 = clib_net_to_host_u16 (ip0->payload_length);
+ udp_len0 = clib_net_to_host_u16 (udp0->length);
+
+ len_diff0 = ip_len0 - udp_len0;
+
+ len_diff0 = type0 == IP_BUILTIN_PROTOCOL_UDP ? len_diff0 : 0;
+
+ if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN
+ && ! good_l4_checksum0
+ && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED)))
+ {
+ flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0);
+ good_l4_checksum0 =
+ (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0;
+ }
+
+ error0 = IP6_ERROR_UNKNOWN_PROTOCOL;
+
+ error0 = len_diff0 < 0 ? IP6_ERROR_UDP_LENGTH : error0;
+
+ ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_UDP == IP6_ERROR_UDP_CHECKSUM);
+ ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_TCP == IP6_ERROR_TCP_CHECKSUM);
+ ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_ICMP == IP6_ERROR_ICMP_CHECKSUM);
+ error0 = (! good_l4_checksum0
+ ? IP6_ERROR_UDP_CHECKSUM + type0
+ : error0);
+
+	  /* Skip the source RPF check for ICMP (e.g., neighbor solicitations). */
+ if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP)
+ {
+ u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
+ error0 = (lm->miss_adj_index == src_adj_index0
+ ? IP6_ERROR_SRC_LOOKUP_MISS
+ : error0);
+ }
+
+ next0 = error0 != IP6_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0;
+
+ p0->error = error_node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_local_node,static) = {
+ .function = ip6_local,
+ .name = "ip6-local",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = IP_LOCAL_N_NEXT,
+ .next_nodes = {
+ [IP_LOCAL_NEXT_DROP] = "error-drop",
+ [IP_LOCAL_NEXT_PUNT] = "error-punt",
+ // [IP_LOCAL_NEXT_TCP_LOOKUP] = "ip6-tcp-lookup",
+ [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip6-udp-lookup",
+ [IP_LOCAL_NEXT_ICMP] = "ip6-icmp-input",
+ },
+};
+
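+/* Register the node that should receive locally-delivered packets for an
+ * IP protocol; ip6-local dispatches matching packets straight to it.
+ * (Illustrative example: a UDP stack would register its input node for
+ * IP_PROTOCOL_UDP.) */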
+void ip6_register_protocol (u32 protocol, u32 node_index)
+{
+ vlib_main_t * vm = vlib_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol));
+ lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip6_local_node.index, node_index);
+}
+
+typedef enum {
+ IP6_DISCOVER_NEIGHBOR_NEXT_DROP,
+ IP6_DISCOVER_NEIGHBOR_N_NEXT,
+} ip6_discover_neighbor_next_t;
+
+typedef enum {
+ IP6_DISCOVER_NEIGHBOR_ERROR_DROP,
+ IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT,
+} ip6_discover_neighbor_error_t;
+
+static uword
+ip6_discover_neighbor (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 * from, * to_next_drop;
+ uword n_left_from, n_left_to_next_drop;
+ static f64 time_last_seed_change = -1e100;
+ static u32 hash_seeds[3];
+ static uword hash_bitmap[256 / BITS (uword)];
+ f64 time_now;
+ int bogus_length;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip6_forward_next_trace (vm, node, frame, VLIB_TX);
+
+ time_now = vlib_time_now (vm);
+ if (time_now - time_last_seed_change > 1e-3)
+ {
+ uword i;
+ u32 * r = clib_random_buffer_get_data (&vm->random_buffer,
+ sizeof (hash_seeds));
+ for (i = 0; i < ARRAY_LEN (hash_seeds); i++)
+ hash_seeds[i] = r[i];
+
+      /* Mark all hash keys as not yet seen. */
+ for (i = 0; i < ARRAY_LEN (hash_bitmap); i++)
+ hash_bitmap[i] = 0;
+
+ time_last_seed_change = time_now;
+ }
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, IP6_DISCOVER_NEIGHBOR_NEXT_DROP,
+ to_next_drop, n_left_to_next_drop);
+
+ while (n_left_from > 0 && n_left_to_next_drop > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0;
+ uword bm0;
+ ip_adjacency_t * adj0;
+ vnet_hw_interface_t * hw_if0;
+ u32 next0;
+
+ pi0 = from[0];
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX];
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
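+	  /* Hash (sw_if_index, dst address) into a small bitmap of
+	   * recently-seen keys. A hit means a solicitation was already
+	   * generated for this neighbor in the current window (the hash
+	   * seeds, and hence the window, refresh roughly every
+	   * millisecond), so this packet does not generate another one. */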
+ a0 = hash_seeds[0];
+ b0 = hash_seeds[1];
+ c0 = hash_seeds[2];
+
+ sw_if_index0 = adj0->rewrite_header.sw_if_index;
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0;
+
+ a0 ^= sw_if_index0;
+ b0 ^= ip0->dst_address.as_u32[0];
+ c0 ^= ip0->dst_address.as_u32[1];
+
+ hash_v3_mix32 (a0, b0, c0);
+
+ b0 ^= ip0->dst_address.as_u32[2];
+ c0 ^= ip0->dst_address.as_u32[3];
+
+ hash_v3_finalize32 (a0, b0, c0);
+
+	  c0 &= BITS (hash_bitmap) - 1;
+	  m0 = (uword) 1 << (c0 % BITS (uword)); /* bit within word */
+	  c0 = c0 / BITS (uword);                /* word index */
+
+ bm0 = hash_bitmap[c0];
+ drop0 = (bm0 & m0) != 0;
+
+ /* Mark it as seen. */
+ hash_bitmap[c0] = bm0 | m0;
+
+ from += 1;
+ n_left_from -= 1;
+ to_next_drop[0] = pi0;
+ to_next_drop += 1;
+ n_left_to_next_drop -= 1;
+
+ hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ /* If the interface is link-down, drop the pkt */
+ if (!(hw_if0->flags & VNET_HW_INTERFACE_FLAG_LINK_UP))
+ drop0 = 1;
+
+ p0->error =
+ node->errors[drop0 ? IP6_DISCOVER_NEIGHBOR_ERROR_DROP
+ : IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT];
+ if (drop0)
+ continue;
+
+ {
+ u32 bi0 = 0;
+ icmp6_neighbor_solicitation_header_t * h0;
+ vlib_buffer_t * b0;
+
+ h0 = vlib_packet_template_get_packet
+ (vm, &im->discover_neighbor_packet_template, &bi0);
+
+	    /*
+	     * Choose the solicitation's source address based on the
+	     * destination-lookup adjacency; the ethernet header itself
+	     * is added below via the adjacency rewrite.
+	     */
+ ip6_src_address_for_packet (im, p0, &h0->ip.src_address,
+ sw_if_index0);
+
+ /*
+ * Destination address is a solicited node multicast address.
+ * We need to fill in
+ * the low 24 bits with low 24 bits of target's address.
+ */
+ h0->ip.dst_address.as_u8[13] = ip0->dst_address.as_u8[13];
+ h0->ip.dst_address.as_u8[14] = ip0->dst_address.as_u8[14];
+ h0->ip.dst_address.as_u8[15] = ip0->dst_address.as_u8[15];
+
+ h0->neighbor.target_address = ip0->dst_address;
+
+ memcpy (h0->link_layer_option.ethernet_address,
+ hw_if0->hw_address, vec_len (hw_if0->hw_address));
+
+ /* $$$$ appears we need this; why is the checksum non-zero? */
+ h0->neighbor.icmp.checksum = 0;
+ h0->neighbor.icmp.checksum =
+ ip6_tcp_udp_icmp_compute_checksum (vm, 0, &h0->ip,
+ &bogus_length);
+
+ ASSERT (bogus_length == 0);
+
+ vlib_buffer_copy_trace_flag (vm, p0, bi0);
+ b0 = vlib_get_buffer (vm, bi0);
+ vnet_buffer (b0)->sw_if_index[VLIB_TX]
+ = vnet_buffer (p0)->sw_if_index[VLIB_TX];
+
+ /* Add rewrite/encap string. */
+ vnet_rewrite_one_header (adj0[0], h0,
+ sizeof (ethernet_header_t));
+ vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
+
+ /* $$$$ hack in case next0 == 0 */
+ b0->error = node->errors[IP6_DISCOVER_NEIGHBOR_ERROR_DROP];
+ next0 =
+ vec_elt (im->discover_neighbor_next_index_by_hw_if_index,
+ hw_if0->hw_if_index);
+
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ }
+ }
+
+ vlib_put_next_frame (vm, node, IP6_DISCOVER_NEIGHBOR_NEXT_DROP,
+ n_left_to_next_drop);
+ }
+
+ return frame->n_vectors;
+}
+
+static char * ip6_discover_neighbor_error_strings[] = {
+ [IP6_DISCOVER_NEIGHBOR_ERROR_DROP] = "address overflow drops",
+ [IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT]
+ = "neighbor solicitations sent",
+};
+
+VLIB_REGISTER_NODE (ip6_discover_neighbor_node) = {
+ .function = ip6_discover_neighbor,
+ .name = "ip6-discover-neighbor",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_errors = ARRAY_LEN (ip6_discover_neighbor_error_strings),
+ .error_strings = ip6_discover_neighbor_error_strings,
+
+ .n_next_nodes = IP6_DISCOVER_NEIGHBOR_N_NEXT,
+ .next_nodes = {
+ [IP6_DISCOVER_NEIGHBOR_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *
+ip6_discover_neighbor_hw_interface_link_up_down (vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 flags)
+{
+ vlib_main_t * vm = vnm->vlib_main;
+ ip6_main_t * im = &ip6_main;
+ vnet_hw_interface_t * hw_if;
+
+ hw_if = vnet_get_hw_interface (vnm, hw_if_index);
+
+ vec_validate_init_empty
+ (im->discover_neighbor_next_index_by_hw_if_index, hw_if_index, 0);
+ im->discover_neighbor_next_index_by_hw_if_index[hw_if_index]
+ = vlib_node_add_next (vm, ip6_discover_neighbor_node.index,
+ hw_if->output_node_index);
+ return 0;
+}
+
+VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION
+(ip6_discover_neighbor_hw_interface_link_up_down);
+
+clib_error_t *
+ip6_probe_neighbor (vlib_main_t * vm, ip6_address_t * dst, u32 sw_if_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ icmp6_neighbor_solicitation_header_t * h;
+ ip6_address_t * src;
+ ip_interface_address_t * ia;
+ ip_adjacency_t * adj;
+ vnet_hw_interface_t * hi;
+ vnet_sw_interface_t * si;
+ vlib_buffer_t * b;
+ u32 bi = 0;
+ int bogus_length;
+
+ si = vnet_get_sw_interface (vnm, sw_if_index);
+
+ if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+ {
+ return clib_error_return (0, "%U: interface %U down",
+ format_ip6_address, dst,
+ format_vnet_sw_if_index_name, vnm,
+ sw_if_index);
+ }
+
+ src = ip6_interface_address_matching_destination (im, dst, sw_if_index, &ia);
+ if (! src)
+ {
+ vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE;
+ return clib_error_return
+ (0, "no matching interface address for destination %U (interface %U)",
+ format_ip6_address, dst,
+ format_vnet_sw_if_index_name, vnm, sw_if_index);
+ }
+
+ h = vlib_packet_template_get_packet (vm, &im->discover_neighbor_packet_template, &bi);
+
+ hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ /* Destination address is a solicited node multicast address. We need to fill in
+ the low 24 bits with low 24 bits of target's address. */
+ h->ip.dst_address.as_u8[13] = dst->as_u8[13];
+ h->ip.dst_address.as_u8[14] = dst->as_u8[14];
+ h->ip.dst_address.as_u8[15] = dst->as_u8[15];
+
+ h->ip.src_address = src[0];
+ h->neighbor.target_address = dst[0];
+
+ memcpy (h->link_layer_option.ethernet_address, hi->hw_address, vec_len (hi->hw_address));
+
+ h->neighbor.icmp.checksum =
+ ip6_tcp_udp_icmp_compute_checksum (vm, 0, &h->ip, &bogus_length);
+ ASSERT(bogus_length == 0);
+
+ b = vlib_get_buffer (vm, bi);
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
+
+ /* Add encapsulation string for software interface (e.g. ethernet header). */
+ adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index);
+ vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t));
+ vlib_buffer_advance (b, -adj->rewrite_header.data_bytes);
+
+ {
+ vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
+ u32 * to_next = vlib_frame_vector_args (f);
+ to_next[0] = bi;
+ f->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi->output_node_index, f);
+ }
+
+ return /* no error */ 0;
+}
+
+typedef enum {
+ IP6_REWRITE_NEXT_DROP,
+} ip6_rewrite_next_t;
+
+always_inline uword
+ip6_rewrite_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ int rewrite_for_locally_received_packets)
+{
+ ip_lookup_main_t * lm = &ip6_main.lookup_main;
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_input_node.index);
+ vlib_rx_or_tx_t adj_rx_tx = rewrite_for_locally_received_packets ? VLIB_RX : VLIB_TX;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ u32 cpu_index = os_get_cpu_number();
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ ip_adjacency_t * adj0, * adj1;
+ vlib_buffer_t * p0, * p1;
+ ip6_header_t * ip0, * ip1;
+ u32 pi0, rw_len0, next0, error0, adj_index0;
+ u32 pi1, rw_len1, next1, error1, adj_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->pre_data, 32, STORE);
+ CLIB_PREFETCH (p3->pre_data, 32, STORE);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE);
+ CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
+ adj_index1 = vnet_buffer (p1)->ip.adj_index[adj_rx_tx];
+
+ /* We should never rewrite a pkt using the MISS adjacency */
+ ASSERT(adj_index0 && adj_index1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ error0 = error1 = IP6_ERROR_NONE;
+
+ if (! rewrite_for_locally_received_packets)
+ {
+ i32 hop_limit0 = ip0->hop_limit, hop_limit1 = ip1->hop_limit;
+
+	      /* Input node should have rejected packets with hop limit 0. */
+ ASSERT (ip0->hop_limit > 0);
+ ASSERT (ip1->hop_limit > 0);
+
+ hop_limit0 -= 1;
+ hop_limit1 -= 1;
+
+ ip0->hop_limit = hop_limit0;
+ ip1->hop_limit = hop_limit1;
+
+ error0 = hop_limit0 <= 0 ? IP6_ERROR_TIME_EXPIRED : error0;
+ error1 = hop_limit1 <= 0 ? IP6_ERROR_TIME_EXPIRED : error1;
+ }
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ adj1 = ip_get_adjacency (lm, adj_index1);
+
+ if (rewrite_for_locally_received_packets)
+ {
+ /*
+ * If someone sends e.g. an icmp6 w/ src = dst = interface addr,
+ * we end up here with a local adjacency in hand
+ */
+ if (PREDICT_FALSE(adj0->lookup_next_index
+ == IP_LOOKUP_NEXT_LOCAL))
+ error0 = IP6_ERROR_SPOOFED_LOCAL_PACKETS;
+ if (PREDICT_FALSE(adj1->lookup_next_index
+ == IP_LOOKUP_NEXT_LOCAL))
+ error1 = IP6_ERROR_SPOOFED_LOCAL_PACKETS;
+ }
+
+ rw_len0 = adj0[0].rewrite_header.data_bytes;
+ rw_len1 = adj1[0].rewrite_header.data_bytes;
+
+ vlib_increment_combined_counter (&lm->adjacency_counters,
+ cpu_index,
+ adj_index0,
+ /* packet increment */ 0,
+ /* byte increment */ rw_len0);
+ vlib_increment_combined_counter (&lm->adjacency_counters,
+ cpu_index,
+ adj_index1,
+ /* packet increment */ 0,
+ /* byte increment */ rw_len1);
+
+ /* Check MTU of outgoing interface. */
+ error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes
+ ? IP6_ERROR_MTU_EXCEEDED
+ : error0);
+ error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes
+ ? IP6_ERROR_MTU_EXCEEDED
+ : error1);
+
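+	      /* Prepend the rewrite: back current_data up into the
+	       * buffer's pre_data area and grow current_length to match. */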
+ p0->current_data -= rw_len0;
+ p1->current_data -= rw_len1;
+
+ p0->current_length += rw_len0;
+ p1->current_length += rw_len1;
+
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = adj0[0].rewrite_header.sw_if_index;
+ vnet_buffer (p1)->sw_if_index[VLIB_TX] = adj1[0].rewrite_header.sw_if_index;
+
+ next0 = (error0 == IP6_ERROR_NONE) ?
+ adj0[0].rewrite_header.next_index : IP6_REWRITE_NEXT_DROP;
+ next1 = (error1 == IP6_ERROR_NONE) ?
+ adj1[0].rewrite_header.next_index : IP6_REWRITE_NEXT_DROP;
+
+	      /* Assume we are writing only a simple Ethernet header. */
+ vnet_rewrite_two_headers (adj0[0], adj1[0],
+ ip0, ip1,
+ sizeof (ethernet_header_t));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, pi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ ip_adjacency_t * adj0;
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ u32 pi0, rw_len0;
+ u32 adj_index0, next0, error0;
+
+ pi0 = to_next[0] = from[0];
+
+ p0 = vlib_get_buffer (vm, pi0);
+
+ adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx];
+
+ /* We should never rewrite a pkt using the MISS adjacency */
+ ASSERT(adj_index0);
+
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+ ip0 = vlib_buffer_get_current (p0);
+
+ error0 = IP6_ERROR_NONE;
+
+ /* Check hop limit */
+ if (! rewrite_for_locally_received_packets)
+ {
+ i32 hop_limit0 = ip0->hop_limit;
+
+ ASSERT (ip0->hop_limit > 0);
+
+ hop_limit0 -= 1;
+
+ ip0->hop_limit = hop_limit0;
+
+ error0 = hop_limit0 <= 0 ? IP6_ERROR_TIME_EXPIRED : error0;
+ }
+
+ if (rewrite_for_locally_received_packets)
+ {
+ if (PREDICT_FALSE(adj0->lookup_next_index
+ == IP_LOOKUP_NEXT_LOCAL))
+ error0 = IP6_ERROR_SPOOFED_LOCAL_PACKETS;
+ }
+
+	      /* Assume we are writing only a simple Ethernet header. */
+ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
+
+ /* Update packet buffer attributes/set output interface. */
+ rw_len0 = adj0[0].rewrite_header.data_bytes;
+
+ vlib_increment_combined_counter (&lm->adjacency_counters,
+ cpu_index,
+ adj_index0,
+ /* packet increment */ 0,
+ /* byte increment */ rw_len0);
+
+ /* Check MTU of outgoing interface. */
+ error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes
+ ? IP6_ERROR_MTU_EXCEEDED
+ : error0);
+
+ p0->current_data -= rw_len0;
+ p0->current_length += rw_len0;
+ vnet_buffer (p0)->sw_if_index[VLIB_TX] = adj0[0].rewrite_header.sw_if_index;
+
+ next0 = (error0 == IP6_ERROR_NONE) ?
+ adj0[0].rewrite_header.next_index : IP6_REWRITE_NEXT_DROP;
+
+ p0->error = error_node->errors[error0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ /* Need to do trace after rewrites to pick up new packet data. */
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ ip6_forward_next_trace (vm, node, frame, adj_rx_tx);
+
+ return frame->n_vectors;
+}
+
+static uword
+ip6_rewrite_transit (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip6_rewrite_inline (vm, node, frame,
+ /* rewrite_for_locally_received_packets */ 0);
+}
+
+static uword
+ip6_rewrite_local (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip6_rewrite_inline (vm, node, frame,
+ /* rewrite_for_locally_received_packets */ 1);
+}
+
+VLIB_REGISTER_NODE (ip6_rewrite_node) = {
+ .function = ip6_rewrite_transit,
+ .name = "ip6-rewrite",
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [IP6_REWRITE_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_rewrite_local_node,static) = {
+ .function = ip6_rewrite_local,
+ .name = "ip6-rewrite-local",
+ .vector_size = sizeof (u32),
+
+ .sibling_of = "ip6-rewrite",
+
+ .format_trace = format_ip6_forward_next_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [IP6_REWRITE_NEXT_DROP] = "error-drop",
+ },
+};
+
+/* Global IP6 main. */
+ip6_main_t ip6_main;
+
+static clib_error_t *
+ip6_lookup_init (vlib_main_t * vm)
+{
+ ip6_main_t * im = &ip6_main;
+ uword i;
+
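+  /* Precompute one 128-bit destination mask per prefix length: for
+     length i, the first i/32 32-bit words are all-ones and the next
+     word keeps its top i%32 bits. */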
+ for (i = 0; i < ARRAY_LEN (im->fib_masks); i++)
+ {
+ u32 j, i0, i1;
+
+ i0 = i / 32;
+ i1 = i % 32;
+
+ for (j = 0; j < i0; j++)
+ im->fib_masks[i].as_u32[j] = ~0;
+
+ if (i1)
+ im->fib_masks[i].as_u32[i0] = clib_host_to_net_u32 (pow2_mask (i1) << (32 - i1));
+ }
+
+ ip_lookup_init (&im->lookup_main, /* is_ip6 */ 1);
+
+ if (im->lookup_table_nbuckets == 0)
+ im->lookup_table_nbuckets = IP6_FIB_DEFAULT_HASH_NUM_BUCKETS;
+
+  im->lookup_table_nbuckets = 1 << max_log2 (im->lookup_table_nbuckets);
+
+ if (im->lookup_table_size == 0)
+ im->lookup_table_size = IP6_FIB_DEFAULT_HASH_MEMORY_SIZE;
+
+ BV(clib_bihash_init) (&im->ip6_lookup_table, "ip6 lookup table",
+ im->lookup_table_nbuckets,
+ im->lookup_table_size);
+
+ /* Create FIB with index 0 and table id of 0. */
+ find_ip6_fib_by_table_index_or_id (im, /* table id */ 0, IP6_ROUTE_FLAG_TABLE_ID);
+
+ {
+ pg_node_t * pn;
+ pn = pg_get_node (ip6_lookup_node.index);
+ pn->unformat_edit = unformat_pg_ip6_header;
+ }
+
+ {
+ icmp6_neighbor_solicitation_header_t p;
+
+ memset (&p, 0, sizeof (p));
+
+ p.ip.ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28);
+ p.ip.payload_length = clib_host_to_net_u16 (sizeof (p)
+ - STRUCT_OFFSET_OF (icmp6_neighbor_solicitation_header_t, neighbor));
+ p.ip.protocol = IP_PROTOCOL_ICMP6;
+ p.ip.hop_limit = 255;
+ ip6_set_solicited_node_multicast_address (&p.ip.dst_address, 0);
+
+ p.neighbor.icmp.type = ICMP6_neighbor_solicitation;
+
+ p.link_layer_option.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address;
+ p.link_layer_option.header.n_data_u64s = sizeof (p.link_layer_option) / sizeof (u64);
+
+ vlib_packet_template_init (vm,
+ &im->discover_neighbor_packet_template,
+ &p, sizeof (p),
+ /* alloc chunk size */ 8,
+ "ip6 neighbor discovery");
+ }
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip6_lookup_init);
+
+static clib_error_t *
+add_del_ip6_interface_table (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index, table_id;
+
+ sw_if_index = ~0;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (unformat (input, "%d", &table_id))
+ ;
+ else
+ {
+ error = clib_error_return (0, "expected table id `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ {
+ ip6_main_t * im = &ip6_main;
+ ip6_fib_t * fib =
+ find_ip6_fib_by_table_index_or_id (im, table_id, IP6_ROUTE_FLAG_TABLE_ID);
+
+ if (fib)
+ {
+ vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+ im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
+ }
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
+ .path = "set interface ip6 table",
+ .function = add_del_ip6_interface_table,
+ .short_help = "set interface ip6 table <intfc> <table-id>"
+};
+
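+/* Build the modified EUI-64 link-local address for a MAC address:
+   fe80::/64 | (mac[0] ^ 2) mac[1] mac[2] ff fe mac[3] mac[4] mac[5].
+   Flipping bit 1 of the first octet toggles the universal/local bit
+   (RFC 4291, appendix A). */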
+void
+ip6_link_local_address_from_ethernet_mac_address (ip6_address_t *ip,
+ u8 *mac)
+{
+ ip->as_u64[0] = clib_host_to_net_u64 (0xFE80000000000000ULL);
+ /* Invert the "u" bit */
+ ip->as_u8 [8] = mac[0] ^ (1<<1);
+ ip->as_u8 [9] = mac[1];
+ ip->as_u8 [10] = mac[2];
+ ip->as_u8 [11] = 0xFF;
+ ip->as_u8 [12] = 0xFE;
+ ip->as_u8 [13] = mac[3];
+ ip->as_u8 [14] = mac[4];
+ ip->as_u8 [15] = mac[5];
+}
+
+void
+ip6_ethernet_mac_address_from_link_local_address (u8 *mac,
+ ip6_address_t *ip)
+{
+ /* Invert the previously inverted "u" bit */
+ mac[0] = ip->as_u8 [8] ^ (1<<1);
+ mac[1] = ip->as_u8 [9];
+ mac[2] = ip->as_u8 [10];
+ mac[3] = ip->as_u8 [13];
+ mac[4] = ip->as_u8 [14];
+ mac[5] = ip->as_u8 [15];
+}
+
+static clib_error_t *
+test_ip6_link_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u8 mac[6];
+ ip6_address_t _a, *a = &_a;
+
+ if (unformat (input, "%U", unformat_ethernet_address, mac))
+ {
+ ip6_link_local_address_from_ethernet_mac_address (a, mac);
+ vlib_cli_output (vm, "Link local address: %U",
+ format_ip6_address, a);
+ ip6_ethernet_mac_address_from_link_local_address (mac, a);
+ vlib_cli_output (vm, "Original MAC address: %U",
+ format_ethernet_address, mac);
+ }
+
+ return 0;
+}
+
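+/* Illustrative example: "test ip6 link 00:11:22:33:44:55" should print
+ *   Link local address: fe80::211:22ff:fe33:4455
+ *   Original MAC address: 00:11:22:33:44:55
+ */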
+VLIB_CLI_COMMAND (test_link_command, static) = {
+ .path = "test ip6 link",
+ .function = test_ip6_link_command_fn,
+ .short_help = "test ip6 link <mac-address>",
+};
+
+int vnet_set_ip6_flow_hash (u32 table_id, u32 flow_hash_config)
+{
+ ip6_main_t * im6 = &ip6_main;
+ ip6_fib_t * fib;
+ uword * p = hash_get (im6->fib_index_by_table_id, table_id);
+
+ if (p == 0)
+ return -1;
+
+ fib = vec_elt_at_index (im6->fibs, p[0]);
+
+ fib->flow_hash_config = flow_hash_config;
+ return 1;
+}
+
+static clib_error_t *
+set_ip6_flow_hash_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int matched = 0;
+ u32 table_id = 0;
+ u32 flow_hash_config = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "table %d", &table_id))
+ matched = 1;
+#define _(a,v) \
+ else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;}
+ foreach_flow_hash_bit
+#undef _
+ else break;
+ }
+
+ if (matched == 0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ rv = vnet_set_ip6_flow_hash (table_id, flow_hash_config);
+ switch (rv)
+ {
+ case 1:
+ break;
+
+ case -1:
+ return clib_error_return (0, "no such FIB table %d", table_id);
+
+ default:
+ clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config);
+ break;
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ip6_flow_hash_command, static) = {
+ .path = "set ip6 flow-hash",
+ .short_help =
+  "set ip6 flow-hash table <fib-id> src dst sport dport proto reverse",
+ .function = set_ip6_flow_hash_command_fn,
+};
+
+static clib_error_t *
+show_ip6_local_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ int i;
+
+ vlib_cli_output (vm, "Protocols handled by ip6_local");
+ for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++)
+ {
+ if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT)
+ vlib_cli_output (vm, "%d", i);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_ip_local, static) = {
+ .path = "show ip6 local",
+ .function = show_ip6_local_command_fn,
+ .short_help = "Show ip6 local protocol table",
+};
+
+int vnet_set_ip6_classify_intfc (vlib_main_t * vm, u32 sw_if_index,
+ u32 table_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ ip6_main_t * ipm = &ip6_main;
+ ip_lookup_main_t * lm = &ipm->lookup_main;
+ vnet_classify_main_t * cm = &vnet_classify_main;
+
+ if (pool_is_free_index (im->sw_interfaces, sw_if_index))
+ return VNET_API_ERROR_NO_MATCHING_INTERFACE;
+
+ if (table_index != ~0 && pool_is_free_index (cm->tables, table_index))
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index);
+ lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index;
+
+ return 0;
+}
+
+static clib_error_t *
+set_ip6_classify_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 table_index = ~0;
+ int table_index_set = 0;
+ u32 sw_if_index = ~0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "table-index %d", &table_index))
+ table_index_set = 1;
+ else if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
+ vnet_get_main(), &sw_if_index))
+ ;
+ else
+ break;
+ }
+
+ if (table_index_set == 0)
+ return clib_error_return (0, "classify table-index must be specified");
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "interface / subif must be specified");
+
+ rv = vnet_set_ip6_classify_intfc (vm, sw_if_index, table_index);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_MATCHING_INTERFACE:
+ return clib_error_return (0, "No such interface");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "No such classifier table");
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ip6_classify_command, static) = {
+ .path = "set ip6 classify",
+ .short_help =
+ "set ip6 classify intfc <int> table-index <index>",
+ .function = set_ip6_classify_command_fn,
+};
+
+static clib_error_t *
+ip6_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ ip6_main_t * im = &ip6_main;
+ uword heapsize = 0;
+ u32 tmp;
+ u32 nbuckets = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "hash-buckets %d", &tmp))
+ nbuckets = tmp;
+ else if (unformat (input, "heap-size %dm", &tmp))
+ heapsize = ((u64)tmp) << 20;
+ else if (unformat (input, "heap-size %dM", &tmp))
+ heapsize = ((u64)tmp) << 20;
+ else if (unformat (input, "heap-size %dg", &tmp))
+ heapsize = ((u64)tmp) << 30;
+ else if (unformat (input, "heap-size %dG", &tmp))
+ heapsize = ((u64)tmp) << 30;
+ else
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, input);
+ }
+
+ im->lookup_table_nbuckets = nbuckets;
+ im->lookup_table_size = heapsize;
+
+ return 0;
+}
+
+VLIB_EARLY_CONFIG_FUNCTION (ip6_config, "ip6");
+
diff --git a/vnet/vnet/ip/ip6_hop_by_hop.c b/vnet/vnet/ip/ip6_hop_by_hop.c
new file mode 100644
index 00000000000..64edfd249c3
--- /dev/null
+++ b/vnet/vnet/ip/ip6_hop_by_hop.c
@@ -0,0 +1,1139 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/error.h>
+#include <vppinfra/elog.h>
+
+#include <vnet/ip/ip6_hop_by_hop.h>
+
+ip6_hop_by_hop_main_t ip6_hop_by_hop_main;
+
+/*
+ * ip6 hop-by-hop option handling. We push pkts with h-b-h options to
+ * ip6_hop_by_hop_node_fn from ip6-lookup at a cost of ~2 clocks/pkt in
+ * the speed path.
+ *
+ * We parse through the h-b-h option TLVs, specifically looking for
+ * HBH_OPTION_TYPE_IOAM_DATA_LIST. [Someone needs to get bananas from
+ * IANA, aka to actually allocate the option TLV codes.]
+ *
+ * If we find the indicated option type, and we have remaining list
+ * elements in the trace list, allocate and populate the trace list
+ * element.
+ *
+ * At the ingress edge: punch in the h-b-h rewrite, then visit the
+ * standard h-b-h option handler. We have to be careful in the standard
+ * h-b-h handler, to avoid looping until we run out of rewrite space.
+ * Ask me how I know that.
+ *
+ * Remaining work:
+ * decide on egress point "pop and count" scheme
+ * time stamp handling: usec since the top of the hour?
+ * configure the node id
+ * trace list application data support
+ * cons up analysis / steering plug-in(s)
+ * add configuration binary APIs, vpe_api_test_support, yang models and
+ * orca code
+ * perf tune: dual loop, replace memcpy w/ N x 8-byte load/stores
+ *
+ */
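+
+/*
+ * For reference, a sketch of the wire layout this node parses (assuming
+ * the structures declared in ip6_hop_by_hop.h):
+ *
+ *   ip6_header_t
+ *   ip6_hop_by_hop_header_t    next header; length in 8-octet units - 1
+ *   option TLVs                { type, length, data... } repeated; the
+ *                              IOAM data-list option carries elts_left
+ *                              plus an array of ioam_data_list_element_t
+ *                              records filled in hop by hop.
+ */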
+
+/*
+ * primary h-b-h handler trace support
+ * We work pretty hard on the problem for obvious reasons
+ */
+typedef struct {
+ u32 next_index;
+ u32 trace_len;
+ u8 option_data[256];
+} ip6_hop_by_hop_trace_t;
+
+static u8 * format_ioam_data_list_element (u8 * s, va_list * args)
+{
+ ioam_data_list_element_t *elt = va_arg (*args, ioam_data_list_element_t *);
+ u32 ttl_node_id_host_byte_order =
+ clib_net_to_host_u32 (elt->ttl_node_id);
+
+ s = format (s, "ttl %d node id %d ingress %d egress %d ts %u",
+ ttl_node_id_host_byte_order>>24,
+ ttl_node_id_host_byte_order & 0x00FFFFFF,
+ elt->ingress_if,
+ elt->egress_if,
+ elt->timestamp);
+ return s;
+}
+
+static u8 * format_ip6_hop_by_hop_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip6_hop_by_hop_trace_t * t = va_arg (*args, ip6_hop_by_hop_trace_t *);
+ ip6_hop_by_hop_header_t *hbh0;
+ ip6_hop_by_hop_option_t *opt0, *limit0;
+ ioam_trace_option_t * trace0;
+ ioam_data_list_element_t * elt0;
+ int elt_index;
+ u8 type0;
+
+ hbh0 = (ip6_hop_by_hop_header_t *)t->option_data;
+
+ s = format (s, "IP6_HOP_BY_HOP: next index %d len %d traced %d\n",
+ t->next_index, (hbh0->length+1)<<3, t->trace_len);
+
+ opt0 = (ip6_hop_by_hop_option_t *) (hbh0+1);
+  limit0 = (ip6_hop_by_hop_option_t *) (((u8 *)hbh0) + t->trace_len);
+
+ while (opt0 < limit0)
+ {
+ type0 = opt0->type & HBH_OPTION_TYPE_MASK;
+ elt_index = 0;
+ switch (type0)
+ {
+ case HBH_OPTION_TYPE_IOAM_DATA_LIST:
+ trace0 = (ioam_trace_option_t *)opt0;
+ s = format (s, " Trace %d elts left\n",
+ trace0->data_list_elts_left);
+ elt0 = &trace0->elts[0];
+ while ((u8 *) elt0 <
+ ((u8 *)(&trace0->elts[0]) + trace0->hdr.length - 1
+ /* -1 accounts for elts_left */))
+ {
+ s = format (s, " [%d] %U\n",elt_index,
+ format_ioam_data_list_element, elt0);
+ elt_index++;
+ elt0++;
+ }
+
+ opt0 = (ip6_hop_by_hop_option_t *)
+ (((u8 *)opt0) + opt0->length
+ + sizeof (ip6_hop_by_hop_option_t));
+ break;
+
+ case HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK:
+ s = format (s, " POW opt present\n");
+ opt0 = (ip6_hop_by_hop_option_t *)
+ (((u8 *)opt0) + sizeof (ioam_pow_option_t));
+ break;
+
+        case 0: /* Pad1: skip one byte and keep scanning */
+          opt0 = (ip6_hop_by_hop_option_t *) (((u8 *)opt0) + 1);
+ break;
+
+ default:
+ s = format (s, "Unknown %d", type0);
+ opt0 = (ip6_hop_by_hop_option_t *)
+ (((u8 *)opt0) + opt0->length
+ + sizeof (ip6_hop_by_hop_option_t));
+ break;
+ }
+ }
+ return s;
+}
+
+vlib_node_registration_t ip6_hop_by_hop_node;
+
+#define foreach_ip6_hop_by_hop_error \
+_(PROCESSED, "Pkts with ip6 hop-by-hop options")
+
+typedef enum {
+#define _(sym,str) IP6_HOP_BY_HOP_ERROR_##sym,
+ foreach_ip6_hop_by_hop_error
+#undef _
+ IP6_HOP_BY_HOP_N_ERROR,
+} ip6_hop_by_hop_error_t;
+
+static char * ip6_hop_by_hop_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip6_hop_by_hop_error
+#undef _
+};
+
+static uword
+ip6_hop_by_hop_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main;
+ u32 n_left_from, * from, * to_next;
+ ip_lookup_next_t next_index;
+ u32 processed = 0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0 /* $$$ DUAL-LOOP ME */
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = IP6_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = IP6_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8 tmp0[6], tmp1[6];
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* $$$$$ Dual loop: process 2 x packets here $$$$$ */
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+ ip0 = vlib_buffer_get_current (b0);
+	  ip1 = vlib_buffer_get_current (b1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* $$$$$ End of processing 2 x packets $$$$$ */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip6_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip6_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 adj_index0;
+ ip6_header_t * ip0;
+ ip_adjacency_t * adj0;
+ ip6_hop_by_hop_header_t *hbh0;
+ ip6_hop_by_hop_option_t *opt0, *limit0;
+ ioam_trace_option_t * trace0;
+ ioam_data_list_element_t * elt0;
+ u8 type0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ ip0 = vlib_buffer_get_current (b0);
+ adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ hbh0 = (ip6_hop_by_hop_header_t *)(ip0+1);
+ opt0 = (ip6_hop_by_hop_option_t *)(hbh0+1);
+ limit0 = (ip6_hop_by_hop_option_t *)
+ ((u8 *)hbh0 + ((hbh0->length+1)<<3));
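+	  /* hbh0->length counts 8-octet units beyond the first 8 octets,
+	   * hence the header spans (length + 1) << 3 bytes in total. */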
+
+ /* Scan the set of h-b-h options, process ones that we understand */
+ while (opt0 < limit0)
+ {
+ type0 = opt0->type & HBH_OPTION_TYPE_MASK;
+ switch (type0)
+ {
+ case HBH_OPTION_TYPE_IOAM_DATA_LIST:
+ trace0 = (ioam_trace_option_t *)opt0;
+ if (PREDICT_TRUE (trace0->data_list_elts_left))
+ {
+ trace0->data_list_elts_left--;
+ elt0 = &trace0->elts[trace0->data_list_elts_left];
+ elt0->ttl_node_id =
+ clib_host_to_net_u32 ((ip0->hop_limit<<24)
+ | hm->node_id);
+ elt0->ingress_if =
+ vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ elt0->egress_if = adj0->rewrite_header.sw_if_index;
+ elt0->timestamp = 123; /* $$$$ */
+ /* $$$ set elt0->app_data */
+ }
+
+ opt0 = (ip6_hop_by_hop_option_t *)
+ (((u8 *)opt0) + opt0->length
+ + sizeof (ip6_hop_by_hop_option_t));
+ break;
+
+ case HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK:
+ opt0 = (ip6_hop_by_hop_option_t *)
+ (((u8 *)opt0) + sizeof (ioam_pow_option_t));
+ break;
+
+	      case 0: /* Pad1: stop scanning */
+		opt0 = (ip6_hop_by_hop_option_t *) (((u8 *)opt0) + 1);
+		goto out0;
+
+	      default: /* Skip unrecognized options so the scan always advances */
+		opt0 = (ip6_hop_by_hop_option_t *)
+		  (((u8 *)opt0) + opt0->length
+		   + sizeof (ip6_hop_by_hop_option_t));
+		break;
+ }
+ }
+
+ out0:
+
+ /*
+ * Since we push pkts here from the h-b-h header imposition code
+ * we have to be careful what we wish for...
+ */
+ next0 = adj0->lookup_next_index != IP_LOOKUP_NEXT_ADD_HOP_BY_HOP ?
+ adj0->lookup_next_index : adj0->saved_lookup_next_index;
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ip6_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ u32 trace_len = (hbh0->length+1)<<3;
+ t->next_index = next0;
+ /* Capture the h-b-h option verbatim */
+ trace_len = trace_len < ARRAY_LEN(t->option_data) ?
+ trace_len : ARRAY_LEN(t->option_data);
+ t->trace_len = trace_len;
+ memcpy (t->option_data, hbh0, trace_len);
+ }
+
+ processed++;
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ip6_hop_by_hop_node.index,
+ IP6_HOP_BY_HOP_ERROR_PROCESSED, processed);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_hop_by_hop_node) = {
+ .function = ip6_hop_by_hop_node_fn,
+ .name = "ip6-hop-by-hop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip6_hop_by_hop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ip6_hop_by_hop_error_strings),
+ .error_strings = ip6_hop_by_hop_error_strings,
+
+ /* See ip/lookup.h */
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip6-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip6-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip6-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip6-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip6-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd",
+ /* Next 3 arcs probably never used */
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop",
+ },
+};
+
+/* The main h-b-h tracer will be invoked, no need to do much here */
+typedef struct {
+ u32 next_index;
+} ip6_add_hop_by_hop_trace_t;
+
+/* packet trace format function */
+static u8 * format_ip6_add_hop_by_hop_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip6_add_hop_by_hop_trace_t * t = va_arg (*args,
+ ip6_add_hop_by_hop_trace_t *);
+
+ s = format (s, "IP6_ADD_HOP_BY_HOP: next index %d",
+ t->next_index);
+ return s;
+}
+
+vlib_node_registration_t ip6_add_hop_by_hop_node;
+
+#define foreach_ip6_add_hop_by_hop_error \
+_(PROCESSED, "Pkts w/ added ip6 hop-by-hop options")
+
+typedef enum {
+#define _(sym,str) IP6_ADD_HOP_BY_HOP_ERROR_##sym,
+ foreach_ip6_add_hop_by_hop_error
+#undef _
+ IP6_ADD_HOP_BY_HOP_N_ERROR,
+} ip6_add_hop_by_hop_error_t;
+
+static char * ip6_add_hop_by_hop_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip6_add_hop_by_hop_error
+#undef _
+};
+
+static uword
+ip6_add_hop_by_hop_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main;
+ u32 n_left_from, * from, * to_next;
+ ip_lookup_next_t next_index;
+ u32 processed = 0;
+ u8 * rewrite = hm->rewrite;
+ u32 rewrite_length = vec_len (rewrite);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = IP6_ADD_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = IP6_ADD_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8 tmp0[6], tmp1[6];
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* $$$$$ Dual loop: process 2 x packets here $$$$$ */
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+          ip0 = vlib_buffer_get_current (b0);
+          ip1 = vlib_buffer_get_current (b1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* $$$$$ End of processing 2 x packets $$$$$ */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip6_add_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip6_add_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ ip6_header_t * ip0;
+ ip6_hop_by_hop_header_t * hbh0;
+ u64 * copy_src0, * copy_dst0;
+ u16 new_l0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ ip0 = vlib_buffer_get_current (b0);
+
+ /* Copy the ip header left by the required amount */
+ copy_dst0 = (u64 *)(((u8 *)ip0) - rewrite_length);
+ copy_src0 = (u64 *) ip0;
+
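+          /* 5 x u64 = 40 octets, i.e. sizeof (ip6_header_t); assumes
+           * rewrite_length octets of buffer headroom are available */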
+ copy_dst0 [0] = copy_src0 [0];
+ copy_dst0 [1] = copy_src0 [1];
+ copy_dst0 [2] = copy_src0 [2];
+ copy_dst0 [3] = copy_src0 [3];
+ copy_dst0 [4] = copy_src0 [4];
+ vlib_buffer_advance (b0, - (word)rewrite_length);
+ ip0 = vlib_buffer_get_current (b0);
+
+ hbh0 = (ip6_hop_by_hop_header_t *)(ip0 + 1);
+ /* $$$ tune, rewrite_length is a multiple of 8 */
+ memcpy (hbh0, rewrite, rewrite_length);
+ /* Patch the protocol chain, insert the h-b-h (type 0) header */
+ hbh0->protocol = ip0->protocol;
+ ip0->protocol = 0;
+ new_l0 = clib_net_to_host_u16 (ip0->payload_length) + rewrite_length;
+ ip0->payload_length = clib_host_to_net_u16 (new_l0);
+
+ /* Populate the (first) h-b-h list elt */
+ next0 = IP_LOOKUP_NEXT_HOP_BY_HOP;
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ip6_add_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ }
+
+ processed++;
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ip6_add_hop_by_hop_node.index,
+ IP6_ADD_HOP_BY_HOP_ERROR_PROCESSED, processed);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_add_hop_by_hop_node) = {
+ .function = ip6_add_hop_by_hop_node_fn,
+ .name = "ip6-add-hop-by-hop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip6_add_hop_by_hop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ip6_add_hop_by_hop_error_strings),
+ .error_strings = ip6_add_hop_by_hop_error_strings,
+
+ /* See ip/lookup.h */
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip6-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip6-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip6-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip6-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip6-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd",
+ /* Next 3 arcs probably never used */
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop",
+ },
+};
+
+
+/* The main h-b-h tracer was already invoked, no need to do much here */
+typedef struct {
+ u32 next_index;
+} ip6_pop_hop_by_hop_trace_t;
+
+/* packet trace format function */
+static u8 * format_ip6_pop_hop_by_hop_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip6_pop_hop_by_hop_trace_t * t = va_arg (*args, ip6_pop_hop_by_hop_trace_t *);
+
+ s = format (s, "IP6_POP_HOP_BY_HOP: next index %d",
+ t->next_index);
+ return s;
+}
+
+vlib_node_registration_t ip6_pop_hop_by_hop_node;
+
+#define foreach_ip6_pop_hop_by_hop_error \
+_(PROCESSED, "Pkts w/ removed ip6 hop-by-hop options") \
+_(NO_HOHO, "Pkts w/ no ip6 hop-by-hop options")
+
+typedef enum {
+#define _(sym,str) IP6_POP_HOP_BY_HOP_ERROR_##sym,
+ foreach_ip6_pop_hop_by_hop_error
+#undef _
+ IP6_POP_HOP_BY_HOP_N_ERROR,
+} ip6_pop_hop_by_hop_error_t;
+
+static char * ip6_pop_hop_by_hop_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip6_pop_hop_by_hop_error
+#undef _
+};
+
+static uword
+ip6_pop_hop_by_hop_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main;
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 n_left_from, * from, * to_next;
+ ip_lookup_next_t next_index;
+ u32 processed = 0;
+ u32 no_header = 0;
+ u32 (*ioam_end_of_path_cb) (vlib_main_t *, vlib_node_runtime_t *,
+ vlib_buffer_t *, ip6_header_t *,
+ ip_adjacency_t *);
+
+ ioam_end_of_path_cb = hm->ioam_end_of_path_cb;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = IP6_POP_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = IP6_POP_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8 tmp0[6], tmp1[6];
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* $$$$$ Dual loop: process 2 x packets here $$$$$ */
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+          ip0 = vlib_buffer_get_current (b0);
+          ip1 = vlib_buffer_get_current (b1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* $$$$$ End of processing 2 x packets $$$$$ */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip6_pop_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ ip6_pop_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 adj_index0;
+ ip6_header_t * ip0;
+ ip_adjacency_t * adj0;
+ ip6_hop_by_hop_header_t *hbh0;
+ u64 * copy_dst0, * copy_src0;
+ u16 new_l0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ ip0 = vlib_buffer_get_current (b0);
+ adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+
+          /* Perfectly normal to end up here without an h-b-h header;
+           * ip6 next-header (protocol) 0 means one is present */
+ if (PREDICT_TRUE (ip0->protocol == 0))
+ {
+ hbh0 = (ip6_hop_by_hop_header_t *)(ip0+1);
+
+ /* Collect data from trace via callback */
+ next0 = ioam_end_of_path_cb ?
+ ioam_end_of_path_cb (vm, node, b0, ip0, adj0)
+ : adj0->saved_lookup_next_index;
+
+
+ /* Pop the trace data */
+ vlib_buffer_advance (b0, (hbh0->length+1)<<3);
+ new_l0 = clib_net_to_host_u16 (ip0->payload_length) -
+ ((hbh0->length+1)<<3);
+ ip0->payload_length = clib_host_to_net_u16 (new_l0);
+ ip0->protocol = hbh0->protocol;
+ copy_src0 = (u64 *)ip0;
+ copy_dst0 = copy_src0 + (hbh0->length+1);
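+              /* Shift the 40-octet v6 header forward over the h-b-h data;
+               * dst overlaps src (dst = src + 8*(length+1) octets), so
+               * copy the highest word first */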
+ copy_dst0 [4] = copy_src0[4];
+ copy_dst0 [3] = copy_src0[3];
+ copy_dst0 [2] = copy_src0[2];
+ copy_dst0 [1] = copy_src0[1];
+ copy_dst0 [0] = copy_src0[0];
+ processed++;
+ }
+ else
+ {
+ next0 = adj0->saved_lookup_next_index;
+ no_header++;
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ip6_pop_hop_by_hop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ip6_pop_hop_by_hop_node.index,
+ IP6_POP_HOP_BY_HOP_ERROR_PROCESSED, processed);
+ vlib_node_increment_counter (vm, ip6_pop_hop_by_hop_node.index,
+ IP6_POP_HOP_BY_HOP_ERROR_NO_HOHO, no_header);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_pop_hop_by_hop_node) = {
+ .function = ip6_pop_hop_by_hop_node_fn,
+ .name = "ip6-pop-hop-by-hop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip6_pop_hop_by_hop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ip6_pop_hop_by_hop_error_strings),
+ .error_strings = ip6_pop_hop_by_hop_error_strings,
+
+ /* See ip/lookup.h */
+ .n_next_nodes = IP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [IP_LOOKUP_NEXT_MISS] = "ip6-miss",
+ [IP_LOOKUP_NEXT_DROP] = "ip6-drop",
+ [IP_LOOKUP_NEXT_PUNT] = "ip6-punt",
+ [IP_LOOKUP_NEXT_LOCAL] = "ip6-local",
+ [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor",
+ [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite",
+ [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify",
+ [IP_LOOKUP_NEXT_MAP] = "ip6-map",
+ [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t",
+ [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd",
+ /* Next 3 arcs probably never used */
+ [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop",
+ [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop",
+ [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop",
+ },
+};
+
+
+static clib_error_t *
+ip6_hop_by_hop_init (vlib_main_t * vm)
+{
+ ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main;
+
+ hm->vlib_main = vm;
+ hm->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip6_hop_by_hop_init);
+
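+/*
+ * Build the hop-by-hop rewrite string: an ip6_hop_by_hop_header_t, an
+ * optional iOAM trace option with trace_option_elts elements, and an
+ * optional proof-of-work option, zero-padded to a multiple of 8 octets.
+ */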
+int ip6_ioam_set_rewrite (u8 **rwp, u32 trace_option_elts, int has_pow_option)
+{
+ u8 *rewrite = 0;
+ u32 size, rnd_size;
+ ip6_hop_by_hop_header_t *hbh;
+ ioam_trace_option_t * trace_option;
+ ioam_pow_option_t * pow_option;
+ u8 *current;
+
+ vec_free (*rwp);
+
+ if (trace_option_elts == 0 && has_pow_option == 0)
+ return 0;
+
+ if (trace_option_elts * sizeof (ioam_data_list_element_t) > 254)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ /* Work out how much space we need */
+ size = sizeof (ip6_hop_by_hop_header_t);
+
+ if (trace_option_elts)
+ {
+ size += sizeof (ip6_hop_by_hop_option_t);
+ size += trace_option_elts * (sizeof (ioam_data_list_element_t));
+ }
+ if (has_pow_option)
+ {
+ size += sizeof (ip6_hop_by_hop_option_t);
+ size += sizeof (ioam_pow_option_t);
+ }
+
+ /* Round to a multiple of 8 octets */
+ rnd_size = (size + 7) & ~7;
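+  /* Worked example (illustrative): trace-elts 3, no pow:
+   * size = 2 + 2 + 3*16 = 52, so rnd_size = 56 and hbh->length = 6 */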
+
+ /* allocate it, zero-fill / pad by construction */
+ vec_validate (rewrite, rnd_size-1);
+
+ hbh = (ip6_hop_by_hop_header_t *) rewrite;
+ /* Length of header in 8 octet units, not incl first 8 octets */
+ hbh->length = (rnd_size>>3) - 1;
+ current = (u8 *)(hbh+1);
+
+ if (trace_option_elts)
+ {
+ trace_option = (ioam_trace_option_t *)current;
+ trace_option->hdr.type = HBH_OPTION_TYPE_IOAM_DATA_LIST
+ | HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE;
+ trace_option->hdr.length = 1 /*data_list_elts_left */ +
+ trace_option_elts * sizeof (ioam_data_list_element_t);
+ trace_option->data_list_elts_left = trace_option_elts;
+ current += sizeof (ioam_trace_option_t) +
+ trace_option_elts * sizeof (ioam_data_list_element_t);
+ }
+ if (has_pow_option)
+ {
+ pow_option = (ioam_pow_option_t *)current;
+ pow_option->hdr.type = HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK
+ | HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE;
+ pow_option->hdr.length = sizeof (ioam_pow_option_t) -
+ sizeof (ip6_hop_by_hop_option_t);
+ current += sizeof (ioam_pow_option_t);
+ }
+
+ *rwp = rewrite;
+ return 0;
+}
+
+static clib_error_t *
+ip6_ioam_set_rewrite_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_hop_by_hop_main_t *hm = &ip6_hop_by_hop_main;
+ u32 trace_option_elts = 0;
+ int has_pow_option = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "trace-elts %d", &trace_option_elts))
+ ;
+ else if (unformat (input, "pow"))
+ has_pow_option = 1;
+ else
+ break;
+ }
+
+ rv = ip6_ioam_set_rewrite (&hm->rewrite, trace_option_elts, has_pow_option);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+ default:
+ return clib_error_return (0, "ip6_ioam_set_rewrite returned %d", rv);
+ }
+
+ return 0;
+}
+
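+/* Example invocation (illustrative): "ioam set rewrite trace-elts 3 pow" */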
+VLIB_CLI_COMMAND (ip6_ioam_set_rewrite_cmd, static) = {
+ .path = "ioam set rewrite",
+ .short_help = "ioam set rewrite [trace-elts <nn>] [pow]",
+ .function = ip6_ioam_set_rewrite_command_fn,
+};
+
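+/*
+ * Steer traffic for <addr>/<width> through the iOAM arcs by patching the
+ * adjacency: "add" imposes the h-b-h header, "pop" removes it, and "none"
+ * restores the saved lookup next-index.
+ */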
+int ip6_ioam_set_destination (ip6_address_t *addr, u32 mask_width, u32 vrf_id,
+ int is_add, int is_pop, int is_none)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_adjacency_t * adj;
+ u32 fib_index;
+ u32 len, adj_index;
+ int i, rv;
+ uword * p;
+ BVT(clib_bihash_kv) kv, value;
+
+ if ((is_add + is_pop + is_none) != 1)
+ return VNET_API_ERROR_INVALID_VALUE_2;
+
+ /* Go find the adjacency we're supposed to tickle */
+ p = hash_get (im->fib_index_by_table_id, vrf_id);
+
+ if (p == 0)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ fib_index = p[0];
+
+ len = vec_len (im->prefix_lengths_in_search_order);
+
+ for (i = 0; i < len; i++)
+ {
+ int dst_address_length = im->prefix_lengths_in_search_order[i];
+ ip6_address_t * mask = &im->fib_masks[dst_address_length];
+
+ if (dst_address_length != mask_width)
+ continue;
+
+ kv.key[0] = addr->as_u64[0] & mask->as_u64[0];
+ kv.key[1] = addr->as_u64[1] & mask->as_u64[1];
+ kv.key[2] = ((u64)((fib_index))<<32) | dst_address_length;
+
+ rv = BV(clib_bihash_search_inline_2)(&im->ip6_lookup_table, &kv, &value);
+ if (rv == 0)
+ goto found;
+
+ }
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ found:
+
+ /* Got it, modify as directed... */
+ adj_index = value.value;
+ adj = ip_get_adjacency (lm, adj_index);
+
+ /* Restore original lookup-next action */
+ if (adj->saved_lookup_next_index)
+ {
+ adj->lookup_next_index = adj->saved_lookup_next_index;
+ adj->saved_lookup_next_index = 0;
+ }
+
+ /* Save current action */
+ if (is_add || is_pop)
+ adj->saved_lookup_next_index = adj->lookup_next_index;
+
+ if (is_add)
+ adj->lookup_next_index = IP_LOOKUP_NEXT_ADD_HOP_BY_HOP;
+
+ if (is_pop)
+ adj->lookup_next_index = IP_LOOKUP_NEXT_POP_HOP_BY_HOP;
+
+ return 0;
+}
+
+static clib_error_t *
+ip6_ioam_set_destination_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_address_t addr;
+ u32 mask_width = ~0;
+ int is_add = 0;
+ int is_pop = 0;
+ int is_none = 0;
+ u32 vrf_id = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U/%d",
+ unformat_ip6_address, &addr, &mask_width))
+ ;
+ else if (unformat (input, "vrf-id %d", &vrf_id))
+ ;
+ else if (unformat (input, "add"))
+ is_add = 1;
+ else if (unformat (input, "pop"))
+ is_pop = 1;
+ else if (unformat (input, "none"))
+ is_none = 1;
+ else
+ break;
+ }
+
+ if ((is_add + is_pop + is_none) != 1)
+ return clib_error_return (0, "One of (add, pop, none) required");
+ if (mask_width == ~0)
+ return clib_error_return (0, "<address>/<mask-width> required");
+
+ rv = ip6_ioam_set_destination (&addr, mask_width, vrf_id,
+ is_add, is_pop, is_none);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+ default:
+ return clib_error_return (0, "ip6_ioam_set_destination returned %d", rv);
+ }
+
+ return 0;
+}
+
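+/* Example invocation (illustrative): "ioam set destination 2001:db8::1/128 vrf-id 0 add" */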
+VLIB_CLI_COMMAND (ip6_ioam_set_destination_cmd, static) = {
+ .path = "ioam set destination",
+ .short_help = "ioam set destination <ip6-address>/<width> add | pop | none",
+ .function = ip6_ioam_set_destination_command_fn,
+};
+
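+/*
+ * The callback must match the signature used by ip6_pop_hop_by_hop_node_fn:
+ * u32 (*)(vlib_main_t *, vlib_node_runtime_t *, vlib_buffer_t *,
+ *         ip6_header_t *, ip_adjacency_t *)
+ */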
+void vnet_register_ioam_end_of_path_callback (void *cb)
+{
+ ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main;
+
+ hm->ioam_end_of_path_cb = cb;
+}
+
diff --git a/vnet/vnet/ip/ip6_hop_by_hop.h b/vnet/vnet/ip/ip6_hop_by_hop.h
new file mode 100644
index 00000000000..82bafc5777b
--- /dev/null
+++ b/vnet/vnet/ip/ip6_hop_by_hop.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ip6_hop_by_hop_h__
+#define __included_ip6_hop_by_hop_h__
+
+#include <vnet/ip/ip6_hop_by_hop_packet.h>
+
+typedef struct {
+ /* The current rewrite we're using */
+ u8 * rewrite;
+
+ /* Trace data processing callback */
+ void *ioam_end_of_path_cb;
+
+ /* Configured node-id */
+ u32 node_id;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} ip6_hop_by_hop_main_t;
+
+#endif /* __included_ip6_hop_by_hop_h__ */
diff --git a/vnet/vnet/ip/ip6_hop_by_hop_packet.h b/vnet/vnet/ip/ip6_hop_by_hop_packet.h
new file mode 100644
index 00000000000..a3d19035dae
--- /dev/null
+++ b/vnet/vnet/ip/ip6_hop_by_hop_packet.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ip6_hop_by_hop_packet_h__
+#define __included_ip6_hop_by_hop_packet_h__
+
+typedef struct {
+ /* Protocol for next header */
+ u8 protocol;
+ /*
+ * Length of hop_by_hop header in 8 octet units,
+ * not including the first 8 octets
+ */
+ u8 length;
+} ip6_hop_by_hop_header_t;
+
+typedef struct {
+ /* Option Type */
+#define HBH_OPTION_TYPE_SKIP_UNKNOWN (0x0 << 6)
+#define HBH_OPTION_TYPE_DISCARD_UNKNOWN (0x1 << 6)
+#define HBH_OPTION_TYPE_DISCARD_UNKNOWN_ICMP (0x2 << 6)
+#define HBH_OPTION_TYPE_DISCARD_UNKNOWN_ICMP_NOT_MCAST (0x3 << 6)
+#define HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE (1<<5)
+#define HBH_OPTION_TYPE_MASK (0x1F)
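+  /* Top two bits: action for an unrecognized option; bit 5 set means
+     the option data may change en route (RFC 2460, section 4.2) */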
+ u8 type;
+ /* Length in octets of the option data field */
+ u8 length;
+} ip6_hop_by_hop_option_t;
+
+/* $$$$ Placeholder option-type code points, pending IANA assignment */
+#define HBH_OPTION_TYPE_IOAM_DATA_LIST 1
+#define HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK 2
+
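+/* One trace element, filled in per hop; ttl_node_id packs
+   (hop_limit << 24) | node_id, as written by the ip6-hop-by-hop node */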
+typedef struct {
+ u32 ttl_node_id;
+ u16 ingress_if;
+ u16 egress_if;
+ u32 timestamp;
+ u32 app_data;
+} ioam_data_list_element_t;
+
+typedef CLIB_PACKED(struct {
+ ip6_hop_by_hop_option_t hdr;
+ u8 data_list_elts_left;
+ ioam_data_list_element_t elts[0];
+}) ioam_trace_option_t;
+
+typedef CLIB_PACKED(struct {
+ ip6_hop_by_hop_option_t hdr;
+ u8 pow_type;
+ u8 reserved;
+ u32 random[2];
+ u32 cumulative[2];
+}) ioam_pow_option_t;
+
+#endif /* __included_ip6_hop_by_hop_packet_h__ */
diff --git a/vnet/vnet/ip/ip6_input.c b/vnet/vnet/ip/ip6_input.c
new file mode 100644
index 00000000000..ef8c7762625
--- /dev/null
+++ b/vnet/vnet/ip/ip6_input.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip6_input.c: IP v6 input node
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ppp/ppp.h>
+#include <vnet/hdlc/hdlc.h>
+
+typedef struct {
+ u8 packet_data[64];
+} ip6_input_trace_t;
+
+static u8 * format_ip6_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ip6_input_trace_t * t = va_arg (*va, ip6_input_trace_t *);
+
+ s = format (s, "%U",
+ format_ip6_header,
+ t->packet_data, sizeof (t->packet_data));
+
+ return s;
+}
+
+typedef enum {
+ IP6_INPUT_NEXT_DROP,
+ IP6_INPUT_NEXT_LOOKUP,
+ IP6_INPUT_NEXT_TTL_EXPIRE,
+ IP6_INPUT_N_NEXT,
+} ip6_input_next_t;
+
+/* Validate IP v6 packets and pass them either to forwarding code
+ or drop exception packets. */
+static uword
+ip6_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 n_left_from, * from, * to_next;
+ ip6_input_next_t next_index;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_input_node.index);
+ vlib_simple_counter_main_t * cm;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (ip6_input_trace_t));
+
+ cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+ VNET_INTERFACE_COUNTER_IP6);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * p0, * p1;
+ ip6_header_t * ip0, * ip1;
+ ip_config_main_t * cm0, * cm1;
+ u32 pi0, sw_if_index0, next0;
+ u32 pi1, sw_if_index1, next1;
+ u8 error0, error1, cast0, cast1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD);
+ }
+
+ pi0 = from[0];
+ pi1 = from[1];
+
+ to_next[0] = pi0;
+ to_next[1] = pi1;
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+
+ ip0 = vlib_buffer_get_current (p0);
+ ip1 = vlib_buffer_get_current (p1);
+
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX];
+
+ cast0 = ip6_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+ cast1 = ip6_address_is_multicast (&ip1->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+
+ cm0 = lm->rx_config_mains + cast0;
+ cm1 = lm->rx_config_mains + cast1;
+
+ vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0);
+ vnet_buffer (p1)->ip.current_config_index = vec_elt (cm1->config_index_by_sw_if_index, sw_if_index1);
+
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
+ vnet_buffer (p1)->ip.adj_index[VLIB_RX] = ~0;
+
+ vnet_get_config_data (&cm0->config_main,
+ &vnet_buffer (p0)->ip.current_config_index,
+ &next0,
+ /* # bytes of config data */ 0);
+ vnet_get_config_data (&cm1->config_main,
+ &vnet_buffer (p1)->ip.current_config_index,
+ &next1,
+ /* # bytes of config data */ 0);
+
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+
+ error0 = error1 = IP6_ERROR_NONE;
+
+ /* Version != 6? Drop it. */
+ error0 = (clib_net_to_host_u32 (ip0->ip_version_traffic_class_and_flow_label) >> 28) != 6 ? IP6_ERROR_VERSION : error0;
+ error1 = (clib_net_to_host_u32 (ip1->ip_version_traffic_class_and_flow_label) >> 28) != 6 ? IP6_ERROR_VERSION : error1;
+
+	  /* Hop limit <= 1? Send to the TTL-expire node rather than
+	   * dropping outright: link-local packets, e.g. DHCPv6 client
+	   * packets, legitimately arrive with hop-limit 1.
+	   */
+ error0 = ip0->hop_limit <= 1 ? IP6_ERROR_TIME_EXPIRED : error0;
+ error1 = ip1->hop_limit <= 1 ? IP6_ERROR_TIME_EXPIRED : error1;
+
+	  /* Buffer must contain at least a full fixed IPv6 header. */
+ error0 = p0->current_length < sizeof (ip0[0]) ? IP6_ERROR_TOO_SHORT : error0;
+ error1 = p1->current_length < sizeof (ip1[0]) ? IP6_ERROR_TOO_SHORT : error1;
+
+ if (PREDICT_FALSE(error0 != IP6_ERROR_NONE))
+ {
+ next0 = (error0 == IP6_ERROR_TIME_EXPIRED) ?
+ IP6_INPUT_NEXT_TTL_EXPIRE : IP6_INPUT_NEXT_DROP;
+ }
+ if (PREDICT_FALSE(error1 != IP6_ERROR_NONE))
+ {
+ next1 = (error1 == IP6_ERROR_TIME_EXPIRED) ?
+ IP6_INPUT_NEXT_TTL_EXPIRE : IP6_INPUT_NEXT_DROP;
+ }
+
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, pi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ ip_config_main_t * cm0;
+ u32 pi0, sw_if_index0, next0;
+ u8 error0, cast0;
+
+ pi0 = from[0];
+ to_next[0] = pi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, pi0);
+ ip0 = vlib_buffer_get_current (p0);
+
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+ cast0 = ip6_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+ cm0 = lm->rx_config_mains + cast0;
+ vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0);
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
+
+ vnet_get_config_data (&cm0->config_main,
+ &vnet_buffer (p0)->ip.current_config_index,
+ &next0,
+ /* # bytes of config data */ 0);
+
+ vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+ error0 = IP6_ERROR_NONE;
+
+ /* Version != 6? Drop it. */
+ error0 = (clib_net_to_host_u32 (ip0->ip_version_traffic_class_and_flow_label) >> 28) != 6 ? IP6_ERROR_VERSION : error0;
+
+	  /* Hop limit <= 1? Send to the TTL-expire node rather than
+	   * dropping outright: link-local packets, e.g. DHCPv6 client
+	   * packets, legitimately arrive with hop-limit 1.
+	   */
+ error0 = ip0->hop_limit <= 1 ? IP6_ERROR_TIME_EXPIRED : error0;
+
+	  /* Buffer must contain at least a full fixed IPv6 header. */
+ error0 = p0->current_length < sizeof (ip0[0]) ? IP6_ERROR_TOO_SHORT : error0;
+
+ if (PREDICT_FALSE(error0 != IP6_ERROR_NONE))
+ {
+ next0 = (error0 == IP6_ERROR_TIME_EXPIRED) ?
+ IP6_INPUT_NEXT_TTL_EXPIRE : IP6_INPUT_NEXT_DROP;
+ }
+ p0->error = error_node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ pi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+static char * ip6_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip6_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (ip6_input_node) = {
+ .function = ip6_input,
+ .name = "ip6-input",
+ .vector_size = sizeof (u32),
+
+ .n_errors = IP6_N_ERROR,
+ .error_strings = ip6_error_strings,
+
+ .n_next_nodes = IP6_INPUT_N_NEXT,
+ .next_nodes = {
+ [IP6_INPUT_NEXT_DROP] = "error-drop",
+ [IP6_INPUT_NEXT_LOOKUP] = "ip6-lookup",
+ [IP6_INPUT_NEXT_TTL_EXPIRE] = "ip6-icmp-ttl-expire",
+ },
+
+ .format_buffer = format_ip6_header,
+ .format_trace = format_ip6_input_trace,
+};
+
+static clib_error_t * ip6_init (vlib_main_t * vm)
+{
+ ethernet_register_input_type (vm, ETHERNET_TYPE_IP6,
+ ip6_input_node.index);
+ ppp_register_input_protocol (vm, PPP_PROTOCOL_ip6,
+ ip6_input_node.index);
+ hdlc_register_input_protocol (vm, HDLC_PROTOCOL_ip6,
+ ip6_input_node.index);
+
+ {
+ pg_node_t * pn;
+ pn = pg_get_node (ip6_input_node.index);
+ pn->unformat_edit = unformat_pg_ip6_header;
+ }
+
+ /* Set flow hash to something non-zero. */
+ ip6_main.flow_hash_seed = 0xdeadbeef;
+
+ /* Default hop limit for packets we generate. */
+ ip6_main.host_config.ttl = 64;
+
+ return /* no error */ 0;
+}
+
+VLIB_INIT_FUNCTION (ip6_init);
diff --git a/vnet/vnet/ip/ip6_neighbor.c b/vnet/vnet/ip/ip6_neighbor.c
new file mode 100644
index 00000000000..28f964c804f
--- /dev/null
+++ b/vnet/vnet/ip/ip6_neighbor.c
@@ -0,0 +1,3148 @@
+/*
+ * ip/ip6_neighbor.c: IP6 neighbor handling
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/mhash.h>
+#include <vppinfra/md5.h>
+
+#if DPDK==1
+#include <vnet/devices/dpdk/dpdk.h>
+#endif
+
+typedef struct {
+ ip6_address_t ip6_address;
+ u32 sw_if_index;
+ u32 pad;
+} ip6_neighbor_key_t;
+
+/* can't use sizeof link_layer_address, that's 8 */
+#define ETHER_MAC_ADDR_LEN 6
+
+typedef struct {
+ ip6_neighbor_key_t key;
+ u8 link_layer_address[8];
+ u64 cpu_time_last_updated;
+} ip6_neighbor_t;
+
+/* advertised prefix option */
+typedef struct {
+ /* basic advertised information */
+ ip6_address_t prefix;
+ u8 prefix_len;
+ int adv_on_link_flag;
+ int adv_autonomous_flag;
+ u32 adv_valid_lifetime_in_secs;
+ u32 adv_pref_lifetime_in_secs;
+
+ /* advertised values are computed from these times if decrementing */
+ f64 valid_lifetime_expires;
+ f64 pref_lifetime_expires;
+
+ /* local information */
+ int enabled;
+ int deprecated_prefix_flag;
+ int decrement_lifetime_flag;
+
+#define MIN_ADV_VALID_LIFETIME 7203 /* seconds */
+#define DEF_ADV_VALID_LIFETIME 2592000
+#define DEF_ADV_PREF_LIFETIME 604800
+
+ /* extensions are added here, mobile, DNS etc.. */
+} ip6_radv_prefix_t;
+
+
+typedef struct {
+ /* group information */
+ u8 type;
+ ip6_address_t mcast_address;
+ u16 num_sources;
+ ip6_address_t *mcast_source_address_pool;
+} ip6_mldp_group_t;
+
+/* configured router advertisement information per ipv6 interface */
+typedef struct {
+
+ /* advertised config information, zero means unspecified */
+ u8 curr_hop_limit;
+ int adv_managed_flag;
+ int adv_other_flag;
+ u16 adv_router_lifetime_in_sec;
+ u32 adv_neighbor_reachable_time_in_msec;
+ u32 adv_time_in_msec_between_retransmitted_neighbor_solicitations;
+
+ /* mtu option */
+ u32 adv_link_mtu;
+
+ /* source link layer option */
+ u8 link_layer_address[8];
+ u8 link_layer_addr_len;
+
+ /* prefix option */
+ ip6_radv_prefix_t * adv_prefixes_pool;
+
+ /* Hash table mapping address to index in interface advertised prefix pool. */
+ mhash_t address_to_prefix_index;
+
+ /* MLDP group information */
+ ip6_mldp_group_t * mldp_group_pool;
+
+ /* Hash table mapping address to index in mldp address pool. */
+ mhash_t address_to_mldp_index;
+
+ /* local information */
+ u32 sw_if_index;
+ u32 fib_index;
+ int send_radv; /* radv on/off on this interface - set by config */
+  int cease_radv;  /* we are ceasing to send - set by config */
+ int send_unicast;
+ int adv_link_layer_address;
+ int prefix_option;
+ int failed_device_check;
+ int all_routers_mcast;
+ u32 seed;
+ u64 randomizer;
+ int ref_count;
+ u32 all_nodes_adj_index;
+ u32 all_routers_adj_index;
+ u32 all_mldv2_routers_adj_index;
+
+ /* timing information */
+#define DEF_MAX_RADV_INTERVAL 200
+#define DEF_MIN_RADV_INTERVAL (.75 * DEF_MAX_RADV_INTERVAL)
+#define DEF_CURR_HOP_LIMIT 64
+#define DEF_DEF_RTR_LIFETIME (3 * DEF_MAX_RADV_INTERVAL)
+#define MAX_DEF_RTR_LIFETIME 9000
+
+#define MAX_INITIAL_RTR_ADVERT_INTERVAL 16 /* seconds */
+#define MAX_INITIAL_RTR_ADVERTISEMENTS 3 /*transmissions */
+#define MIN_DELAY_BETWEEN_RAS 3 /* seconds */
+#define MAX_DELAY_BETWEEN_RAS 1800 /* seconds */
+#define MAX_RA_DELAY_TIME .5 /* seconds */
+
+ f64 max_radv_interval;
+ f64 min_radv_interval;
+ f64 min_delay_between_radv;
+ f64 max_delay_between_radv;
+ f64 max_rtr_default_lifetime;
+
+ f64 last_radv_time;
+ f64 last_multicast_time;
+ f64 next_multicast_time;
+
+
+ u32 initial_adverts_count;
+ f64 initial_adverts_interval;
+ u32 initial_adverts_sent;
+
+ /* stats */
+ u32 n_advertisements_sent;
+ u32 n_solicitations_rcvd;
+ u32 n_solicitations_dropped;
+
+  /* Link local address to use (defaults to that of the underlying physical interface for logical interfaces) */
+ ip6_address_t link_local_address;
+ u8 link_local_prefix_len;
+
+} ip6_radv_t;
+
+typedef struct {
+ u32 next_index;
+ uword node_index;
+ uword type_opaque;
+ uword data;
+} pending_resolution_t;
+
+
+typedef struct {
+ /* Hash tables mapping name to opcode. */
+ uword * opcode_by_name;
+
+ /* lite beer "glean" adjacency handling */
+ mhash_t pending_resolutions_by_address;
+ pending_resolution_t * pending_resolutions;
+
+ u32 * neighbor_input_next_index_by_hw_if_index;
+
+ ip6_neighbor_t * neighbor_pool;
+
+ mhash_t neighbor_index_by_key;
+
+ u32 * if_radv_pool_index_by_sw_if_index;
+
+ ip6_radv_t * if_radv_pool;
+
+ /* Neighbor attack mitigation */
+ u32 limit_neighbor_cache_size;
+ u32 neighbor_delete_rotor;
+
+} ip6_neighbor_main_t;
+
+static ip6_neighbor_main_t ip6_neighbor_main;
+
+static u8 * format_ip6_neighbor_ip6_entry (u8 * s, va_list * va)
+{
+ vlib_main_t * vm = va_arg (*va, vlib_main_t *);
+ ip6_neighbor_t * n = va_arg (*va, ip6_neighbor_t *);
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_sw_interface_t * si;
+
+ if (! n)
+ return format (s, "%=12s%=20s%=20s%=40s", "Time", "Address", "Link layer", "Interface");
+
+ si = vnet_get_sw_interface (vnm, n->key.sw_if_index);
+ s = format (s, "%=12U%=20U%=20U%=40U",
+ format_vlib_cpu_time, vm, n->cpu_time_last_updated,
+ format_ip6_address, &n->key.ip6_address,
+ format_ethernet_address, n->link_layer_address,
+ format_vnet_sw_interface_name, vnm, si);
+
+ return s;
+}
+
+static clib_error_t *
+ip6_neighbor_sw_interface_up_down (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 flags)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_neighbor_t * n;
+
+ if (! (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+ {
+ u32 i, * to_delete = 0;
+
+ pool_foreach (n, nm->neighbor_pool, ({
+ if (n->key.sw_if_index == sw_if_index)
+ vec_add1 (to_delete, n - nm->neighbor_pool);
+ }));
+
+ for (i = 0; i < vec_len (to_delete); i++)
+ {
+ n = pool_elt_at_index (nm->neighbor_pool, to_delete[i]);
+ mhash_unset (&nm->neighbor_index_by_key, &n->key, 0);
+ pool_put (nm->neighbor_pool, n);
+ }
+
+ vec_free (to_delete);
+ }
+
+ return 0;
+}
+
+VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip6_neighbor_sw_interface_up_down);
+
+static void unset_random_neighbor_entry (void)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_main_t * vm = vnm->vlib_main;
+ ip6_neighbor_t * e;
+ u32 index;
+
+ index = pool_next_index (nm->neighbor_pool, nm->neighbor_delete_rotor);
+ nm->neighbor_delete_rotor = index;
+
+ /* Try again from elt 0, could happen if an intfc goes down */
+ if (index == ~0)
+ {
+ index = pool_next_index (nm->neighbor_pool, nm->neighbor_delete_rotor);
+ nm->neighbor_delete_rotor = index;
+ }
+
+ /* Nothing left in the pool */
+ if (index == ~0)
+ return;
+
+ e = pool_elt_at_index (nm->neighbor_pool, index);
+
+ vnet_unset_ip6_ethernet_neighbor (vm, e->key.sw_if_index,
+ &e->key.ip6_address,
+ e->link_layer_address,
+ ETHER_MAC_ADDR_LEN);
+}
+
+typedef struct {
+ u8 is_add;
+ u8 pad;
+ u8 link_layer_address[6];
+ u32 sw_if_index;
+ ip6_address_t addr;
+} ip6_neighbor_set_unset_rpc_args_t;
+
+static void ip6_neighbor_set_unset_rpc_callback
+( ip6_neighbor_set_unset_rpc_args_t * a);
+
+#if DPDK > 0
+static void set_unset_ip6_neighbor_rpc
+(vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * a,
+ u8 *link_layer_address,
+ int is_add)
+{
+ ip6_neighbor_set_unset_rpc_args_t args;
+
+ args.sw_if_index = sw_if_index;
+ args.is_add = is_add;
+ memcpy (&args.addr, a, sizeof (*a));
+  memcpy (args.link_layer_address, link_layer_address, 6);
+
+ vl_api_rpc_call_main_thread (ip6_neighbor_set_unset_rpc_callback,
+ (u8 *) &args, sizeof (args));
+}
+#endif
+
+int
+vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * a,
+ u8 * link_layer_address,
+ uword n_bytes_link_layer_address)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_neighbor_key_t k;
+ ip6_neighbor_t * n;
+ ip6_main_t * im = &ip6_main;
+ uword * p;
+ u32 next_index;
+ pending_resolution_t * pr;
+
+#if DPDK > 0
+ if (os_get_cpu_number())
+ {
+ set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address,
+ 1 /* set new neighbor */);
+ return 0;
+ }
+#endif
+
+ k.sw_if_index = sw_if_index;
+ k.ip6_address = a[0];
+ k.pad = 0;
+
+ vlib_worker_thread_barrier_sync (vm);
+
+ p = mhash_get (&nm->neighbor_index_by_key, &k);
+ if (p)
+ n = pool_elt_at_index (nm->neighbor_pool, p[0]);
+ else
+ {
+ ip6_add_del_route_args_t args;
+ ip_adjacency_t adj;
+
+ memset (&adj, 0, sizeof(adj));
+ adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+ adj.explicit_fib_index = ~0;
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_IP6,
+ sw_if_index,
+ ip6_rewrite_node.index,
+ link_layer_address,
+ &adj.rewrite_header,
+ sizeof (adj.rewrite_data));
+
+ args.table_index_or_table_id = im->fib_index_by_sw_if_index[sw_if_index];
+ args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_ADD | IP6_ROUTE_FLAG_NEIGHBOR;
+ args.dst_address = a[0];
+ args.dst_address_length = 128;
+ args.adj_index = ~0;
+ args.add_adj = &adj;
+ args.n_add_adj = 1;
+
+ ip6_add_del_route (im, &args);
+ pool_get (nm->neighbor_pool, n);
+ mhash_set (&nm->neighbor_index_by_key, &k, n - nm->neighbor_pool,
+ /* old value */ 0);
+ n->key = k;
+ }
+
+ /* Update time stamp and ethernet address. */
+ memcpy (n->link_layer_address, link_layer_address, n_bytes_link_layer_address);
+ n->cpu_time_last_updated = clib_cpu_time_now ();
+
+ /* Customer(s) waiting for this address to be resolved? */
+ p = mhash_get (&nm->pending_resolutions_by_address, a);
+ if (p == 0)
+ goto out;
+
+ next_index = p[0];
+
+ while (next_index != (u32)~0)
+ {
+ pr = pool_elt_at_index (nm->pending_resolutions, next_index);
+ vlib_process_signal_event (vm, pr->node_index,
+ pr->type_opaque,
+ pr->data);
+ next_index = pr->next_index;
+ pool_put (nm->pending_resolutions, pr);
+ }
+
+ mhash_unset (&nm->pending_resolutions_by_address, a, 0);
+
+out:
+ vlib_worker_thread_barrier_release(vm);
+ return 0;
+}
+
+int
+vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t * a,
+ u8 * link_layer_address,
+ uword n_bytes_link_layer_address)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_neighbor_key_t k;
+ ip6_neighbor_t * n;
+ ip6_main_t * im = &ip6_main;
+ ip6_add_del_route_args_t args;
+ uword * p;
+ int rv = 0;
+
+#if DPDK > 0
+ if (os_get_cpu_number())
+ {
+ set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address,
+ 0 /* unset */);
+ return 0;
+ }
+#endif
+
+ k.sw_if_index = sw_if_index;
+ k.ip6_address = a[0];
+ k.pad = 0;
+
+ vlib_worker_thread_barrier_sync (vm);
+
+ p = mhash_get (&nm->neighbor_index_by_key, &k);
+ if (p == 0)
+ {
+ rv = -1;
+ goto out;
+ }
+
+ n = pool_elt_at_index (nm->neighbor_pool, p[0]);
+ mhash_unset (&nm->neighbor_index_by_key, &n->key, 0);
+ pool_put (nm->neighbor_pool, n);
+
+ args.table_index_or_table_id = im->fib_index_by_sw_if_index[sw_if_index];
+ args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_DEL
+ | IP6_ROUTE_FLAG_NEIGHBOR;
+ args.dst_address = a[0];
+ args.dst_address_length = 128;
+ args.adj_index = ~0;
+ args.add_adj = NULL;
+ args.n_add_adj = 0;
+ ip6_add_del_route (im, &args);
+ out:
+ vlib_worker_thread_barrier_release(vm);
+ return rv;
+}
+
+static void ip6_neighbor_set_unset_rpc_callback
+( ip6_neighbor_set_unset_rpc_args_t * a)
+{
+ vlib_main_t * vm = vlib_get_main();
+ if (a->is_add)
+ vnet_set_ip6_ethernet_neighbor (vm, a->sw_if_index, &a->addr,
+ a->link_layer_address, 6);
+ else
+ vnet_unset_ip6_ethernet_neighbor (vm, a->sw_if_index, &a->addr,
+ a->link_layer_address, 6);
+}
+
+static int
+ip6_neighbor_sort (void *a1, void *a2)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_t * n1 = a1, * n2 = a2;
+ int cmp;
+ cmp = vnet_sw_interface_compare (vnm, n1->key.sw_if_index,
+ n2->key.sw_if_index);
+ if (! cmp)
+ cmp = ip6_address_compare (&n1->key.ip6_address, &n2->key.ip6_address);
+ return cmp;
+}
+
+static clib_error_t *
+show_ip6_neighbors (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_neighbor_t * n, * ns;
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+
+ /* Filter entries by interface if given. */
+ sw_if_index = ~0;
+ (void) unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index);
+
+ ns = 0;
+ pool_foreach (n, nm->neighbor_pool, ({ vec_add1 (ns, n[0]); }));
+ vec_sort_with_function (ns, ip6_neighbor_sort);
+ vlib_cli_output (vm, "%U", format_ip6_neighbor_ip6_entry, vm, 0);
+ vec_foreach (n, ns) {
+ if (sw_if_index != ~0 && n->key.sw_if_index != sw_if_index)
+ continue;
+ vlib_cli_output (vm, "%U", format_ip6_neighbor_ip6_entry, vm, n);
+ }
+ vec_free (ns);
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_ip6_neighbors_command, static) = {
+ .path = "show ip6 neighbors",
+ .function = show_ip6_neighbors,
+ .short_help = "Show ip6 neighbors",
+};
+
+static clib_error_t *
+set_ip6_neighbor (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_address_t addr;
+ u8 mac_address[6];
+ int addr_valid = 0;
+ int is_del = 0;
+ u32 sw_if_index;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ /* intfc, ip6-address, mac-address */
+ if (unformat (input, "%U %U %U",
+ unformat_vnet_sw_interface, vnm, &sw_if_index,
+ unformat_ip6_address, &addr,
+ unformat_ethernet_address, mac_address))
+ addr_valid = 1;
+
+ else if (unformat (input, "delete") || unformat (input, "del"))
+ is_del = 1;
+ else
+ break;
+ }
+
+ if (!addr_valid)
+ return clib_error_return (0, "Missing interface, ip6 or hw address");
+
+ if (!is_del)
+ vnet_set_ip6_ethernet_neighbor (vm, sw_if_index, &addr,
+ mac_address, sizeof(mac_address));
+ else
+ vnet_unset_ip6_ethernet_neighbor (vm, sw_if_index, &addr,
+ mac_address, sizeof(mac_address));
+ return 0;
+}
+
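+/* Example invocation (illustrative names/addresses):
+ * "set ip6 neighbor GigabitEthernet0/8/0 2001:db8::2 00:02:03:04:05:06" */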
+VLIB_CLI_COMMAND (set_ip6_neighbor_command, static) = {
+ .path = "set ip6 neighbor",
+ .function = set_ip6_neighbor,
+ .short_help = "set ip6 neighbor [del] <intfc> <ip6-address> <mac-address>",
+};
+
+typedef enum {
+ ICMP6_NEIGHBOR_SOLICITATION_NEXT_DROP,
+ ICMP6_NEIGHBOR_SOLICITATION_NEXT_REPLY,
+ ICMP6_NEIGHBOR_SOLICITATION_N_NEXT,
+} icmp6_neighbor_solicitation_or_advertisement_next_t;
+
+static_always_inline uword
+icmp6_neighbor_solicitation_or_advertisement (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ uword is_solicitation)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next_index, n_advertisements_sent;
+ icmp6_neighbor_discovery_option_type_t option_type;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_icmp_input_node.index);
+ int bogus_length;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp6_input_trace_t));
+
+ option_type =
+ (is_solicitation
+ ? ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address
+ : ICMP6_NEIGHBOR_DISCOVERY_OPTION_target_link_layer_address);
+ n_advertisements_sent = 0;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ icmp6_neighbor_solicitation_or_advertisement_header_t * h0;
+ icmp6_neighbor_discovery_ethernet_link_layer_address_option_t * o0;
+ u32 bi0, options_len0, sw_if_index0, next0, error0;
+ u32 ip6_sadd_link_local, ip6_sadd_unspecified;
+ int is_rewrite0;
+ u32 ni0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (p0);
+ h0 = ip6_next_header (ip0);
+ options_len0 = clib_net_to_host_u16 (ip0->payload_length) - sizeof (h0[0]);
+
+ error0 = ICMP6_ERROR_NONE;
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+ ip6_sadd_link_local = ip6_address_is_link_local_unicast(&ip0->src_address);
+ ip6_sadd_unspecified = ip6_address_is_unspecified (&ip0->src_address);
+
+ /* Check that source address is unspecified, link-local or else on-link. */
+ if (!ip6_sadd_unspecified && !ip6_sadd_link_local)
+ {
+ u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
+ ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0);
+
+ /* Allow all realistic-looking rewrite adjacencies to pass */
+ ni0 = adj0->lookup_next_index;
+ is_rewrite0 = (ni0 >= IP_LOOKUP_NEXT_ARP) &&
+ (ni0 < IP_LOOKUP_N_NEXT);
+
+ error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0
+ || ! is_rewrite0)
+ ? ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK
+ : error0);
+ }
+
+ o0 = (void *) (h0 + 1);
+ o0 = ((options_len0 == 8 && o0->header.type == option_type
+ && o0->header.n_data_u64s == 1) ? o0 : 0);
+
+	  /* If src address is unspecified or link-local, do not learn the neighbor MAC */
+ if (PREDICT_TRUE (error0 == ICMP6_ERROR_NONE && o0 != 0 &&
+ !ip6_sadd_unspecified && !ip6_sadd_link_local))
+ {
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ if (nm->limit_neighbor_cache_size &&
+ pool_elts (nm->neighbor_pool) >= nm->limit_neighbor_cache_size)
+ unset_random_neighbor_entry();
+ vnet_set_ip6_ethernet_neighbor (
+ vm, sw_if_index0,
+ is_solicitation ? &ip0->src_address : &h0->target_address,
+ o0->ethernet_address, sizeof (o0->ethernet_address));
+ }
+
+ if (is_solicitation && error0 == ICMP6_ERROR_NONE)
+ {
+ /* Check that target address is one that we know about. */
+ ip_interface_address_t * ia0;
+ ip6_address_fib_t ip6_af0;
+ void * oldheap;
+
+ ip6_addr_fib_init (&ip6_af0, &h0->target_address,
+ vec_elt (im->fib_index_by_sw_if_index,
+ sw_if_index0));
+
+ /* Gross kludge, "thank you" MJ, don't even ask */
+ oldheap = clib_mem_set_heap (clib_per_cpu_mheaps[0]);
+ ia0 = ip_get_interface_address (lm, &ip6_af0);
+ clib_mem_set_heap (oldheap);
+ error0 = ia0 == 0 ?
+ ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN : error0;
+ }
+
+ if (is_solicitation)
+ next0 = (error0 != ICMP6_ERROR_NONE
+ ? ICMP6_NEIGHBOR_SOLICITATION_NEXT_DROP
+ : ICMP6_NEIGHBOR_SOLICITATION_NEXT_REPLY);
+ else
+ {
+ next0 = 0;
+ error0 = error0 == ICMP6_ERROR_NONE ?
+ ICMP6_ERROR_NEIGHBOR_ADVERTISEMENTS_RX : error0;
+ }
+
+ if (is_solicitation && error0 == ICMP6_ERROR_NONE)
+ {
+ vnet_sw_interface_t * sw_if0;
+ ethernet_interface_t * eth_if0;
+ ethernet_header_t *eth0;
+
+ /* dst address is either source address or the all-nodes mcast addr */
+ if(!ip6_sadd_unspecified)
+ ip0->dst_address = ip0->src_address;
+ else
+ ip6_set_reserved_multicast_address(&ip0->dst_address,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_all_hosts);
+
+ ip0->src_address = h0->target_address;
+ ip0->hop_limit = 255;
+ h0->icmp.type = ICMP6_neighbor_advertisement;
+
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index0);
+ ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+ if (eth_if0 && o0)
+ {
+ memcpy (o0->ethernet_address, eth_if0->address, 6);
+ o0->header.type =
+ ICMP6_NEIGHBOR_DISCOVERY_OPTION_target_link_layer_address;
+ }
+
+ h0->advertisement_flags = clib_host_to_net_u32
+ (ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_SOLICITED
+ | ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_OVERRIDE);
+
+ h0->icmp.checksum = 0;
+ h0->icmp.checksum =
+ ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0,
+ &bogus_length);
+ ASSERT(bogus_length == 0);
+
+ /* Reuse current MAC header, copy SMAC to DMAC and
+ * interface MAC to SMAC */
+ vlib_buffer_reset (p0);
+ eth0 = vlib_buffer_get_current(p0);
+ memcpy(eth0->dst_address, eth0->src_address, 6);
+ memcpy(eth0->src_address, eth_if0->address, 6);
+
+ /* Setup input and output sw_if_index for packet */
+ ASSERT(vnet_buffer(p0)->sw_if_index[VLIB_RX] == sw_if_index0);
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ vnet_buffer(p0)->sw_if_index[VLIB_RX] =
+ vnet_main.local_interface_sw_if_index;
+
+ n_advertisements_sent++;
+ }
+
+ p0->error = error_node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ /* Account for advertisements sent. */
+ vlib_error_count (vm, error_node->node_index, ICMP6_ERROR_NEIGHBOR_ADVERTISEMENTS_TX, n_advertisements_sent);
+
+ return frame->n_vectors;
+}
+
+/* for "syslogging" - use elog for now */
+#define foreach_log_level \
+ _ (DEBUG, "DEBUG") \
+ _ (INFO, "INFORMATION") \
+ _ (NOTICE, "NOTICE") \
+ _ (WARNING, "WARNING") \
+ _ (ERR, "ERROR") \
+ _ (CRIT, "CRITICAL") \
+ _ (ALERT, "ALERT") \
+ _ (EMERG, "EMERGENCY")
+
+typedef enum {
+#define _(f,s) LOG_##f,
+ foreach_log_level
+#undef _
+} log_level_t;
+
+static char * log_level_strings[] = {
+#define _(f,s) s,
+ foreach_log_level
+#undef _
+};
+
+static int logmask = 1 << LOG_DEBUG;
+
+static void
+ip6_neighbor_syslog(vlib_main_t *vm, int priority, char * fmt, ...)
+{
+ /* just use elog for now */
+ u8 *what;
+ va_list va;
+
+ if( (priority > LOG_EMERG) ||
+ !(logmask & (1 << priority)))
+ return;
+
+ va_start (va, fmt);
+ if(fmt)
+ {
+ what = va_format (0, fmt, &va);
+
+ ELOG_TYPE_DECLARE (e) = {
+ .format = "ip6 nd: (%s): %s",
+ .format_args = "T4T4",
+ };
+ struct { u32 s[2]; } * ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->s[0] = elog_string(&vm->elog_main, log_level_strings[priority]);
+ ed->s[1] = elog_string(&vm->elog_main, (char *)what);
+ }
+ va_end (va);
+ return;
+}
+
+/* ipv6 neighbor discovery - router advertisements */
+typedef enum {
+ ICMP6_ROUTER_SOLICITATION_NEXT_DROP,
+ ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_RW,
+ ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_TX,
+ ICMP6_ROUTER_SOLICITATION_N_NEXT,
+} icmp6_router_solicitation_or_advertisement_next_t;
+
+static_always_inline uword
+icmp6_router_solicitation(vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im = &ip6_main;
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next_index;
+ u32 n_advertisements_sent = 0;
+ int bogus_length;
+
+ icmp6_neighbor_discovery_option_type_t option_type;
+
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_icmp_input_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp6_input_trace_t));
+
+  /* the source may append its link-layer address option */
+ option_type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ ip6_radv_t *radv_info = 0;
+
+ icmp6_neighbor_discovery_header_t * h0;
+ icmp6_neighbor_discovery_ethernet_link_layer_address_option_t * o0;
+
+ u32 bi0, options_len0, sw_if_index0, next0, error0;
+ u32 is_solicitation = 1, is_dropped = 0;
+ u32 is_unspecified, is_link_local;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (p0);
+ h0 = ip6_next_header (ip0);
+ options_len0 = clib_net_to_host_u16 (ip0->payload_length) - sizeof (h0[0]);
+ is_unspecified = ip6_address_is_unspecified (&ip0->src_address);
+ is_link_local = ip6_address_is_link_local_unicast (&ip0->src_address);
+
+ error0 = ICMP6_ERROR_NONE;
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+
+	  /* check if solicitation (internally generated packets from the timer process carry an unspecified dst address) */
+ if (ip6_address_is_unspecified (&ip0->dst_address))
+ is_solicitation = 0;
+
+ /* Check that source address is unspecified, link-local or else on-link. */
+ if (!is_unspecified && !is_link_local)
+ {
+ u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0);
+ ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0);
+
+ error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0
+ || (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP
+ && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE))
+ ? ICMP6_ERROR_ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK
+ : error0);
+ }
+
+ /* check for source LL option and process */
+ o0 = (void *) (h0 + 1);
+ o0 = ((options_len0 == 8
+ && o0->header.type == option_type
+ && o0->header.n_data_u64s == 1)
+ ? o0
+ : 0);
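+	  /* (option lengths are in 8-byte units, so a bare source
+	     link-layer-address option is exactly one unit) */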
+
+	  /* if src address is unspecified or link-local, IGNORE any options */
+ if (PREDICT_TRUE (error0 == ICMP6_ERROR_NONE && o0 != 0 &&
+ !is_unspecified && !is_link_local)) {
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ if (nm->limit_neighbor_cache_size &&
+ pool_elts (nm->neighbor_pool) >= nm->limit_neighbor_cache_size)
+ unset_random_neighbor_entry();
+
+ vnet_set_ip6_ethernet_neighbor (vm, sw_if_index0,
+ &ip0->src_address,
+ o0->ethernet_address,
+ sizeof (o0->ethernet_address));
+ }
+
+ /* default is to drop */
+ next0 = ICMP6_ROUTER_SOLICITATION_NEXT_DROP;
+
+ if (error0 == ICMP6_ERROR_NONE)
+ {
+ vnet_sw_interface_t * sw_if0;
+ ethernet_interface_t * eth_if0;
+ u32 adj_index0;
+
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index0);
+ ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+
+ /* only support ethernet interface type for now */
+ error0 = (!eth_if0) ? ICMP6_ERROR_ROUTER_SOLICITATION_UNSUPPORTED_INTF : error0;
+
+ if (error0 == ICMP6_ERROR_NONE)
+ {
+ u32 ri;
+
+		      /* adjust the size of the buffer to include just the ipv6 header */
+ p0->current_length -= (options_len0 + sizeof(icmp6_neighbor_discovery_header_t));
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index0, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index0];
+
+ if(ri != ~0)
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ error0 = ((!radv_info) ? ICMP6_ERROR_ROUTER_SOLICITATION_RADV_NOT_CONFIG : error0);
+
+ if (error0 == ICMP6_ERROR_NONE)
+ {
+ f64 now = vlib_time_now (vm);
+
+ /* for solicited adverts - need to rate limit */
+ if(is_solicitation)
+ {
+ if( (now - radv_info->last_radv_time) < MIN_DELAY_BETWEEN_RAS )
+ is_dropped = 1;
+ else
+ radv_info->last_radv_time = now;
+ }
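+		      /* (RFC 4861 requires consecutive RAs on an interface
+			 to be spaced at least MIN_DELAY_BETWEEN_RAS = 3
+			 seconds apart) */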
+
+ /* send now */
+ icmp6_router_advertisement_header_t rh;
+
+ rh.icmp.type = ICMP6_router_advertisement;
+ rh.icmp.code = 0;
+ rh.icmp.checksum = 0;
+
+ rh.current_hop_limit = radv_info->curr_hop_limit;
+ rh.router_lifetime_in_sec = clib_host_to_net_u16(radv_info->adv_router_lifetime_in_sec);
+ rh.time_in_msec_between_retransmitted_neighbor_solicitations =
+ clib_host_to_net_u32(radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations);
+ rh.neighbor_reachable_time_in_msec =
+ clib_host_to_net_u32(radv_info->adv_neighbor_reachable_time_in_msec);
+
+ rh.flags = (radv_info->adv_managed_flag) ? ICMP6_ROUTER_DISCOVERY_FLAG_ADDRESS_CONFIG_VIA_DHCP : 0;
+ rh.flags |= ( (radv_info->adv_other_flag) ? ICMP6_ROUTER_DISCOVERY_FLAG_OTHER_CONFIG_VIA_DHCP : 0);
+
+
+ u16 payload_length = sizeof(icmp6_router_advertisement_header_t);
+
+ vlib_buffer_add_data (vm,
+ p0->free_list_index,
+ bi0,
+ (void *)&rh, sizeof(icmp6_router_advertisement_header_t));
+
+ if(radv_info->adv_link_layer_address)
+ {
+ icmp6_neighbor_discovery_ethernet_link_layer_address_option_t h;
+
+ h.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address;
+ h.header.n_data_u64s = 1;
+
+ /* copy ll address */
+ memcpy(&h.ethernet_address[0], eth_if0->address, 6);
+
+ vlib_buffer_add_data (vm,
+ p0->free_list_index,
+ bi0,
+ (void *)&h, sizeof(icmp6_neighbor_discovery_ethernet_link_layer_address_option_t));
+
+ payload_length += sizeof(icmp6_neighbor_discovery_ethernet_link_layer_address_option_t);
+ }
+
+ /* add MTU option */
+ if(radv_info->adv_link_mtu)
+ {
+ icmp6_neighbor_discovery_mtu_option_t h;
+
+ h.unused = 0;
+ h.mtu = clib_host_to_net_u32(radv_info->adv_link_mtu);
+ h.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_mtu;
+ h.header.n_data_u64s = 1;
+
+ payload_length += sizeof( icmp6_neighbor_discovery_mtu_option_t);
+
+ vlib_buffer_add_data (vm,
+ p0->free_list_index,
+ bi0,
+ (void *)&h, sizeof(icmp6_neighbor_discovery_mtu_option_t));
+ }
+
+ /* add advertised prefix options */
+ ip6_radv_prefix_t *pr_info;
+
+ pool_foreach (pr_info, radv_info->adv_prefixes_pool, ({
+
+ if(pr_info->enabled &&
+ (!pr_info->decrement_lifetime_flag || (pr_info->pref_lifetime_expires >0)))
+ {
+ /* advertise this prefix */
+ icmp6_neighbor_discovery_prefix_information_option_t h;
+
+ h.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_prefix_information;
+ h.header.n_data_u64s = (sizeof(icmp6_neighbor_discovery_prefix_information_option_t) >> 3);
+
+ h.dst_address_length = pr_info->prefix_len;
+
+ h.flags = (pr_info->adv_on_link_flag) ? ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_FLAG_ON_LINK : 0;
+ h.flags |= (pr_info->adv_autonomous_flag) ? ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_AUTO : 0;
+
+ if(radv_info->cease_radv && pr_info->deprecated_prefix_flag)
+ {
+ h.valid_time = clib_host_to_net_u32(MIN_ADV_VALID_LIFETIME);
+ h.preferred_time = 0;
+ }
+ else
+ {
+ if(pr_info->decrement_lifetime_flag)
+ {
+ pr_info->adv_valid_lifetime_in_secs = ((pr_info->valid_lifetime_expires > now)) ?
+ (pr_info->valid_lifetime_expires - now) : 0;
+
+ pr_info->adv_pref_lifetime_in_secs = ((pr_info->pref_lifetime_expires > now)) ?
+ (pr_info->pref_lifetime_expires - now) : 0;
+ }
+
+ h.valid_time = clib_host_to_net_u32(pr_info->adv_valid_lifetime_in_secs);
+			 h.preferred_time = clib_host_to_net_u32(pr_info->adv_pref_lifetime_in_secs);
+ }
+ h.unused = 0;
+
+ memcpy(&h.dst_address, &pr_info->prefix, sizeof(ip6_address_t));
+
+ payload_length += sizeof( icmp6_neighbor_discovery_prefix_information_option_t);
+
+ vlib_buffer_add_data (vm,
+ p0->free_list_index,
+ bi0,
+ (void *)&h, sizeof(icmp6_neighbor_discovery_prefix_information_option_t));
+
+ }
+ }));
+
+ /* add additional options before here */
+
+ /* finish building the router advertisement... */
+ if(!is_unspecified && radv_info->send_unicast)
+ {
+ ip0->dst_address = ip0->src_address;
+ }
+ else
+ {
+ /* target address is all-nodes mcast addr */
+ ip6_set_reserved_multicast_address(&ip0->dst_address,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_all_hosts);
+ }
+
+ /* source address MUST be the link-local address */
+ ip0->src_address = radv_info->link_local_address;
+
+ ip0->hop_limit = 255;
+ ip0->payload_length = clib_host_to_net_u16 (payload_length);
+
+ icmp6_router_advertisement_header_t * rh0 = (icmp6_router_advertisement_header_t *)(ip0 + 1);
+ rh0->icmp.checksum =
+ ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0,
+ &bogus_length);
+ ASSERT(bogus_length == 0);
+
+		      /* setup output interface and adjacency */
+ vnet_buffer (p0)->sw_if_index[VLIB_RX] =
+ vnet_main.local_interface_sw_if_index;
+
+ if (is_solicitation)
+ {
+ ethernet_header_t *eth0;
+ /* Reuse current MAC header, copy SMAC to DMAC and
+ * interface MAC to SMAC */
+ vlib_buffer_reset (p0);
+ eth0 = vlib_buffer_get_current(p0);
+ memcpy(eth0->dst_address, eth0->src_address, 6);
+ memcpy(eth0->src_address, eth_if0->address, 6);
+ next0 = is_dropped ?
+ next0 : ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_TX;
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ }
+ else
+ {
+ adj_index0 = radv_info->all_nodes_adj_index;
+ if (adj_index0 == 0)
+ error0 = ICMP6_ERROR_DST_LOOKUP_MISS;
+ else
+ {
+ ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, adj_index0);
+ error0 =
+ ((adj0->rewrite_header.sw_if_index != sw_if_index0
+ || adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE)
+ ? ICMP6_ERROR_ROUTER_SOLICITATION_DEST_UNKNOWN
+ : error0);
+ next0 = is_dropped ?
+ next0 : ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_RW;
+ vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0;
+ }
+ }
+
+ radv_info->n_solicitations_dropped += is_dropped;
+ radv_info->n_solicitations_rcvd += is_solicitation;
+
+ if((error0 == ICMP6_ERROR_NONE) && !is_dropped)
+ {
+ radv_info->n_advertisements_sent++;
+ n_advertisements_sent++;
+ }
+ }
+ }
+ }
+
+ p0->error = error_node->errors[error0];
+
+ if(error0 != ICMP6_ERROR_NONE)
+ vlib_error_count (vm, error_node->node_index, error0, 1);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ /* Account for router advertisements sent. */
+ vlib_error_count (vm, error_node->node_index, ICMP6_ERROR_ROUTER_ADVERTISEMENTS_TX, n_advertisements_sent);
+
+ return frame->n_vectors;
+}
+
+ /* validate advertised info for consistency (see RFC 4861 section 6.2.7) - log any inconsistencies; the packet is always dropped */
+static_always_inline uword
+icmp6_router_advertisement(vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next_index;
+ u32 n_advertisements_rcvd = 0;
+
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_icmp_input_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next_index = node->cached_next_index;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+ /* stride */ 1,
+ sizeof (icmp6_input_trace_t));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+ ip6_radv_t *radv_info = 0;
+ icmp6_router_advertisement_header_t * h0;
+ u32 bi0, options_len0, sw_if_index0, next0, error0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (p0);
+ h0 = ip6_next_header (ip0);
+ options_len0 = clib_net_to_host_u16 (ip0->payload_length) - sizeof (h0[0]);
+
+ error0 = ICMP6_ERROR_NONE;
+ sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+
+ /* Check that source address is link-local*/
+ error0 = (!ip6_address_is_link_local_unicast (&ip0->src_address)) ?
+ ICMP6_ERROR_ROUTER_ADVERTISEMENT_SOURCE_NOT_LINK_LOCAL : error0;
+
+ /* default is to drop */
+ next0 = ICMP6_ROUTER_SOLICITATION_NEXT_DROP;
+
+ n_advertisements_rcvd++;
+
+ if (error0 == ICMP6_ERROR_NONE)
+ {
+ vnet_sw_interface_t * sw_if0;
+ ethernet_interface_t * eth_if0;
+
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index0);
+ ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+
+ /* only support ethernet interface type for now */
+ error0 = (!eth_if0) ? ICMP6_ERROR_ROUTER_SOLICITATION_UNSUPPORTED_INTF : error0;
+
+ if (error0 == ICMP6_ERROR_NONE)
+ {
+ u32 ri;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index0, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index0];
+
+ if(ri != ~0)
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ error0 = ((!radv_info) ? ICMP6_ERROR_ROUTER_SOLICITATION_RADV_NOT_CONFIG : error0);
+
+ if (error0 == ICMP6_ERROR_NONE)
+ {
+ /* validate advertised information */
+ if((h0->current_hop_limit && radv_info->curr_hop_limit) &&
+ (h0->current_hop_limit != radv_info->curr_hop_limit))
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our AdvCurHopLimit on %U doesn't agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ }
+
+ if((h0->flags & ICMP6_ROUTER_DISCOVERY_FLAG_ADDRESS_CONFIG_VIA_DHCP) !=
+ radv_info->adv_managed_flag)
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our AdvManagedFlag on %U doesn't agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ }
+
+ if((h0->flags & ICMP6_ROUTER_DISCOVERY_FLAG_OTHER_CONFIG_VIA_DHCP) !=
+ radv_info->adv_other_flag)
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our AdvOtherConfigFlag on %U doesn't agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ }
+
+ if((h0->time_in_msec_between_retransmitted_neighbor_solicitations &&
+ radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations) &&
+ (h0->time_in_msec_between_retransmitted_neighbor_solicitations !=
+ clib_host_to_net_u32(radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations)))
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our AdvRetransTimer on %U doesn't agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ }
+
+ if((h0->neighbor_reachable_time_in_msec &&
+ radv_info->adv_neighbor_reachable_time_in_msec) &&
+ (h0->neighbor_reachable_time_in_msec !=
+ clib_host_to_net_u32(radv_info->adv_neighbor_reachable_time_in_msec)))
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our AdvReachableTime on %U doesn't agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ }
+
+ /* check for MTU or prefix options or .. */
+ u8 * opt_hdr = (u8 *)(h0 + 1);
+ while( options_len0 > 0)
+ {
+ icmp6_neighbor_discovery_option_header_t *o0 = ( icmp6_neighbor_discovery_option_header_t *)opt_hdr;
+ int opt_len = o0->n_data_u64s << 3;
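+		      /* n_data_u64s counts 8-byte units */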
+ icmp6_neighbor_discovery_option_type_t option_type = o0->type;
+
+ if(options_len0 < 2)
+ {
+ ip6_neighbor_syslog(vm, LOG_ERR,
+ "malformed RA packet on %U from %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ break;
+ }
+
+ if(opt_len == 0)
+ {
+ ip6_neighbor_syslog(vm, LOG_ERR,
+ " zero length option in RA on %U from %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ break;
+ }
+ else if( opt_len > options_len0)
+ {
+ ip6_neighbor_syslog(vm, LOG_ERR,
+ "option length in RA packet greater than total length on %U from %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ break;
+ }
+
+ options_len0 -= opt_len;
+ opt_hdr += opt_len;
+
+ switch(option_type)
+ {
+ case ICMP6_NEIGHBOR_DISCOVERY_OPTION_mtu:
+ {
+ icmp6_neighbor_discovery_mtu_option_t *h =
+ (icmp6_neighbor_discovery_mtu_option_t *)(o0);
+
+ if(opt_len < sizeof(*h))
+ break;
+
+ if((h->mtu && radv_info->adv_link_mtu) &&
+ (h->mtu != clib_host_to_net_u32(radv_info->adv_link_mtu)))
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our AdvLinkMTU on %U doesn't agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address);
+ }
+ }
+ break;
+
+ case ICMP6_NEIGHBOR_DISCOVERY_OPTION_prefix_information:
+ {
+ icmp6_neighbor_discovery_prefix_information_option_t *h =
+ (icmp6_neighbor_discovery_prefix_information_option_t *)(o0);
+
+ /* validate advertised prefix options */
+ ip6_radv_prefix_t *pr_info;
+ u32 preferred, valid;
+
+ if(opt_len < sizeof(*h))
+ break;
+
+ preferred = clib_net_to_host_u32(h->preferred_time);
+ valid = clib_net_to_host_u32(h->valid_time);
+
+			   /* look for matching prefix - if we are advertising it, it had better be consistent */
+ pool_foreach (pr_info, radv_info->adv_prefixes_pool, ({
+
+ ip6_address_t mask;
+ ip6_address_mask_from_width(&mask, pr_info->prefix_len);
+
+ if(pr_info->enabled &&
+ (pr_info->prefix_len == h->dst_address_length) &&
+ ip6_address_is_equal_masked (&pr_info->prefix, &h->dst_address, &mask))
+ {
+ /* found it */
+ if(!pr_info->decrement_lifetime_flag &&
+ valid != pr_info->adv_valid_lifetime_in_secs)
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our ADV validlifetime on %U for %U does not agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0,format_ip6_address, &pr_info->prefix,
+ format_ip6_address, &h->dst_address);
+ }
+ if(!pr_info->decrement_lifetime_flag &&
+ preferred != pr_info->adv_pref_lifetime_in_secs)
+ {
+ ip6_neighbor_syslog(vm, LOG_WARNING,
+ "our ADV preferredlifetime on %U for %U does not agree with %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index0,format_ip6_address, &pr_info->prefix,
+ format_ip6_address, &h->dst_address);
+ }
+ }
+ break;
+ }));
+ break;
+ }
+ default:
+ /* skip this one */
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ p0->error = error_node->errors[error0];
+
+ if(error0 != ICMP6_ERROR_NONE)
+ vlib_error_count (vm, error_node->node_index, error0, 1);
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+  /* Account for router advertisements received. */
+ vlib_error_count (vm, error_node->node_index, ICMP6_ERROR_ROUTER_ADVERTISEMENTS_RX, n_advertisements_rcvd);
+
+ return frame->n_vectors;
+}
+
+/* create and initialize router advertisement parameters with default values for this intfc */
+static u32
+ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 is_add)
+{
+ ip6_main_t * im = &ip6_main;
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip6_radv_t * a= 0;
+  u32 ri = ~0;
+ vnet_sw_interface_t * sw_if0;
+ ethernet_interface_t * eth_if0 = 0;
+
+ /* lookup radv container - ethernet interfaces only */
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index);
+ if(sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE)
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+
+ if(!eth_if0)
+ return ri;
+
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ if(ri != ~0)
+ {
+ a = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ if(!is_add)
+ {
+ u32 i, * to_delete = 0;
+ ip6_radv_prefix_t *p;
+ ip6_mldp_group_t *m;
+
+ /* remove adjacencies */
+ ip_del_adjacency (lm, a->all_nodes_adj_index);
+ ip_del_adjacency (lm, a->all_routers_adj_index);
+ ip_del_adjacency (lm, a->all_mldv2_routers_adj_index);
+
+ /* clean up prefix_pool */
+ pool_foreach (p, a->adv_prefixes_pool, ({
+ vec_add1 (to_delete, p - a->adv_prefixes_pool);
+ }));
+
+ for (i = 0; i < vec_len (to_delete); i++)
+ {
+ p = pool_elt_at_index (a->adv_prefixes_pool, to_delete[i]);
+ mhash_unset (&a->address_to_prefix_index, &p->prefix, 0);
+ pool_put (a->adv_prefixes_pool, p);
+ }
+
+ vec_free (to_delete);
+ to_delete = 0;
+
+ /* clean up mldp group pool */
+ pool_foreach (m, a->mldp_group_pool, ({
+ vec_add1 (to_delete, m - a->mldp_group_pool);
+ }));
+
+ for (i = 0; i < vec_len (to_delete); i++)
+ {
+ m = pool_elt_at_index (a->mldp_group_pool, to_delete[i]);
+ mhash_unset (&a->address_to_mldp_index, &m->mcast_address, 0);
+ pool_put (a->mldp_group_pool, m);
+ }
+
+ vec_free (to_delete);
+
+ pool_put (nm->if_radv_pool, a);
+ nm->if_radv_pool_index_by_sw_if_index[sw_if_index] = ~0;
+ ri = ~0;
+ }
+ }
+ else
+ {
+ if(is_add)
+ {
+ vnet_hw_interface_t * hw_if0;
+
+ hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ pool_get (nm->if_radv_pool, a);
+
+ ri = a - nm->if_radv_pool;
+ nm->if_radv_pool_index_by_sw_if_index[sw_if_index] = ri;
+
+ /* initialize default values (most of which are zero) */
+ memset (a, 0, sizeof (a[0]));
+
+ a->sw_if_index = sw_if_index;
+ a->fib_index = ~0;
+ a->max_radv_interval = DEF_MAX_RADV_INTERVAL;
+ a->min_radv_interval = DEF_MIN_RADV_INTERVAL;
+ a->curr_hop_limit = DEF_CURR_HOP_LIMIT;
+ a->adv_router_lifetime_in_sec = DEF_DEF_RTR_LIFETIME;
+
+ a->adv_link_layer_address = 1; /* send ll address source address option */
+
+ a->min_delay_between_radv = MIN_DELAY_BETWEEN_RAS;
+ a->max_delay_between_radv = MAX_DELAY_BETWEEN_RAS;
+ a->max_rtr_default_lifetime = MAX_DEF_RTR_LIFETIME;
+ a->seed = random_default_seed();
+
+ /* for generating random interface ids */
+ a->randomizer = 0x1119194911191949;
+ a->randomizer = random_u64 ((u32 *)&a->randomizer);
+
+      a->initial_adverts_count = MAX_INITIAL_RTR_ADVERTISEMENTS;
+ a->initial_adverts_sent = a->initial_adverts_count-1;
+ a->initial_adverts_interval = MAX_INITIAL_RTR_ADVERT_INTERVAL;
+
+      /* default is to send */
+ a->send_radv = 1;
+
+ /* fill in radv_info for this interface that will be needed later */
+ a->adv_link_mtu = hw_if0->max_l3_packet_bytes[VLIB_RX];
+
+ memcpy (a->link_layer_address, eth_if0->address, 6);
+
+ /* fill in default link-local address (this may be overridden) */
+ ip6_link_local_address_from_ethernet_address (&a->link_local_address, eth_if0->address);
+ a->link_local_prefix_len = 64;
+
+ mhash_init (&a->address_to_prefix_index, sizeof (uword), sizeof (ip6_address_t));
+ mhash_init (&a->address_to_mldp_index, sizeof (uword), sizeof (ip6_address_t));
+
+ {
+ ip_adjacency_t *adj;
+ u8 link_layer_address[6] =
+ {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_all_hosts};
+
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a->all_nodes_adj_index);
+
+ adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+ adj->if_address_index = ~0;
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_IP6,
+ sw_if_index,
+ ip6_rewrite_node.index,
+ link_layer_address,
+ &adj->rewrite_header,
+ sizeof (adj->rewrite_data));
+ }
+
+ {
+ ip_adjacency_t *adj;
+ u8 link_layer_address[6] =
+ {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_all_routers};
+
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a->all_routers_adj_index);
+
+ adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+ adj->if_address_index = ~0;
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_IP6,
+ sw_if_index,
+ ip6_rewrite_node.index,
+ link_layer_address,
+ &adj->rewrite_header,
+ sizeof (adj->rewrite_data));
+ }
+
+ {
+ ip_adjacency_t *adj;
+ u8 link_layer_address[6] =
+ {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_mldv2_routers};
+
+ adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1,
+ &a->all_mldv2_routers_adj_index);
+
+ adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+ adj->if_address_index = ~0;
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_IP6,
+ sw_if_index,
+ ip6_rewrite_node.index,
+ link_layer_address,
+ &adj->rewrite_header,
+ sizeof (adj->rewrite_data));
+ }
+
+ /* add multicast groups we will always be reporting */
+ ip6_address_t addr;
+ ip6_mldp_group_t *mcast_group_info;
+
+ ip6_set_reserved_multicast_address (&addr,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_all_hosts);
+
+ /* lookup mldp info for this interface */
+
+ uword * p = mhash_get (&a->address_to_mldp_index, &addr);
+ mcast_group_info = p ? pool_elt_at_index (a->mldp_group_pool, p[0]) : 0;
+
+ /* add address */
+ if(!mcast_group_info)
+ {
+ /* add */
+ u32 mi;
+ pool_get (a->mldp_group_pool, mcast_group_info);
+
+ mi = mcast_group_info - a->mldp_group_pool;
+ mhash_set (&a->address_to_mldp_index, &addr, mi, /* old_value */ 0);
+
+ mcast_group_info->type = 4;
+ mcast_group_info->mcast_source_address_pool = 0;
+ mcast_group_info->num_sources = 0;
+ memcpy(&mcast_group_info->mcast_address, &addr, sizeof(ip6_address_t));
+ }
+
+ ip6_set_reserved_multicast_address (&addr,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_all_routers);
+
+ p = mhash_get (&a->address_to_mldp_index, &addr);
+ mcast_group_info = p ? pool_elt_at_index (a->mldp_group_pool, p[0]) : 0;
+
+ if(!mcast_group_info)
+ {
+ /* add */
+ u32 mi;
+ pool_get (a->mldp_group_pool, mcast_group_info);
+
+ mi = mcast_group_info - a->mldp_group_pool;
+ mhash_set (&a->address_to_mldp_index, &addr, mi, /* old_value */ 0);
+
+ mcast_group_info->type = 4;
+ mcast_group_info->mcast_source_address_pool = 0;
+ mcast_group_info->num_sources = 0;
+ memcpy(&mcast_group_info->mcast_address, &addr, sizeof(ip6_address_t));
+ }
+
+ ip6_set_reserved_multicast_address (&addr,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_mldv2_routers);
+
+ p = mhash_get (&a->address_to_mldp_index, &addr);
+ mcast_group_info = p ? pool_elt_at_index (a->mldp_group_pool, p[0]) : 0;
+
+ if(!mcast_group_info)
+ {
+ /* add */
+ u32 mi;
+ pool_get (a->mldp_group_pool, mcast_group_info);
+
+ mi = mcast_group_info - a->mldp_group_pool;
+ mhash_set (&a->address_to_mldp_index, &addr, mi, /* old_value */ 0);
+
+ mcast_group_info->type = 4;
+ mcast_group_info->mcast_source_address_pool = 0;
+ mcast_group_info->num_sources = 0;
+ memcpy(&mcast_group_info->mcast_address, &addr, sizeof(ip6_address_t));
+ }
+ }
+ }
+ return ri;
+}
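+
+/* The three multicast-group additions above share one shape; a helper along
+ * these lines (a sketch only, name made up, not part of this change) would
+ * remove the repetition:
+ *
+ *   static void
+ *   ip6_neighbor_add_mld_group (ip6_radv_t * a, u32 group_id)
+ *   {
+ *     ip6_address_t addr;
+ *     ip6_mldp_group_t *g;
+ *     uword *p;
+ *
+ *     ip6_set_reserved_multicast_address (&addr,
+ *                                         IP6_MULTICAST_SCOPE_link_local,
+ *                                         group_id);
+ *     p = mhash_get (&a->address_to_mldp_index, &addr);
+ *     if (p)
+ *       return;
+ *     pool_get (a->mldp_group_pool, g);
+ *     mhash_set (&a->address_to_mldp_index, &addr,
+ *                g - a->mldp_group_pool, 0);
+ *     g->type = 4;                       (MLDv2 CHANGE_TO_EXCLUDE record)
+ *     g->mcast_source_address_pool = 0;
+ *     g->num_sources = 0;
+ *     memcpy (&g->mcast_address, &addr, sizeof (ip6_address_t));
+ *   }
+ */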
+
+/* send an mldpv2 report */
+static void
+ip6_neighbor_send_mldpv2_report(u32 sw_if_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_main_t * vm = vnm->vlib_main;
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ vnet_sw_interface_t * sw_if0;
+ ethernet_interface_t * eth_if0;
+ u32 ri;
+ int bogus_length;
+
+ ip6_radv_t *radv_info;
+ u16 payload_length;
+ vlib_buffer_t * b0;
+ ip6_header_t * ip0;
+ u32 * to_next;
+ vlib_frame_t * f;
+ u32 bo0;
+ u32 n_to_alloc = 1;
+ u32 n_allocated;
+
+ icmp6_multicast_listener_report_header_t *rh0;
+ icmp6_multicast_listener_report_packet_t *rp0;
+
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index);
+ ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+
+ if (!eth_if0 || !vnet_sw_interface_is_admin_up (vnm, sw_if_index))
+ return;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ if(ri == ~0)
+ return;
+
+  /* send report now - build an mldpv2 report packet */
+ n_allocated = vlib_buffer_alloc_from_free_list(vm,
+ &bo0,
+ n_to_alloc,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ if (PREDICT_FALSE(n_allocated == 0))
+ {
+ clib_warning ("buffer allocation failure");
+ return;
+ }
+
+ b0 = vlib_get_buffer (vm, bo0);
+
+  /* size the buffer to cover the fixed mldpv2 report packet header */
+ b0->current_length = sizeof(icmp6_multicast_listener_report_packet_t);
+
+ payload_length = sizeof(icmp6_multicast_listener_report_header_t);
+
+ b0->error = ICMP6_ERROR_NONE;
+
+ rp0 = vlib_buffer_get_current (b0);
+  ip0 = (ip6_header_t *)&rp0->ip;
+  rh0 = (icmp6_multicast_listener_report_header_t *)&rp0->report_hdr;
+
+ memset (rp0 , 0x0, sizeof (icmp6_multicast_listener_report_packet_t));
+
+ ip0->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28);
+
+ ip0->protocol = IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS;
+  /* for DEBUG - the vnet driver doesn't seem to emit router alerts */
+ /* ip0->protocol = IP_PROTOCOL_ICMP6; */
+ ip0->hop_limit = 1;
+
+ rh0->icmp.type = ICMP6_multicast_listener_report_v2;
+
+ /* source address MUST be the link-local address */
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+ ip0->src_address = radv_info->link_local_address;
+
+ /* destination is all mldpv2 routers */
+ ip6_set_reserved_multicast_address(&ip0->dst_address,
+ IP6_MULTICAST_SCOPE_link_local,
+ IP6_MULTICAST_GROUP_ID_mldv2_routers);
+
+ /* add reports here */
+ ip6_mldp_group_t *m;
+ int num_addr_records = 0;
+ icmp6_multicast_address_record_t rr;
+
+ /* fill in the hop-by-hop extension header (router alert) info */
+ rh0->ext_hdr.next_hdr = IP_PROTOCOL_ICMP6;
+ rh0->ext_hdr.n_data_u64s = 0;
+
+ rh0->alert.type = IP6_MLDP_ALERT_TYPE;
+ rh0->alert.len = 2;
+ rh0->alert.value = 0;
+
+ rh0->pad.type = 1;
+ rh0->pad.len = 0;
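+  /* (hop-by-hop router alert per RFC 2711: alert value 0 identifies an MLD
+     message; the PadN option pads the extension header out to 8 bytes) */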
+
+ rh0->icmp.checksum = 0;
+
+ pool_foreach (m, radv_info->mldp_group_pool, ({
+
+ rr.type = m->type;
+ rr.aux_data_len_u32s = 0;
+ rr.num_sources = clib_host_to_net_u16 (m->num_sources);
+ memcpy(&rr.mcast_addr, &m->mcast_address, sizeof(ip6_address_t));
+
+ num_addr_records++;
+
+ vlib_buffer_add_data (vm,
+ b0->free_list_index,
+ bo0,
+ (void *)&rr, sizeof(icmp6_multicast_address_record_t));
+
+ payload_length += sizeof( icmp6_multicast_address_record_t);
+ }));
+
+ rh0->rsvd = 0;
+ rh0->num_addr_records = clib_host_to_net_u16(num_addr_records);
+
+ /* update lengths */
+ ip0->payload_length = clib_host_to_net_u16 (payload_length);
+
+ rh0->icmp.checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0,
+ &bogus_length);
+ ASSERT(bogus_length == 0);
+
+ /*
+ * OK to override w/ no regard for actual FIB, because
+ * ip6-rewrite-local only looks at the adjacency.
+ */
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] =
+ vnet_main.local_interface_sw_if_index;
+
+ vnet_buffer (b0)->ip.adj_index[VLIB_RX] =
+ radv_info->all_mldv2_routers_adj_index;
+
+ vlib_node_t * node = vlib_get_node_by_name (vm, (u8 *) "ip6-rewrite-local");
+
+ f = vlib_get_frame_to_node (vm, node->index);
+ to_next = vlib_frame_vector_args (f);
+ to_next[0] = bo0;
+ f->n_vectors = 1;
+
+ vlib_put_frame_to_node (vm, node->index, f);
+ return;
+}
+
+VLIB_REGISTER_NODE (ip6_icmp_router_solicitation_node,static) = {
+ .function = icmp6_router_solicitation,
+ .name = "icmp6-router-solicitation",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp6_input_trace,
+
+ .n_next_nodes = ICMP6_ROUTER_SOLICITATION_N_NEXT,
+ .next_nodes = {
+ [ICMP6_ROUTER_SOLICITATION_NEXT_DROP] = "error-drop",
+ [ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_RW] = "ip6-rewrite-local",
+ [ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_TX] = "interface-output",
+ },
+};
+
+/* send a RA or update the timer info etc.. */
+static uword
+ip6_neighbor_process_timer_event (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_radv_t *radv_info;
+ vlib_frame_t * f = 0;
+ u32 n_this_frame = 0;
+ u32 n_left_to_next;
+ u32 * to_next;
+ u32 bo0;
+ icmp6_router_solicitation_header_t * h0;
+ vlib_buffer_t * b0;
+ f64 now = vlib_time_now (vm);
+
+ /* Interface ip6 radv info list */
+ pool_foreach (radv_info, nm->if_radv_pool, ({
+
+ if( !vnet_sw_interface_is_admin_up (vnm, radv_info->sw_if_index))
+ {
+ radv_info->initial_adverts_sent = radv_info->initial_adverts_count-1;
+ radv_info->next_multicast_time = now;
+ radv_info->last_multicast_time = now;
+ radv_info->last_radv_time = 0;
+ radv_info->all_routers_mcast = 0;
+ continue;
+ }
+
+ /* Make sure that we've joined the all-routers multicast group */
+ if(!radv_info->all_routers_mcast)
+ {
+	  /* send an MLDPv2 report */
+ ip6_neighbor_send_mldpv2_report(radv_info->sw_if_index);
+ radv_info->all_routers_mcast = 1;
+ }
+
+ /* is it time to send a multicast RA on this interface? */
+ if(radv_info->send_radv && (now >= radv_info->next_multicast_time))
+ {
+ u32 n_to_alloc = 1;
+ u32 n_allocated;
+
+ f64 rfn = (radv_info->max_radv_interval - radv_info->min_radv_interval) *
+ random_f64 (&radv_info->seed) + radv_info->min_radv_interval;
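+	  /* rfn is drawn uniformly from [min_radv_interval, max_radv_interval] */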
+
+ /* multicast send - compute next multicast send time */
+ if( radv_info->initial_adverts_sent > 0)
+ {
+ radv_info->initial_adverts_sent--;
+	      if(rfn > radv_info->initial_adverts_interval)
+		rfn = radv_info->initial_adverts_interval;
+
+ /* check to see if we are ceasing to send */
+ if( radv_info->initial_adverts_sent == 0)
+ if(radv_info->cease_radv)
+ radv_info->send_radv = 0;
+ }
+
+ radv_info->next_multicast_time = rfn + now;
+ radv_info->last_multicast_time = now;
+
+      /* send advert now - build a "solicited" router advert with unspecified source address */
+ n_allocated = vlib_buffer_alloc_from_free_list(vm,
+ &bo0,
+ n_to_alloc,
+ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+ if (PREDICT_FALSE(n_allocated == 0))
+ {
+ clib_warning ("buffer allocation failure");
+ continue;
+ }
+ b0 = vlib_get_buffer (vm, bo0);
+ b0->current_length = sizeof( icmp6_router_solicitation_header_t);
+ b0->error = ICMP6_ERROR_NONE;
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = radv_info->sw_if_index;
+
+ h0 = vlib_buffer_get_current (b0);
+
+ memset (h0, 0, sizeof (icmp6_router_solicitation_header_t));
+
+ h0->ip.ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28);
+ h0->ip.payload_length = clib_host_to_net_u16 (sizeof (icmp6_router_solicitation_header_t)
+ - STRUCT_OFFSET_OF (icmp6_router_solicitation_header_t, neighbor));
+ h0->ip.protocol = IP_PROTOCOL_ICMP6;
+ h0->ip.hop_limit = 255;
+
+      /* set src/dst address as "unspecified"; this marks the packet as internally generated rather than received */
+ h0->ip.src_address.as_u64[0] = 0;
+ h0->ip.src_address.as_u64[1] = 0;
+
+ h0->ip.dst_address.as_u64[0] = 0;
+ h0->ip.dst_address.as_u64[1] = 0;
+
+ h0->neighbor.icmp.type = ICMP6_router_solicitation;
+
+ if (PREDICT_FALSE(f == 0))
+ {
+ f = vlib_get_frame_to_node (vm, ip6_icmp_router_solicitation_node.index);
+ to_next = vlib_frame_vector_args (f);
+ n_left_to_next = VLIB_FRAME_SIZE;
+ n_this_frame = 0;
+ }
+
+ n_this_frame++;
+ n_left_to_next--;
+ to_next[0] = bo0;
+ to_next += 1;
+
+ if (PREDICT_FALSE(n_left_to_next == 0))
+ {
+ f->n_vectors = n_this_frame;
+ vlib_put_frame_to_node (vm, ip6_icmp_router_solicitation_node.index, f);
+ f = 0;
+ }
+ }
+ }));
+
+ if (f)
+ {
+ ASSERT(n_this_frame);
+ f->n_vectors = n_this_frame;
+ vlib_put_frame_to_node (vm, ip6_icmp_router_solicitation_node.index, f);
+ }
+ return 0;
+}
+
+static uword
+ip6_icmp_neighbor_discovery_event_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ uword event_type;
+ ip6_icmp_neighbor_discovery_event_data_t * event_data;
+
+ /* init code here */
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, 1. /* seconds */);
+
+ event_data = vlib_process_get_event_data (vm, &event_type);
+
+ if(!event_data)
+ {
+ /* No events found: timer expired. */
+ /* process interface list and send RAs as appropriate, update timer info */
+ ip6_neighbor_process_timer_event (vm, node, frame);
+ }
+ else
+ {
+ switch (event_type) {
+
+ case ICMP6_ND_EVENT_INIT:
+ break;
+
+ case ~0:
+ break;
+
+ default:
+ ASSERT (0);
+ }
+
+ if (event_data)
+ _vec_len (event_data) = 0;
+ }
+ }
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip6_icmp_router_advertisement_node,static) = {
+ .function = icmp6_router_advertisement,
+ .name = "icmp6-router-advertisement",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp6_input_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+vlib_node_registration_t ip6_icmp_neighbor_discovery_event_node = {
+
+ .function = ip6_icmp_neighbor_discovery_event_process,
+ .name = "ip6-icmp-neighbor-discovery-event-process",
+ .type = VLIB_NODE_TYPE_PROCESS,
+};
+
+static uword
+icmp6_neighbor_solicitation (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return icmp6_neighbor_solicitation_or_advertisement (vm, node, frame, /* is_solicitation */ 1); }
+
+static uword
+icmp6_neighbor_advertisement (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return icmp6_neighbor_solicitation_or_advertisement (vm, node, frame, /* is_solicitation */ 0); }
+
+VLIB_REGISTER_NODE (ip6_icmp_neighbor_solicitation_node,static) = {
+ .function = icmp6_neighbor_solicitation,
+ .name = "icmp6-neighbor-solicitation",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp6_input_trace,
+
+ .n_next_nodes = ICMP6_NEIGHBOR_SOLICITATION_N_NEXT,
+ .next_nodes = {
+ [ICMP6_NEIGHBOR_SOLICITATION_NEXT_DROP] = "error-drop",
+ [ICMP6_NEIGHBOR_SOLICITATION_NEXT_REPLY] = "interface-output",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_icmp_neighbor_advertisement_node,static) = {
+ .function = icmp6_neighbor_advertisement,
+ .name = "icmp6-neighbor-advertisement",
+
+ .vector_size = sizeof (u32),
+
+ .format_trace = format_icmp6_input_trace,
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+};
+
+/* API support functions */
+int
+ip6_neighbor_ra_config(vlib_main_t * vm, u32 sw_if_index,
+ u8 surpress, u8 managed, u8 other,
+ u8 ll_option, u8 send_unicast, u8 cease,
+ u8 use_lifetime, u32 lifetime,
+ u32 initial_count, u32 initial_interval,
+ u32 max_interval, u32 min_interval,
+ u8 is_no)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ int error;
+ u32 ri;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+ error = (ri != ~0) ? 0 : VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ if(!error)
+ {
+
+ ip6_radv_t * radv_info;
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+      if((max_interval != 0) && (min_interval == 0))
+ min_interval = .75 * max_interval;
+
+ max_interval = (max_interval != 0) ? ( (is_no) ? DEF_MAX_RADV_INTERVAL : max_interval) : radv_info->max_radv_interval;
+ min_interval = (min_interval != 0) ? ( (is_no) ? DEF_MIN_RADV_INTERVAL : min_interval) : radv_info->min_radv_interval;
+ lifetime = (use_lifetime != 0) ? ( (is_no) ? DEF_DEF_RTR_LIFETIME : lifetime) : radv_info->adv_router_lifetime_in_sec;
+
+ if(lifetime)
+ {
+ if(lifetime > MAX_DEF_RTR_LIFETIME)
+ lifetime = MAX_DEF_RTR_LIFETIME;
+
+ if(lifetime <= max_interval)
+ return VNET_API_ERROR_INVALID_VALUE;
+ }
+
+ if(min_interval != 0)
+ {
+ if((min_interval > .75 * max_interval) ||
+ (min_interval < 3))
+ return VNET_API_ERROR_INVALID_VALUE;
+ }
+
+ if((initial_count > MAX_INITIAL_RTR_ADVERTISEMENTS) ||
+ (initial_interval > MAX_INITIAL_RTR_ADVERT_INTERVAL))
+ return VNET_API_ERROR_INVALID_VALUE;
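+
+      /* (the checks above mirror RFC 4861's bounds: a non-zero router
+	 lifetime must exceed the maximum RA interval, and the minimum RA
+	 interval must lie in [3, .75 * max]) */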
+
+      /*
+	If a "flag" argument is set: is_no restores the default value,
+	otherwise the value corresponding to the flag is applied.
+	If a "flag" argument is clear, the corresponding value is left unchanged.
+      */
+ radv_info->send_radv = (surpress != 0) ? ( (is_no != 0) ? 1 : 0 ) : radv_info->send_radv;
+ radv_info->adv_managed_flag = ( managed != 0) ? ( (is_no) ? 0 : 1) : radv_info->adv_managed_flag;
+ radv_info->adv_other_flag = (other != 0) ? ( (is_no) ? 0: 1) : radv_info->adv_other_flag;
+ radv_info->adv_link_layer_address = ( ll_option != 0) ? ( (is_no) ? 1 : 0) : radv_info->adv_link_layer_address;
+ radv_info->send_unicast = (send_unicast != 0) ? ( (is_no) ? 0 : 1) : radv_info->send_unicast;
+ radv_info->cease_radv = ( cease != 0) ? ( (is_no) ? 0 : 1) : radv_info->cease_radv;
+
+ radv_info->min_radv_interval = min_interval;
+ radv_info->max_radv_interval = max_interval;
+ radv_info->adv_router_lifetime_in_sec = lifetime;
+
+ radv_info->initial_adverts_count =
+ (initial_count != 0) ? ( (is_no) ? MAX_INITIAL_RTR_ADVERTISEMENTS : initial_count) : radv_info->initial_adverts_count ;
+ radv_info->initial_adverts_interval =
+ (initial_interval != 0) ? ( (is_no) ? MAX_INITIAL_RTR_ADVERT_INTERVAL : initial_interval) : radv_info->initial_adverts_interval;
+
+ /* restart */
+ if((cease != 0) && (is_no))
+	radv_info->send_radv = 1;
+
+ radv_info->initial_adverts_sent = radv_info->initial_adverts_count -1;
+ radv_info->next_multicast_time = vlib_time_now (vm);
+ radv_info->last_multicast_time = vlib_time_now (vm);
+ radv_info->last_radv_time = 0;
+ }
+ return(error);
+}
+
+int
+ip6_neighbor_ra_prefix(vlib_main_t * vm, u32 sw_if_index,
+ ip6_address_t *prefix_addr, u8 prefix_len,
+ u8 use_default, u32 val_lifetime, u32 pref_lifetime,
+ u8 no_advertise, u8 off_link, u8 no_autoconfig, u8 no_onlink,
+ u8 is_no)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ int error;
+
+ u32 ri;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ error = (ri != ~0) ? 0 : VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ if(!error)
+ {
+ f64 now = vlib_time_now (vm);
+ ip6_radv_t * radv_info;
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ /* prefix info add, delete or update */
+ ip6_radv_prefix_t * prefix;
+
+ /* lookup prefix info for this address on this interface */
+ uword * p = mhash_get (&radv_info->address_to_prefix_index, prefix_addr);
+
+ prefix = p ? pool_elt_at_index (radv_info->adv_prefixes_pool, p[0]) : 0;
+
+ if(is_no)
+ {
+ /* delete */
+ if(!prefix)
+ return VNET_API_ERROR_INVALID_VALUE; /* invalid prefix */
+
+ if(prefix->prefix_len != prefix_len)
+ return VNET_API_ERROR_INVALID_VALUE_2;
+
+ /* FIXME - Should the DP do this or the CP ?*/
+ /* do specific delete processing here before returning */
+ /* try to remove from routing table */
+
+ mhash_unset (&radv_info->address_to_prefix_index, prefix_addr,/* old_value */ 0);
+ pool_put (radv_info->adv_prefixes_pool, prefix);
+
+ radv_info->initial_adverts_sent = radv_info->initial_adverts_count -1;
+ radv_info->next_multicast_time = vlib_time_now (vm);
+ radv_info->last_multicast_time = vlib_time_now (vm);
+ radv_info->last_radv_time = 0;
+ return(error);
+ }
+
+ /* adding or changing */
+ if(!prefix)
+ {
+ /* add */
+ u32 pi;
+ pool_get (radv_info->adv_prefixes_pool, prefix);
+ pi = prefix - radv_info->adv_prefixes_pool;
+ mhash_set (&radv_info->address_to_prefix_index, prefix_addr, pi, /* old_value */ 0);
+
+ memset(prefix, 0x0, sizeof(ip6_radv_prefix_t));
+
+ prefix->prefix_len = prefix_len;
+ memcpy(&prefix->prefix, prefix_addr, sizeof(ip6_address_t));
+
+ /* initialize default values */
+ prefix->adv_on_link_flag = 1; /* L bit set */
+ prefix->adv_autonomous_flag = 1; /* A bit set */
+ prefix->adv_valid_lifetime_in_secs = DEF_ADV_VALID_LIFETIME;
+ prefix->adv_pref_lifetime_in_secs = DEF_ADV_PREF_LIFETIME;
+ prefix->enabled = 1;
+ prefix->decrement_lifetime_flag = 1;
+ prefix->deprecated_prefix_flag = 1;
+
+ if(off_link == 0)
+ {
+ /* FIXME - Should the DP do this or the CP ?*/
+ /* insert prefix into routing table as a connected prefix */
+ }
+
+ if(use_default)
+ goto restart;
+ }
+ else
+ {
+
+ if(prefix->prefix_len != prefix_len)
+ return VNET_API_ERROR_INVALID_VALUE_2;
+
+ if(off_link != 0)
+ {
+ /* FIXME - Should the DP do this or the CP ?*/
+ /* remove from routing table if already there */
+ }
+ }
+
+ if((val_lifetime == ~0) || (pref_lifetime == ~0))
+ {
+ prefix->adv_valid_lifetime_in_secs = ~0;
+ prefix->adv_pref_lifetime_in_secs = ~0;
+ prefix->decrement_lifetime_flag = 0;
+ }
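+      /* (~0, i.e. 0xffffffff, encodes an infinite lifetime per RFC 4861,
+	 so lifetime decrementing is switched off) */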
+ else
+ {
+	  prefix->adv_valid_lifetime_in_secs = val_lifetime;
+ prefix->adv_pref_lifetime_in_secs = pref_lifetime;
+ }
+
+ /* copy remaining */
+ prefix->enabled = !(no_advertise != 0);
+ prefix->adv_on_link_flag = !((off_link != 0) || (no_onlink != 0));
+ prefix->adv_autonomous_flag = !(no_autoconfig != 0);
+
+ restart:
+ /* restart */
+ /* fill in the expiration times */
+ prefix->valid_lifetime_expires = now + prefix->adv_valid_lifetime_in_secs;
+ prefix->pref_lifetime_expires = now + prefix->adv_pref_lifetime_in_secs;
+
+ radv_info->initial_adverts_sent = radv_info->initial_adverts_count -1;
+ radv_info->next_multicast_time = vlib_time_now (vm);
+ radv_info->last_multicast_time = vlib_time_now (vm);
+ radv_info->last_radv_time = 0;
+ }
+ return(error);
+}
+
+clib_error_t *
+ip6_neighbor_cmd(vlib_main_t * vm, unformat_input_t * main_input, vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ clib_error_t * error = 0;
+ u8 is_no = 0;
+ u8 surpress = 0, managed = 0, other = 0;
+ u8 surpress_ll_option = 0, send_unicast = 0, cease= 0;
+ u8 use_lifetime = 0;
+ u32 sw_if_index, ra_lifetime = 0, ra_initial_count = 0, ra_initial_interval = 0;
+ u32 ra_max_interval = 0 , ra_min_interval = 0;
+
+ unformat_input_t _line_input, * line_input = &_line_input;
+ vnet_sw_interface_t * sw_if0;
+
+ int add_radv_info = 1;
+ __attribute__((unused)) ip6_radv_t * radv_info = 0;
+ ip6_address_t ip6_addr;
+ u32 addr_len;
+
+
+ /* Get a line of input. */
+ if (! unformat_user (main_input, unformat_line_input, line_input))
+ return 0;
+
+ /* get basic radv info for this interface */
+ if(unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+
+ if (unformat_user (line_input,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ u32 ri;
+ ethernet_interface_t * eth_if0 = 0;
+
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index);
+ if(sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE)
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+
+ if(!eth_if0)
+ {
+ error = clib_error_return (0, "Interface must be of ethernet type");
+ goto done;
+ }
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ if(ri != ~0)
+ {
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+ }
+ else
+ {
+	      error = clib_error_return (0, "unknown interface '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+ else
+ {
+	  error = clib_error_return (0, "invalid interface name '%U'",
+ format_unformat_error, line_input);
+ goto done;
+ }
+ }
+
+ /* get the rest of the command */
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "no"))
+ is_no = 1;
+ else if(unformat (line_input, "prefix %U/%d",
+ unformat_ip6_address, &ip6_addr,
+ &addr_len))
+ {
+ add_radv_info = 0;
+ break;
+ }
+ else if (unformat (line_input, "ra-managed-config-flag"))
+ {
+ managed = 1;
+ break;
+ }
+ else if (unformat (line_input, "ra-other-config-flag"))
+ {
+ other = 1;
+ break;
+ }
+ else if (unformat (line_input, "ra-surpress"))
+ {
+ surpress = 1;
+ break;
+ }
+ else if (unformat (line_input, "ra-surpress-link-layer"))
+ {
+ surpress_ll_option = 1;
+ break;
+ }
+ else if (unformat (line_input, "ra-send-unicast"))
+ {
+ send_unicast = 1;
+ break;
+ }
+ else if (unformat (line_input, "ra-lifetime"))
+ {
+ if (!unformat (line_input, "%d", &ra_lifetime))
+ return(error = unformat_parse_error (line_input));
+ use_lifetime = 1;
+ break;
+ }
+ else if (unformat (line_input, "ra-initial"))
+ {
+ if (!unformat (line_input, "%d %d", &ra_initial_count, &ra_initial_interval))
+ return(error = unformat_parse_error (line_input));
+ break;
+ }
+ else if (unformat (line_input, "ra-interval"))
+ {
+ if (!unformat (line_input, "%d", &ra_max_interval))
+ return(error = unformat_parse_error (line_input));
+
+ if (!unformat (line_input, "%d", &ra_min_interval))
+ ra_min_interval = 0;
+ break;
+ }
+ else if(unformat (line_input, "ra-cease"))
+ {
+ cease = 1;
+ break;
+ }
+ else
+ return(unformat_parse_error (line_input));
+ }
+
+ if(add_radv_info)
+ {
+ ip6_neighbor_ra_config(vm, sw_if_index,
+ surpress, managed, other,
+ surpress_ll_option, send_unicast, cease,
+ use_lifetime, ra_lifetime,
+ ra_initial_count, ra_initial_interval,
+ ra_max_interval, ra_min_interval,
+ is_no);
+ }
+ else
+ {
+ u32 valid_lifetime_in_secs = 0;
+ u32 pref_lifetime_in_secs = 0;
+ u8 use_prefix_default_values = 0;
+ u8 no_advertise = 0;
+ u8 off_link= 0;
+ u8 no_autoconfig = 0;
+ u8 no_onlink= 0;
+
+ /* get the rest of the command */
+ while(unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if(unformat (line_input, "default"))
+ {
+ use_prefix_default_values = 1;
+ break;
+ }
+ else if(unformat (line_input, "infinite"))
+ {
+ valid_lifetime_in_secs = ~0;
+ pref_lifetime_in_secs = ~0;
+ break;
+ }
+ else if(unformat (line_input, "%d %d", &valid_lifetime_in_secs,
+ &pref_lifetime_in_secs))
+ break;
+ else
+ break;
+ }
+
+
+ /* get the rest of the command */
+ while (!use_prefix_default_values &&
+ unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if(unformat (line_input, "no-advertise"))
+ no_advertise = 1;
+ else if(unformat (line_input, "off-link"))
+ off_link = 1;
+ else if(unformat (line_input, "no-autoconfig"))
+ no_autoconfig = 1;
+ else if(unformat (line_input, "no-onlink"))
+ no_onlink = 1;
+ else
+ return(unformat_parse_error (line_input));
+ }
+
+ ip6_neighbor_ra_prefix(vm, sw_if_index,
+ &ip6_addr, addr_len,
+ use_prefix_default_values,
+ valid_lifetime_in_secs,
+ pref_lifetime_in_secs,
+ no_advertise,
+ off_link,
+ no_autoconfig,
+ no_onlink,
+ is_no);
+ }
+
+ unformat_free (line_input);
+
+ done:
+ return error;
+}
+
+static clib_error_t *
+show_ip6_interface_cmd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+
+ sw_if_index = ~0;
+
+ if (unformat_user (input,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ u32 ri;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ if(ri != ~0)
+ {
+ ip_lookup_main_t * lm = &ip6_main.lookup_main;
+ ip6_radv_t * radv_info;
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ vlib_cli_output (vm, "%U is admin %s\n", format_vnet_sw_interface_name, vnm,
+ vnet_get_sw_interface (vnm, sw_if_index),
+ (vnet_sw_interface_is_admin_up (vnm, sw_if_index) ? "up" : "down"));
+
+ u32 ai;
+	  u32 *global_scope = 0, i;
+ ip_interface_address_t * a;
+
+ vec_validate_init_empty (lm->if_address_pool_index_by_sw_if_index, sw_if_index, ~0);
+ ai = lm->if_address_pool_index_by_sw_if_index[sw_if_index];
+
+ while (ai != (u32)~0)
+ {
+ a = pool_elt_at_index(lm->if_address_pool, ai);
+ ip6_address_t * address = ip_interface_address_get_address (lm, a);
+
+ if( ip6_address_is_link_local_unicast (address))
+ vlib_cli_output (vm, "\tIPv6 is enabled, link-local address is %U\n", format_ip6_address,
+ address);
+
+ if((address->as_u8[0] & 0xe0) == 0x20)
+ vec_add1 (global_scope, ai);
+
+ ai = a->next_this_sw_interface;
+ }
+
+ vlib_cli_output (vm, "\tGlobal unicast address(es):\n");
+ for (i = 0; i < vec_len (global_scope); i++)
+ {
+ a = pool_elt_at_index(lm->if_address_pool, global_scope[i]);
+ ip6_address_t * address = ip_interface_address_get_address (lm, a);
+ ip6_address_t mask, subnet;
+
+ subnet = *address;
+ ip6_address_mask_from_width(&mask, a->address_length);
+ ip6_address_mask(&subnet, &mask);
+
+ vlib_cli_output (vm, "\t\t%U, subnet is %U/%d",
+ format_ip6_address, address,
+ format_ip6_address,&subnet,
+ a->address_length);
+ }
+ vec_free (global_scope);
+ vlib_cli_output (vm, "\tJoined group address(es):\n");
+ ip6_mldp_group_t *m;
+ pool_foreach (m, radv_info->mldp_group_pool, ({
+ vlib_cli_output (vm, "\t\t%U\n", format_ip6_address, &m->mcast_address);
+ }));
+
+ vlib_cli_output (vm, "\tAdvertised Prefixes:\n");
+ ip6_radv_prefix_t * p;
+ pool_foreach (p, radv_info->adv_prefixes_pool, ({
+ vlib_cli_output (vm, "\t\tprefix %U, length %d\n",
+ format_ip6_address, &p->prefix, p->prefix_len);
+ }));
+
+ vlib_cli_output (vm, "\tMTU is %d\n", radv_info->adv_link_mtu);
+ vlib_cli_output (vm, "\tICMP error messages are unlimited\n");
+ vlib_cli_output (vm, "\tICMP redirects are disabled\n");
+ vlib_cli_output (vm, "\tICMP unreachables are not sent\n");
+ vlib_cli_output (vm, "\tND DAD is disabled\n");
+ //vlib_cli_output (vm, "\tND reachable time is %d milliseconds\n",);
+ vlib_cli_output (vm, "\tND advertised reachable time is %d\n",
+ radv_info->adv_neighbor_reachable_time_in_msec);
+ vlib_cli_output (vm, "\tND advertised retransmit interval is %d (msec)\n",
+ radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations);
+
+ u32 ra_interval = radv_info->max_radv_interval;
+ u32 ra_interval_min = radv_info->min_radv_interval;
+ vlib_cli_output (vm, "\tND router advertisements are sent every %d seconds (min interval is %d)\n",
+ ra_interval, ra_interval_min);
+ vlib_cli_output (vm, "\tND router advertisements live for %d seconds\n",
+ radv_info->adv_router_lifetime_in_sec);
+	  vlib_cli_output (vm, "\tHosts %s stateless autoconfig for addresses\n",
+			   (radv_info->adv_managed_flag) ? "use" : "don't use");
+ vlib_cli_output (vm, "\tND router advertisements sent %d\n", radv_info->n_advertisements_sent);
+ vlib_cli_output (vm, "\tND router solicitations received %d\n", radv_info->n_solicitations_rcvd);
+ vlib_cli_output (vm, "\tND router solicitations dropped %d\n", radv_info->n_solicitations_dropped);
+ }
+ else
+ {
+	error = clib_error_return (0, "IPv6 not enabled on interface %U",
+				   format_unformat_error, input);
+
+ }
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_ip6_interface_command, static) = {
+ .path = "show ip6 interface",
+ .function = show_ip6_interface_cmd,
+ .short_help = "Show ip6 interface <iface name>",
+};
+
+clib_error_t *
+disable_ip6_interface(vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ clib_error_t * error = 0;
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ u32 ri;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ /* if not created - do nothing */
+ if(ri != ~0)
+ {
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_radv_t * radv_info;
+
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ /* check radv_info ref count for other ip6 addresses on this interface */
+ if(radv_info->ref_count == 0 )
+ {
+ /* essentially "disables" ipv6 on this interface */
+ error = ip6_add_del_interface_address (vm, sw_if_index,
+ &radv_info->link_local_address,
+ radv_info->link_local_prefix_len,
+ 1 /* is_del */);
+
+ ip6_neighbor_sw_interface_add_del (vnm, sw_if_index, 0/* is_add */);
+ }
+ }
+ return error;
+}
+
+int
+ip6_interface_enabled(vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ u32 ri = ~0;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ return ri != ~0;
+}
+
+clib_error_t *
+enable_ip6_interface(vlib_main_t * vm,
+ u32 sw_if_index)
+{
+ clib_error_t * error = 0;
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ u32 ri;
+ int is_add = 1;
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ /* if not created yet */
+ if(ri == ~0)
+ {
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_sw_interface_t * sw_if0;
+
+ sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index);
+ if(sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE)
+ {
+ ethernet_interface_t * eth_if0;
+
+ eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index);
+ if(eth_if0)
+ {
+	      /* create radv_info for this interface; it holds all the info needed for router adverts */
+ ri = ip6_neighbor_sw_interface_add_del (vnm, sw_if_index, is_add);
+
+ if(ri != ~0)
+ {
+ ip6_radv_t * radv_info;
+ ip6_address_t link_local_address;
+
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ ip6_link_local_address_from_ethernet_mac_address (&link_local_address,
+ eth_if0->address);
+
+ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index);
+ if(sw_if0->type == VNET_SW_INTERFACE_TYPE_SUB)
+ {
+ /* make up an interface id */
+ md5_context_t m;
+ u8 digest[16];
+
+ link_local_address.as_u64[0] = radv_info->randomizer;
+
+ md5_init (&m);
+ md5_add (&m, &link_local_address, 16);
+ md5_finish (&m, digest);
+
+ memcpy(&link_local_address, digest, 16);
+
+ radv_info->randomizer = link_local_address.as_u64[0];
+
+ link_local_address.as_u64[0] = clib_host_to_net_u64 (0xFE80000000000000ULL);
+ /* clear u bit */
+ link_local_address.as_u8[8] &= 0xfd;
+ }
+
+ /* essentially "enables" ipv6 on this interface */
+ error = ip6_add_del_interface_address (vm, sw_if_index,
+ &link_local_address, 64 /* address width */,
+ 0 /* is_del */);
+
+ if(error)
+ ip6_neighbor_sw_interface_add_del (vnm, sw_if_index, !is_add);
+ else
+ {
+ radv_info->link_local_address = link_local_address;
+ radv_info->link_local_prefix_len = 64;
+ }
+ }
+ }
+ }
+ }
+ return error;
+}
+
+static clib_error_t *
+enable_ip6_interface_cmd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+
+ sw_if_index = ~0;
+
+ if (unformat_user (input,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+      error = enable_ip6_interface(vm, sw_if_index);
+ }
+ else
+ {
+      error = clib_error_return (0, "unknown interface '%U'",
+				 format_unformat_error, input);
+
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (enable_ip6_interface_command, static) = {
+ .path = "enable ip6 interface",
+ .function = enable_ip6_interface_cmd,
+ .short_help = "enable ip6 interface <iface name>",
+};
+
+static clib_error_t *
+disable_ip6_interface_cmd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+
+ sw_if_index = ~0;
+
+ if (unformat_user (input,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = disable_ip6_interface(vm, sw_if_index);
+ }
+ else
+ {
+ error = clib_error_return (0, "unknown interface\n'",
+ format_unformat_error, input);
+
+ }
+ return error;
+}
+
+VLIB_CLI_COMMAND (disable_ip6_interface_command, static) = {
+ .path = "disable ip6 interface",
+ .function = disable_ip6_interface_cmd,
+ .short_help = "disable ip6 interface <iface name>",
+};
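+
+/* Debug CLI usage sketch (interface name is hypothetical):
+ *   vpp# enable ip6 interface GigabitEthernet0/1/0
+ *   vpp# disable ip6 interface GigabitEthernet0/1/0 */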
+
+VLIB_CLI_COMMAND (ip6_nd_command, static) = {
+ .path = "ip6 nd",
+ .short_help = "Set ip6 neighbor discovery parameters",
+ .function = ip6_neighbor_cmd,
+};
+
+clib_error_t *
+set_ip6_link_local_address(vlib_main_t * vm,
+ u32 sw_if_index,
+ ip6_address_t *address,
+ u8 address_length)
+{
+ clib_error_t * error = 0;
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ u32 ri;
+ ip6_radv_t * radv_info;
+ vnet_main_t * vnm = vnet_get_main();
+
+ if( !ip6_address_is_link_local_unicast (address))
+ {
+ vnm->api_errno = VNET_API_ERROR_ADDRESS_NOT_LINK_LOCAL;
+ return clib_error_return (0, "address not link-local");
+ }
+
+ /* call enable ipv6 */
+ enable_ip6_interface(vm, sw_if_index);
+
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+
+ if(ri != ~0)
+ {
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ /* save the link-local address (overwriting the default) */
+
+ /* delete the old one */
+ error = ip6_add_del_interface_address (vm, sw_if_index,
+ &radv_info->link_local_address,
+ radv_info->link_local_prefix_len /* address width */,
+ 1 /* is_del */);
+
+ if(!error)
+ {
+ /* add the new one */
+ error = ip6_add_del_interface_address (vm, sw_if_index,
+ address ,
+ address_length /* address width */,
+ 0/* is_del */);
+
+ if(!error)
+ {
+ radv_info->link_local_address = *address;
+ radv_info->link_local_prefix_len = address_length;
+ }
+ }
+ }
+ else
+ {
+ vnm->api_errno = VNET_API_ERROR_IP6_NOT_ENABLED;
+ error = clib_error_return (0, "ip6 not enabled for interface",
+ format_unformat_error);
+ }
+ return error;
+}
+
+clib_error_t *
+set_ip6_link_local_address_cmd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index = ~0;
+ ip6_address_t ip6_addr;
+ u32 addr_len = 0;
+
+ if (unformat_user (input,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ /* get the rest of the command */
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if(unformat (input, "%U/%d",
+ unformat_ip6_address, &ip6_addr,
+ &addr_len))
+ break;
+ else
+ return(unformat_parse_error (input));
+ }
+ }
+ else
+ return clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ error = set_ip6_link_local_address(vm,
+ sw_if_index,
+ &ip6_addr,
+ addr_len);
+ return error;
+}
+
+VLIB_CLI_COMMAND (set_ip6_link_local_address_command, static) = {
+ .path = "set ip6 link-local address",
+ .short_help = "Set ip6 interface link-local address <intfc> <address.>",
+ .function = set_ip6_link_local_address_cmd,
+};
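+
+/* Debug CLI usage sketch (interface name is hypothetical):
+ *   vpp# set ip6 link-local address GigabitEthernet0/1/0 fe80::1/64 */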
+
+/* callback when an interface address is added or deleted */
+static void
+ip6_neighbor_add_del_interface_address (ip6_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip6_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_delete)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ u32 ri;
+ vlib_main_t * vm = vnm->vlib_main;
+ ip6_radv_t * radv_info;
+ ip6_address_t a;
+ ip6_mldp_group_t *mcast_group_info;
+
+ /* create solicited-node multicast address for this interface address */
+ ip6_set_solicited_node_multicast_address (&a, 0);
+
+ a.as_u8[0xd] = address->as_u8[0xd];
+ a.as_u8[0xe] = address->as_u8[0xe];
+ a.as_u8[0xf] = address->as_u8[0xf];
+
+ if(!is_delete)
+ {
+ /* try to create radv_info - does nothing if ipv6 already enabled */
+ enable_ip6_interface(vm, sw_if_index);
+
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+ if(ri != ~0)
+ {
+ /* get radv_info */
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ /* add address */
+ if( !ip6_address_is_link_local_unicast (address))
+ radv_info->ref_count++;
+
+ /* look up the MLDP group for this solicited-node address on this interface */
+ uword * p = mhash_get (&radv_info->address_to_mldp_index, &a);
+ mcast_group_info = p ? pool_elt_at_index (radv_info->mldp_group_pool, p[0]) : 0;
+
+ /* add solicited-node multicast address */
+ if(!mcast_group_info)
+ {
+ /* add */
+ u32 mi;
+ pool_get (radv_info->mldp_group_pool, mcast_group_info);
+
+ mi = mcast_group_info - radv_info->mldp_group_pool;
+ mhash_set (&radv_info->address_to_mldp_index, &a, mi, /* old_value */ 0);
+
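+ /* record type 4 = CHANGE_TO_EXCLUDE_MODE in MLDv2 (RFC 3810) */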
+ mcast_group_info->type = 4;
+ mcast_group_info->mcast_source_address_pool = 0;
+ mcast_group_info->num_sources = 0;
+ memcpy(&mcast_group_info->mcast_address, &a, sizeof(ip6_address_t));
+ }
+ }
+ }
+ else
+ {
+
+ /* delete */
+ /* look up the radv_t information for this interface */
+ vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0);
+ ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index];
+ if(ri != ~0)
+ {
+ /* get radv_info */
+ radv_info = pool_elt_at_index (nm->if_radv_pool, ri);
+
+ /* look up the MLDP group for this solicited-node address on this interface */
+ uword * p = mhash_get (&radv_info->address_to_mldp_index, &a);
+ mcast_group_info = p ? pool_elt_at_index (radv_info->mldp_group_pool, p[0]) : 0;
+
+ if(mcast_group_info)
+ {
+ mhash_unset (&radv_info->address_to_mldp_index, &a,/* old_value */ 0);
+ pool_put (radv_info->mldp_group_pool, mcast_group_info);
+ }
+
+ /* if interface up send MLDP "report" */
+ radv_info->all_routers_mcast = 0;
+
+ /* remove address */
+ if( !ip6_address_is_link_local_unicast (address))
+ radv_info->ref_count--;
+ }
+ }
+}
+
+clib_error_t *ip6_set_neighbor_limit (u32 neighbor_limit)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+
+ nm->limit_neighbor_cache_size = neighbor_limit;
+ return 0;
+}
+
+static clib_error_t * ip6_neighbor_init (vlib_main_t * vm)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_main_t * im = &ip6_main;
+
+ mhash_init (&nm->neighbor_index_by_key,
+ /* value size */ sizeof (uword),
+ /* key size */ sizeof (ip6_neighbor_key_t));
+
+ icmp6_register_type (vm, ICMP6_neighbor_solicitation, ip6_icmp_neighbor_solicitation_node.index);
+ icmp6_register_type (vm, ICMP6_neighbor_advertisement, ip6_icmp_neighbor_advertisement_node.index);
+ icmp6_register_type (vm, ICMP6_router_solicitation, ip6_icmp_router_solicitation_node.index);
+ icmp6_register_type (vm, ICMP6_router_advertisement, ip6_icmp_router_advertisement_node.index);
+
+ /* handler node for ip6 neighbor discovery events and timers */
+ vlib_register_node (vm, &ip6_icmp_neighbor_discovery_event_node);
+
+ /* add call backs */
+ ip6_add_del_interface_address_callback_t cb;
+ memset(&cb, 0x0, sizeof(ip6_add_del_interface_address_callback_t));
+
+ /* when an interface address changes... */
+ cb.function = ip6_neighbor_add_del_interface_address;
+ cb.function_opaque = 0;
+ vec_add1 (im->add_del_interface_address_callbacks, cb);
+
+ mhash_init (&nm->pending_resolutions_by_address,
+ /* value size */ sizeof (uword),
+ /* key size */ sizeof (ip6_address_t));
+
+ /* default, configurable */
+ nm->limit_neighbor_cache_size = 50000;
+
+#if 0
+ /* $$$$ Hack fix for today */
+ vec_validate_init_empty
+ (im->discover_neighbor_next_index_by_hw_if_index, 32, 0 /* drop */);
+#endif
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip6_neighbor_init);
+
+
+void vnet_register_ip6_neighbor_resolution_event (vnet_main_t * vnm,
+ void * address_arg,
+ uword node_index,
+ uword type_opaque,
+ uword data)
+{
+ ip6_neighbor_main_t * nm = &ip6_neighbor_main;
+ ip6_address_t * address = address_arg;
+ uword * p;
+ pending_resolution_t * pr;
+
+ pool_get (nm->pending_resolutions, pr);
+
+ pr->next_index = ~0;
+ pr->node_index = node_index;
+ pr->type_opaque = type_opaque;
+ pr->data = data;
+
+ p = mhash_get (&nm->pending_resolutions_by_address, address);
+ if (p)
+ {
+ /* Insert new resolution at the head of the list */
+ pr->next_index = p[0];
+ mhash_unset (&nm->pending_resolutions_by_address, address, 0);
+ }
+
+ mhash_set (&nm->pending_resolutions_by_address, address,
+ pr - nm->pending_resolutions, 0 /* old value */);
+}
+
diff --git a/vnet/vnet/ip/ip6_packet.h b/vnet/vnet/ip/ip6_packet.h
new file mode 100644
index 00000000000..9a52cf72586
--- /dev/null
+++ b/vnet/vnet/ip/ip6_packet.h
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip6/packet.h: ip6 packet format
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip6_packet_h
+#define included_ip6_packet_h
+
+typedef union {
+ u8 as_u8[16];
+ u16 as_u16[8];
+ u32 as_u32[4];
+ u64 as_u64[2];
+ uword as_uword[16 / sizeof (uword)];
+} ip6_address_t;
+
+/* Packed so that the mhash key doesn't include uninitialized pad bytes */
+typedef CLIB_PACKED (struct {
+ /* IP address must be first for ip_interface_address_get_address() to work */
+ ip6_address_t ip6_addr;
+ u32 fib_index;
+}) ip6_address_fib_t;
+
+always_inline void
+ip6_addr_fib_init (ip6_address_fib_t * addr_fib, ip6_address_t * address,
+ u32 fib_index)
+{
+ addr_fib->ip6_addr.as_u64[0] = address->as_u64[0];
+ addr_fib->ip6_addr.as_u64[1] = address->as_u64[1];
+ addr_fib->fib_index = fib_index;
+}
+
+/* Special addresses:
+ unspecified ::/128
+ loopback ::1/128
+ global unicast 2000::/3
+ unique local unicast fc00::/7
+ link local unicast fe80::/10
+ multicast ff00::/8
+ ietf reserved everything else. */
+
+#define foreach_ip6_multicast_address_scope \
+ _ (loopback, 0x1) \
+ _ (link_local, 0x2) \
+ _ (admin_local, 0x4) \
+ _ (site_local, 0x5) \
+ _ (organization_local, 0x8) \
+ _ (global, 0xe)
+
+#define foreach_ip6_multicast_link_local_group_id \
+ _ (all_hosts, 0x1) \
+ _ (all_routers, 0x2) \
+ _ (rip_routers, 0x9) \
+ _ (eigrp_routers, 0xa) \
+ _ (pim_routers, 0xd) \
+ _ (mldv2_routers, 0x16)
+
+typedef enum {
+#define _(f,n) IP6_MULTICAST_SCOPE_##f = n,
+ foreach_ip6_multicast_address_scope
+#undef _
+} ip6_multicast_address_scope_t;
+
+typedef enum {
+#define _(f,n) IP6_MULTICAST_GROUP_ID_##f = n,
+ foreach_ip6_multicast_link_local_group_id
+#undef _
+} ip6_multicast_link_local_group_id_t;
+
+always_inline uword
+ip6_address_is_multicast (ip6_address_t * a)
+{ return a->as_u8[0] == 0xff; }
+
+always_inline void
+ip6_set_reserved_multicast_address (ip6_address_t * a,
+ ip6_multicast_address_scope_t scope,
+ u16 id)
+{
+ a->as_u64[0] = a->as_u64[1] = 0;
+ a->as_u16[0] = clib_host_to_net_u16 (0xff00 | scope);
+ a->as_u16[7] = clib_host_to_net_u16 (id);
+}
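+
+/* Example: scope IP6_MULTICAST_SCOPE_link_local and id
+ * IP6_MULTICAST_GROUP_ID_all_routers yield ff02::2. */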
+
+always_inline void
+ip6_set_solicited_node_multicast_address (ip6_address_t * a, u32 id)
+{
+ /* ff02::1:ffXX:XXXX. */
+ a->as_u64[0] = a->as_u64[1] = 0;
+ a->as_u16[0] = clib_host_to_net_u16 (0xff02);
+ a->as_u8[11] = 1;
+ ASSERT ((id >> 24) == 0);
+ id |= 0xff << 24;
+ a->as_u32[3] = clib_host_to_net_u32 (id);
+}
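+
+/* Example: id 0x56789a (the low 24 bits of a unicast address)
+ * yields ff02::1:ff56:789a. */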
+
+always_inline void
+ip6_link_local_address_from_ethernet_address (ip6_address_t * a, u8 * ethernet_address)
+{
+ a->as_u64[0] = a->as_u64[1] = 0;
+ a->as_u16[0] = clib_host_to_net_u16 (0xfe80);
+ /* Always set locally administered bit (6). */
+ a->as_u8[0x8] = ethernet_address[0] | (1 << 6);
+ a->as_u8[0x9] = ethernet_address[1];
+ a->as_u8[0xa] = ethernet_address[2];
+ a->as_u8[0xb] = 0xff;
+ a->as_u8[0xc] = 0xfe;
+ a->as_u8[0xd] = ethernet_address[3];
+ a->as_u8[0xe] = ethernet_address[4];
+ a->as_u8[0xf] = ethernet_address[5];
+}
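+
+/* Example: as coded (bit 6 of the MAC's first octet set, per the
+ * comment above), MAC 00:11:22:33:44:55 yields
+ * fe80::4011:22ff:fe33:4455. */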
+
+always_inline void
+ip6_multicast_ethernet_address (u8 * ethernet_address, u32 group_id)
+{
+ ethernet_address[0] = 0x33;
+ ethernet_address[1] = 0x33;
+ ethernet_address[2] = ((group_id >> 24) & 0xff);
+ ethernet_address[3] = ((group_id >> 16) & 0xff);
+ ethernet_address[4] = ((group_id >> 8) & 0xff);
+ ethernet_address[5] = ((group_id >> 0) & 0xff);
+}
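+
+/* Example: group_id 0x1 (all-hosts) yields 33:33:00:00:00:01. */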
+
+always_inline uword
+ip6_address_is_equal (ip6_address_t * a, ip6_address_t * b)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (a->as_uword); i++)
+ if (a->as_uword[i] != b->as_uword[i])
+ return 0;
+ return 1;
+}
+
+always_inline uword
+ip6_address_is_equal_masked (ip6_address_t * a, ip6_address_t * b,
+ ip6_address_t * mask)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (a->as_uword); i++)
+ {
+ uword a_masked, b_masked;
+ a_masked = a->as_uword[i] & mask->as_uword[i];
+ b_masked = b->as_uword[i] & mask->as_uword[i];
+
+ if (a_masked != b_masked)
+ return 0;
+ }
+ return 1;
+}
+
+always_inline void
+ip6_address_mask (ip6_address_t * a, ip6_address_t * mask)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (a->as_uword); i++)
+ a->as_uword[i] &= mask->as_uword[i];
+}
+
+always_inline void
+ip6_address_set_zero (ip6_address_t * a)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (a->as_uword); i++)
+ a->as_uword[i] = 0;
+}
+
+always_inline void
+ip6_address_mask_from_width (ip6_address_t * a, u32 width)
+{
+ int i, byte, bit, bitnum;
+ ASSERT (width <= 128);
+ memset (a, 0, sizeof (a[0]));
+ for (i = 0; i < width; i++)
+ {
+ bitnum = (7 - (i & 7));
+ byte = i / 8;
+ bit = 1<<bitnum;
+ a->as_u8[byte] |= bit;
+ }
+}
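+
+/* Example: width 10 yields the fe80::/10 mask, i.e. as_u8[0] = 0xff,
+ * as_u8[1] = 0xc0 and all remaining bytes zero. */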
+
+always_inline uword
+ip6_address_is_zero (ip6_address_t * a)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (a->as_uword); i++)
+ if (a->as_uword[i] != 0)
+ return 0;
+ return 1;
+}
+
+/* Check for unspecified address ::0 */
+always_inline uword
+ip6_address_is_unspecified (ip6_address_t * a)
+{ return ip6_address_is_zero (a); }
+
+/* Check for loopback address ::1 */
+always_inline uword
+ip6_address_is_loopback (ip6_address_t * a)
+{
+ uword is_loopback;
+ u8 save = a->as_u8[15];
+ a->as_u8[15] = save ^ 1;
+ is_loopback = ip6_address_is_zero (a);
+ a->as_u8[15] = save;
+ return is_loopback;
+}
+
+/* Check for link local unicast fe80::/10. */
+always_inline uword
+ip6_address_is_link_local_unicast (ip6_address_t * a)
+{ return a->as_u8[0] == 0xfe && (a->as_u8[1] & 0xc0) == 0x80; }
+
+/* Check for unique local unicast fc00::/7. */
+always_inline uword
+ip6_address_is_local_unicast (ip6_address_t * a)
+{ return (a->as_u8[0] & 0xfe) == 0xfc; }
+
+/* Check for solicited node multicast 0xff02::1:ff00:0/104 */
+always_inline uword
+ip6_is_solicited_node_multicast_address (ip6_address_t * a)
+{
+ return (a->as_u32[0] == clib_host_to_net_u32 (0xff020000)
+ && a->as_u32[1] == 0
+ && a->as_u32[2] == clib_host_to_net_u32 (1)
+ && a->as_u8[12] == 0xff);
+}
+
+typedef struct {
+ /* 4 bit version, 8 bit traffic class and 20 bit flow label. */
+ u32 ip_version_traffic_class_and_flow_label;
+
+ /* Total packet length not including this header (but including
+ any extension headers if present). */
+ u16 payload_length;
+
+ /* Protocol for next header. */
+ u8 protocol;
+
+ /* Hop limit decremented by router at each hop. */
+ u8 hop_limit;
+
+ /* Source and destination address. */
+ ip6_address_t src_address, dst_address;
+} ip6_header_t;
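+
+/* Example: a plain version-6 header with zero traffic class and flow
+ * label carries clib_host_to_net_u32 (0x60000000) in the first word. */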
+
+always_inline void *
+ip6_next_header (ip6_header_t * i)
+{ return (void *) (i + 1); }
+
+always_inline void
+ip6_tcp_reply_x1 (ip6_header_t * ip0, tcp_header_t * tcp0)
+{
+ {
+ ip6_address_t src0, dst0;
+
+ src0 = ip0->src_address;
+ dst0 = ip0->dst_address;
+ ip0->src_address = dst0;
+ ip0->dst_address = src0;
+ }
+
+ {
+ u16 src0, dst0;
+
+ src0 = tcp0->ports.src;
+ dst0 = tcp0->ports.dst;
+ tcp0->ports.src = dst0;
+ tcp0->ports.dst = src0;
+ }
+}
+
+always_inline void
+ip6_tcp_reply_x2 (ip6_header_t * ip0, ip6_header_t * ip1,
+ tcp_header_t * tcp0, tcp_header_t * tcp1)
+{
+ {
+ ip6_address_t src0, dst0, src1, dst1;
+
+ src0 = ip0->src_address;
+ src1 = ip1->src_address;
+ dst0 = ip0->dst_address;
+ dst1 = ip1->dst_address;
+ ip0->src_address = dst0;
+ ip1->src_address = dst1;
+ ip0->dst_address = src0;
+ ip1->dst_address = src1;
+ }
+
+ {
+ u16 src0, dst0, src1, dst1;
+
+ src0 = tcp0->ports.src;
+ src1 = tcp1->ports.src;
+ dst0 = tcp0->ports.dst;
+ dst1 = tcp1->ports.dst;
+ tcp0->ports.src = dst0;
+ tcp1->ports.src = dst1;
+ tcp0->ports.dst = src0;
+ tcp1->ports.dst = src1;
+ }
+}
+
+
+typedef CLIB_PACKED (struct {
+ u8 data;
+}) ip6_pad1_option_t;
+
+typedef CLIB_PACKED (struct {
+ u8 type;
+ u8 len;
+ u8 data[0];
+}) ip6_padN_option_t;
+
+typedef CLIB_PACKED (struct {
+#define IP6_MLDP_ALERT_TYPE 0x5
+ u8 type;
+ u8 len;
+ u16 value;
+}) ip6_router_alert_option_t;
+
+typedef CLIB_PACKED (struct {
+ u8 next_hdr;
+ /* Length of this header plus option data in 8 byte units. */
+ u8 n_data_u64s;
+ u8 data[0];
+}) ip6_hop_by_hop_ext_t;
+
+typedef CLIB_PACKED (struct {
+ u8 next_hdr;
+ u8 rsv;
+ u16 fragment_offset_and_more;
+ u32 identification;
+}) ip6_frag_hdr_t;
+
+#define ip6_frag_hdr_offset(hdr) \
+ (clib_net_to_host_u16((hdr)->fragment_offset_and_more) >> 3)
+
+#define ip6_frag_hdr_more(hdr) \
+ (clib_net_to_host_u16((hdr)->fragment_offset_and_more) & 0x1)
+
+#define ip6_frag_hdr_offset_and_more(offset, more) \
+ clib_host_to_net_u16(((offset) << 3) + !!(more))
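+
+/* Example: ip6_frag_hdr_offset_and_more (160, 1) stores network-order
+ * 0x0501; ip6_frag_hdr_offset() then recovers 160 (in 8-byte units,
+ * i.e. byte offset 1280) and ip6_frag_hdr_more() recovers 1. */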
+
+#endif /* included_ip6_packet_h */
diff --git a/vnet/vnet/ip/ip6_pg.c b/vnet/vnet/ip/ip6_pg.c
new file mode 100644
index 00000000000..2c3852765d4
--- /dev/null
+++ b/vnet/vnet/ip/ip6_pg.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip6_pg: IP v6 packet-generator interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+
+static void
+ip6_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 ip_header_offset = g->start_byte_offset;
+
+ while (n_packets >= 2)
+ {
+ u32 pi0, pi1;
+ vlib_buffer_t * p0, * p1;
+ ip6_header_t * ip0, * ip1;
+
+ pi0 = packets[0];
+ pi1 = packets[1];
+ p0 = vlib_get_buffer (vm, pi0);
+ p1 = vlib_get_buffer (vm, pi1);
+ n_packets -= 2;
+ packets += 2;
+
+ ip0 = (void *) (p0->data + ip_header_offset);
+ ip1 = (void *) (p1->data + ip_header_offset);
+
+ ip0->payload_length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset - sizeof (ip0[0]));
+ ip1->payload_length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p1) - ip_header_offset - sizeof (ip1[0]));
+ }
+
+ while (n_packets >= 1)
+ {
+ u32 pi0;
+ vlib_buffer_t * p0;
+ ip6_header_t * ip0;
+
+ pi0 = packets[0];
+ p0 = vlib_get_buffer (vm, pi0);
+ n_packets -= 1;
+ packets += 1;
+
+ ip0 = (void *) (p0->data + ip_header_offset);
+
+ ip0->payload_length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset - sizeof (ip0[0]));
+ }
+}
+
+typedef struct {
+ pg_edit_t ip_version;
+ pg_edit_t traffic_class;
+ pg_edit_t flow_label;
+ pg_edit_t payload_length;
+ pg_edit_t protocol;
+ pg_edit_t hop_limit;
+ pg_edit_t src_address, dst_address;
+} pg_ip6_header_t;
+
+static inline void
+pg_ip6_header_init (pg_ip6_header_t * p)
+{
+ /* Initialize fields that are not bit fields in the IP header. */
+#define _(f) pg_edit_init (&p->f, ip6_header_t, f);
+ _ (payload_length);
+ _ (hop_limit);
+ _ (protocol);
+ _ (src_address);
+ _ (dst_address);
+#undef _
+
+ /* Initialize bit fields. */
+ pg_edit_init_bitfield (&p->ip_version, ip6_header_t,
+ ip_version_traffic_class_and_flow_label,
+ 28, 4);
+ pg_edit_init_bitfield (&p->traffic_class, ip6_header_t,
+ ip_version_traffic_class_and_flow_label,
+ 20, 8);
+ pg_edit_init_bitfield (&p->flow_label, ip6_header_t,
+ ip_version_traffic_class_and_flow_label,
+ 0, 20);
+}
+
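+/* Parse an ip6 header edit group from a packet-generator stream spec,
+ * e.g. (hypothetical addresses): "udp: 2001:db8::1 -> 2001:db8::2
+ * hop-limit 64". */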
+uword
+unformat_pg_ip6_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_ip6_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (ip6_header_t),
+ &group_index);
+ pg_ip6_header_init (p);
+
+ /* Defaults. */
+ pg_edit_set_fixed (&p->ip_version, 6);
+ pg_edit_set_fixed (&p->traffic_class, 0);
+ pg_edit_set_fixed (&p->flow_label, 0);
+ pg_edit_set_fixed (&p->hop_limit, 64);
+
+ p->payload_length.type = PG_EDIT_UNSPECIFIED;
+
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_pg_edit,
+ unformat_ip_protocol, &p->protocol,
+ unformat_pg_edit,
+ unformat_ip6_address, &p->src_address,
+ unformat_pg_edit,
+ unformat_ip6_address, &p->dst_address))
+ goto error;
+
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "version %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->ip_version))
+ ;
+
+ else if (unformat (input, "traffic-class %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->traffic_class))
+ ;
+
+ else if (unformat (input, "length %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->payload_length))
+ ;
+
+ else if (unformat (input, "hop-limit %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->hop_limit))
+ ;
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ {
+ ip_main_t * im = &ip_main;
+ ip_protocol_t protocol;
+ ip_protocol_info_t * pi;
+
+ pi = 0;
+ if (p->protocol.type == PG_EDIT_FIXED)
+ {
+ protocol = pg_edit_get_value (&p->protocol, PG_EDIT_LO);
+ pi = ip_get_protocol_info (im, protocol);
+ }
+
+ if (pi && pi->unformat_pg_edit
+ && unformat_user (input, pi->unformat_pg_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ if (p->payload_length.type == PG_EDIT_UNSPECIFIED
+ && s->min_packet_bytes == s->max_packet_bytes
+ && group_index + 1 < vec_len (s->edit_groups))
+ {
+ pg_edit_set_fixed (&p->payload_length,
+ pg_edit_group_n_bytes (s, group_index) - sizeof (ip6_header_t));
+ }
+
+ p = pg_get_edit_group (s, group_index);
+ if (p->payload_length.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ g->edit_function = ip6_pg_edit_function;
+ }
+
+ return 1;
+ }
+
+ error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
+
diff --git a/vnet/vnet/ip/ip_checksum.c b/vnet/vnet/ip/ip_checksum.c
new file mode 100644
index 00000000000..23e7889bc7e
--- /dev/null
+++ b/vnet/vnet/ip/ip_checksum.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip4/ip_checksum.c: ip/tcp/udp checksums
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+ip_csum_t
+ip_incremental_checksum (ip_csum_t sum, void * _data, uword n_bytes)
+{
+ uword data = pointer_to_uword (_data);
+ ip_csum_t sum0, sum1;
+
+ sum0 = 0;
+ sum1 = sum;
+
+ /* Align data pointer to 64 bits. */
+#define _(t) \
+do { \
+ if (n_bytes >= sizeof (t) \
+ && sizeof (t) < sizeof (ip_csum_t) \
+ && (data % (2 * sizeof (t))) != 0) \
+ { \
+ sum0 += * uword_to_pointer (data, t *); \
+ data += sizeof (t); \
+ n_bytes -= sizeof (t); \
+ } \
+} while (0)
+
+ _ (u8);
+ _ (u16);
+ if (BITS (ip_csum_t) > 32)
+ _ (u32);
+
+#undef _
+
+ {
+ ip_csum_t * d = uword_to_pointer (data, ip_csum_t *);
+
+ while (n_bytes >= 2 * sizeof (d[0]))
+ {
+ sum0 = ip_csum_with_carry (sum0, d[0]);
+ sum1 = ip_csum_with_carry (sum1, d[1]);
+ d += 2;
+ n_bytes -= 2 * sizeof (d[0]);
+ }
+
+ data = pointer_to_uword (d);
+ }
+
+#define _(t) \
+do { \
+ if (n_bytes >= sizeof (t) && sizeof (t) <= sizeof (ip_csum_t)) \
+ { \
+ sum0 = ip_csum_with_carry (sum0, * uword_to_pointer (data, t *)); \
+ data += sizeof (t); \
+ n_bytes -= sizeof (t); \
+ } \
+} while (0)
+
+ if (BITS (ip_csum_t) > 32)
+ _ (u64);
+ _ (u32);
+ _ (u16);
+ _ (u8);
+
+#undef _
+
+ /* Combine even and odd sums. */
+ sum0 = ip_csum_with_carry (sum0, sum1);
+
+ return sum0;
+}
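+
+/* Usage sketch (assumed caller pattern):
+ *   sum = ip_incremental_checksum (0, data, n_bytes);
+ * the final 16-bit checksum is then ~ip_csum_fold (sum). */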
+
+ip_csum_t
+ip_csum_and_memcpy (ip_csum_t sum, void * dst, void * src, uword n_bytes)
+{
+ uword n_left, n_left_odd;
+ ip_csum_t * dst_even, * src_even;
+ ip_csum_t sum0 = sum, sum1;
+
+ dst_even = uword_to_pointer
+ (pointer_to_uword (dst) &~ (sizeof (sum) - 1),
+ ip_csum_t *);
+ src_even = src;
+
+ n_left = n_bytes;
+ if ((n_left_odd = dst - (void *) dst_even))
+ {
+ u8 * d8 = dst, * s8 = src;
+ uword i, n_copy_odd;
+
+ n_copy_odd = clib_min (n_left, n_left_odd);
+
+ for (i = 0; i < n_copy_odd; i++)
+ d8[i] = s8[i];
+
+ if (n_copy_odd != n_left_odd)
+ return sum0;
+
+ sum0 = ip_csum_with_carry (sum0, dst_even[0]);
+ dst_even += 1;
+ src_even = (void *) (src + n_copy_odd);
+ n_left -= n_left_odd;
+ }
+
+ sum1 = 0;
+ while (n_left >= 2 * sizeof (dst_even[0]))
+ {
+ ip_csum_t dst0, dst1;
+
+ dst0 = clib_mem_unaligned (&src_even[0], ip_csum_t);
+ dst1 = clib_mem_unaligned (&src_even[1], ip_csum_t);
+
+ dst_even[0] = dst0;
+ dst_even[1] = dst1;
+
+ dst_even += 2;
+ src_even += 2;
+ n_left -= 2 * sizeof (dst_even[0]);
+
+ sum0 = ip_csum_with_carry (sum0, dst0);
+ sum1 = ip_csum_with_carry (sum1, dst1);
+ }
+
+ if (n_left >= 1 * sizeof (dst_even[0]))
+ {
+ ip_csum_t dst0;
+
+ dst0 = clib_mem_unaligned (&src_even[0], ip_csum_t);
+
+ dst_even[0] = dst0;
+
+ dst_even += 1;
+ src_even += 1;
+ n_left -= 1 * sizeof (dst_even[0]);
+
+ sum0 = ip_csum_with_carry (sum0, dst0);
+ }
+
+ if (n_left > 0)
+ {
+ u8 * d8 = dst, * s8 = src;
+ uword i;
+ for (i = 0; i < n_left; i++)
+ d8[i] = s8[i];
+ }
+
+ return ip_csum_with_carry (sum0, sum1);
+}
diff --git a/vnet/vnet/ip/ip_frag.c b/vnet/vnet/ip/ip_frag.c
new file mode 100644
index 00000000000..22176187a9c
--- /dev/null
+++ b/vnet/vnet/ip/ip_frag.c
@@ -0,0 +1,449 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+/*
+ * IPv4 and IPv6 fragmentation nodes
+ */
+
+#include "ip_frag.h"
+
+#include <vnet/ip/ip.h>
+
+
+typedef struct {
+ u8 ipv6;
+ u16 header_offset;
+ u16 mtu;
+ u8 next;
+ u16 n_fragments;
+} ip_frag_trace_t;
+
+static u8 * format_ip_frag_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ vlib_node_t * node = va_arg (*args, vlib_node_t *);
+ ip_frag_trace_t * t = va_arg (*args, ip_frag_trace_t *);
+ s = format(s, "IPv%s offset: %u mtu: %u fragments: %u next: %s",
+ t->ipv6?"6":"4",
+ t->header_offset, t->mtu, t->n_fragments, node->next_node_names[t->next]);
+ return s;
+}
+
+static u32 running_fragment_id;
+
+static void
+ip4_frag_do_fragment(vlib_main_t *vm, u32 pi, u32 **buffer, ip_frag_error_t *error)
+{
+ vlib_buffer_t *p;
+ ip4_header_t *ip4;
+ u16 mtu, ptr, len, max, rem,
+ offset, ip_frag_id, ip_frag_offset;
+ u8 *packet, more;
+
+ vec_add1(*buffer, pi);
+ p = vlib_get_buffer(vm, pi);
+ offset = vnet_buffer(p)->ip_frag.header_offset;
+ mtu = vnet_buffer(p)->ip_frag.mtu;
+ packet = (u8 *)vlib_buffer_get_current(p);
+ ip4 = (ip4_header_t *)(packet + offset);
+
+ rem = clib_net_to_host_u16(ip4->length) - sizeof(*ip4);
+ ptr = 0;
+ max = (mtu - sizeof(*ip4) - vnet_buffer(p)->ip_frag.header_offset) & ~0x7;
+
+ if (rem < (p->current_length - offset - sizeof(*ip4))) {
+ *error = IP_FRAG_ERROR_MALFORMED;
+ return;
+ }
+
+ if (mtu < sizeof(*ip4)) {
+ *error = IP_FRAG_ERROR_CANT_FRAGMENT_HEADER;
+ return;
+ }
+
+ if (ip4->flags_and_fragment_offset &
+ clib_host_to_net_u16(IP4_HEADER_FLAG_DONT_FRAGMENT)) {
+ *error = IP_FRAG_ERROR_DONT_FRAGMENT_SET;
+ return;
+ }
+
+ if (ip4_is_fragment(ip4)) {
+ ip_frag_id = ip4->fragment_id;
+ ip_frag_offset = ip4_get_fragment_offset(ip4);
+ more = !!(ip4->flags_and_fragment_offset & clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS));
+ } else {
+ ip_frag_id = (++running_fragment_id);
+ ip_frag_offset = 0;
+ more = 0;
+ }
+
+ //Do the actual fragmentation
+ while (rem) {
+ u32 bi;
+ vlib_buffer_t *b;
+ ip4_header_t *fip4;
+
+ len = (rem > (mtu - sizeof(*ip4) - vnet_buffer(p)->ip_frag.header_offset)) ? max : rem;
+
+ if (ptr == 0) {
+ bi = pi;
+ b = p;
+ fip4 = (ip4_header_t *)(vlib_buffer_get_current(b) + offset);
+ } else {
+ if (!vlib_buffer_alloc(vm, &bi, 1)) {
+ *error = IP_FRAG_ERROR_MEMORY;
+ return;
+ }
+ vec_add1(*buffer, bi);
+ b = vlib_get_buffer(vm, bi);
+ vnet_buffer(b)->sw_if_index[VLIB_RX] = vnet_buffer(p)->sw_if_index[VLIB_RX];
+ vnet_buffer(b)->sw_if_index[VLIB_TX] = vnet_buffer(p)->sw_if_index[VLIB_TX];
+ fip4 = (ip4_header_t *)(vlib_buffer_get_current(b) + offset);
+
+ //Copy offset and ip4 header
+ memcpy(b->data, packet, offset + sizeof(*ip4));
+ //Copy data
+ memcpy(((u8*)(fip4)) + sizeof(*fip4),
+ packet + offset + sizeof(*fip4) + ptr, len);
+ }
+ b->current_length = offset + len + sizeof(*fip4);
+
+ fip4->fragment_id = ip_frag_id;
+ fip4->flags_and_fragment_offset = clib_host_to_net_u16((ptr >> 3) + ip_frag_offset);
+ fip4->flags_and_fragment_offset |= clib_host_to_net_u16(((len != rem) || more) << 13);
+ // ((len != rem) || more) << 13 is shorthand for
+ // ((len != rem) || more) ? IP4_HEADER_FLAG_MORE_FRAGMENTS : 0
+ fip4->length = clib_host_to_net_u16(len + sizeof(*fip4));
+ fip4->checksum = ip4_header_checksum(fip4);
+
+ if(vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER) {
+ //Encapsulating ipv4 header
+ ip4_header_t *encap_header4 = (ip4_header_t *)vlib_buffer_get_current(b);
+ encap_header4->length = clib_host_to_net_u16(b->current_length);
+ encap_header4->checksum = ip4_header_checksum(encap_header4);
+ } else if (vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER) {
+ //Encapsulating ipv6 header
+ ip6_header_t *encap_header6 = (ip6_header_t *)vlib_buffer_get_current(b);
+ encap_header6->payload_length = clib_host_to_net_u16(b->current_length - sizeof(*encap_header6));
+ }
+
+ rem -= len;
+ ptr += len;
+ }
+}
+
+
+static uword
+ip4_frag (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime(vm, ip4_frag_node.index);
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ u32 frag_sent = 0, small_packets = 0;
+ u32 *buffer = 0;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0, *frag_from, frag_left;
+ vlib_buffer_t *p0;
+ ip_frag_error_t error0;
+ ip4_frag_next_t next0;
+
+ //Note: the packet is not enqueued now; it is put in a vector
+ //together with the other fragments generated from it.
+ pi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ error0 = IP_FRAG_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip4_frag_do_fragment(vm, pi0, &buffer, &error0);
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ip_frag_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof (*tr));
+ tr->header_offset = vnet_buffer(p0)->ip_frag.header_offset;
+ tr->mtu = vnet_buffer(p0)->ip_frag.mtu;
+ tr->ipv6 = 0;
+ tr->n_fragments = vec_len(buffer);
+ tr->next = vnet_buffer(p0)->ip_frag.next_index;
+ }
+
+ next0 = (error0 == IP_FRAG_ERROR_NONE) ? vnet_buffer(p0)->ip_frag.next_index : IP4_FRAG_NEXT_DROP;
+ frag_sent += vec_len(buffer);
+ small_packets += (vec_len(buffer) == 1);
+
+ //Send fragments that were added in the frame
+ frag_from = buffer;
+ frag_left = vec_len(buffer);
+ while (frag_left > 0) {
+ while (frag_left > 0 && n_left_to_next > 0) {
+ u32 i;
+ i = to_next[0] = frag_from[0];
+ frag_from += 1;
+ frag_left -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ vlib_get_buffer(vm, i)->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, i,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+ }
+ vec_reset_length(buffer);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ vec_free(buffer);
+ vlib_node_increment_counter(vm, ip4_frag_node.index, IP_FRAG_ERROR_FRAGMENT_SENT, frag_sent);
+ vlib_node_increment_counter(vm, ip4_frag_node.index, IP_FRAG_ERROR_SMALL_PACKET, small_packets);
+
+ return frame->n_vectors;
+}
+
+
+static void
+ip6_frag_do_fragment(vlib_main_t *vm, u32 pi, u32 **buffer, ip_frag_error_t *error)
+{
+ vlib_buffer_t *p;
+ ip6_header_t *ip6_hdr;
+ ip6_frag_hdr_t *frag_hdr;
+ u8 *payload, *next_header;
+
+ p = vlib_get_buffer(vm, pi);
+
+ //Parsing the IPv6 headers
+ ip6_hdr = vlib_buffer_get_current(p) + vnet_buffer(p)->ip_frag.header_offset;
+ payload = (u8 *)(ip6_hdr + 1);
+ next_header = &ip6_hdr->protocol;
+ if (*next_header == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) {
+ next_header = payload;
+ payload += payload[1] * 8;
+ }
+
+ if (*next_header == IP_PROTOCOL_IP6_DESTINATION_OPTIONS) {
+ next_header = payload;
+ payload += payload[1] * 8;
+ }
+
+ if (*next_header == IP_PROTOCOL_IPV6_ROUTE) {
+ next_header = payload;
+ payload += payload[1] * 8;
+ }
+
+ u8 has_more;
+ u16 initial_offset;
+ if (*next_header == IP_PROTOCOL_IPV6_FRAGMENTATION) {
+ //The fragmentation header is already there
+ frag_hdr = (ip6_frag_hdr_t *)payload;
+ has_more = ip6_frag_hdr_more(frag_hdr);
+ initial_offset = ip6_frag_hdr_offset(frag_hdr);
+ } else {
+ //Insert a fragmentation header in the packet
+ u8 nh = *next_header;
+ *next_header = IP_PROTOCOL_IPV6_FRAGMENTATION;
+ vlib_buffer_advance(p, -sizeof(*frag_hdr));
+ u8 *start = vlib_buffer_get_current(p);
+ memmove(start, start + sizeof(*frag_hdr), payload - (start + sizeof(*frag_hdr)));
+ frag_hdr = (ip6_frag_hdr_t *)(payload - sizeof(*frag_hdr));
+ frag_hdr->identification = ++running_fragment_id;
+ frag_hdr->next_hdr = nh;
+ frag_hdr->rsv = 0;
+ has_more = 0;
+ initial_offset = 0;
+ }
+ payload = (u8 *)(frag_hdr + 1);
+
+ u16 headers_len = payload - (u8 *)vlib_buffer_get_current(p);
+ u16 max_payload = vnet_buffer(p)->ip_frag.mtu - headers_len;
+ u16 rem = p->current_length - headers_len;
+ u16 ptr = 0;
+
+ if(max_payload < 8) {
+ *error = IP_FRAG_ERROR_CANT_FRAGMENT_HEADER;
+ return;
+ }
+
+ while (rem) {
+ u32 bi;
+ vlib_buffer_t *b;
+ u16 len = (rem > max_payload)?(max_payload & ~0x7):rem;
+ rem -= len;
+
+ if (ptr != 0) {
+ if (!vlib_buffer_alloc(vm, &bi, 1)) {
+ *error = IP_FRAG_ERROR_MEMORY;
+ return;
+ }
+ b = vlib_get_buffer(vm, bi);
+ vnet_buffer(b)->sw_if_index[VLIB_RX] = vnet_buffer(p)->sw_if_index[VLIB_RX];
+ vnet_buffer(b)->sw_if_index[VLIB_TX] = vnet_buffer(p)->sw_if_index[VLIB_TX];
+ memcpy(vlib_buffer_get_current(b), vlib_buffer_get_current(p), headers_len);
+ memcpy(vlib_buffer_get_current(b) + headers_len, payload + ptr, len);
+ frag_hdr = vlib_buffer_get_current(b) + headers_len - sizeof(*frag_hdr);
+ } else {
+ bi = pi;
+ b = vlib_get_buffer(vm, bi);
+ //frag_hdr already set here
+ }
+
+ ip6_hdr = vlib_buffer_get_current(b) + vnet_buffer(p)->ip_frag.header_offset;
+ frag_hdr->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(initial_offset + (ptr >> 3), (rem || has_more));
+ b->current_length = headers_len + len;
+ ip6_hdr->payload_length = clib_host_to_net_u16(b->current_length - vnet_buffer(p)->ip_frag.header_offset - sizeof(*ip6_hdr));
+
+ if(vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER) {
+ //Encapsulating ipv4 header
+ ip4_header_t *encap_header4 = (ip4_header_t *)vlib_buffer_get_current(b);
+ encap_header4->length = clib_host_to_net_u16(b->current_length);
+ encap_header4->checksum = ip4_header_checksum(encap_header4);
+ } else if (vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER) {
+ //Encapsulating ipv6 header
+ ip6_header_t *encap_header6 = (ip6_header_t *)vlib_buffer_get_current(b);
+ encap_header6->payload_length = clib_host_to_net_u16(b->current_length - sizeof(*encap_header6));
+ }
+
+ vec_add1(*buffer, bi);
+
+ ptr += len;
+ }
+}
+
+static uword
+ip6_frag (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_frag_node.index);
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ u32 frag_sent = 0, small_packets = 0;
+ u32 *buffer = 0;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0, *frag_from, frag_left;
+ vlib_buffer_t * p0;
+ ip_frag_error_t error0;
+ ip6_frag_next_t next0;
+
+ pi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ error0 = IP_FRAG_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip6_frag_do_fragment(vm, pi0, &buffer, &error0);
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ip_frag_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof (*tr));
+ tr->header_offset = vnet_buffer(p0)->ip_frag.header_offset;
+ tr->mtu = vnet_buffer(p0)->ip_frag.mtu;
+ tr->ipv6 = 1;
+ tr->n_fragments = vec_len(buffer);
+ tr->next = vnet_buffer(p0)->ip_frag.next_index;
+ }
+
+ next0 = (error0 == IP_FRAG_ERROR_NONE) ? vnet_buffer(p0)->ip_frag.next_index : IP6_FRAG_NEXT_DROP;
+ frag_sent += vec_len(buffer);
+ small_packets += (vec_len(buffer) == 1);
+
+ //Send fragments that were added in the frame
+ frag_from = buffer;
+ frag_left = vec_len(buffer);
+ while (frag_left > 0) {
+ while (frag_left > 0 && n_left_to_next > 0) {
+ u32 i;
+ i = to_next[0] = frag_from[0];
+ frag_from += 1;
+ frag_left -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ vlib_get_buffer(vm, i)->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, i,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+ }
+ vec_reset_length(buffer);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ vec_free(buffer);
+ vlib_node_increment_counter(vm, ip6_frag_node.index, IP_FRAG_ERROR_FRAGMENT_SENT, frag_sent);
+ vlib_node_increment_counter(vm, ip6_frag_node.index, IP_FRAG_ERROR_SMALL_PACKET, small_packets);
+
+ return frame->n_vectors;
+}
+
+static char * ip4_frag_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip_frag_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (ip4_frag_node) = {
+ .function = ip4_frag,
+ .name = IP4_FRAG_NODE_NAME,
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip_frag_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = IP_FRAG_N_ERROR,
+ .error_strings = ip4_frag_error_strings,
+
+ .n_next_nodes = IP4_FRAG_N_NEXT,
+ .next_nodes = {
+ [IP4_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP4_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP4_FRAG_NEXT_DROP] = "error-drop"
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_frag_node) = {
+ .function = ip6_frag,
+ .name = IP6_FRAG_NODE_NAME,
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip_frag_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = IP_FRAG_N_ERROR,
+ .error_strings = ip4_frag_error_strings,
+
+ .n_next_nodes = IP6_FRAG_N_NEXT,
+ .next_nodes = {
+ [IP6_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP6_FRAG_NEXT_DROP] = "error-drop"
+ },
+};
diff --git a/vnet/vnet/ip/ip_frag.h b/vnet/vnet/ip/ip_frag.h
new file mode 100644
index 00000000000..04566904e5f
--- /dev/null
+++ b/vnet/vnet/ip/ip_frag.h
@@ -0,0 +1,81 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+/*
+ * IPv4 and IPv6 Fragmentation Nodes
+ *
+ * A packet sent to these nodes requires the following
+ * buffer attributes to be set:
+ * ip_frag.header_offset :
+ * Where to find the IPv4 (or IPv6) header in the packet. Previous
+ * bytes are left untouched and copied in every fragment. The fragments
+ * are then appended. This option is used for fragmented packets
+ * that are encapsulated.
+ * ip_frag.mtu :
+ * Maximum size of IP packets, header included, but ignoring
+ * the 'ip_frag.header_offset' copied bytes.
+ * ip_frag.next_index :
+ *   One of ip_frag_next_t, indicating the exit node to which the
+ *   fragments should be sent.
+ *
+ */
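+
+/*
+ * Minimal caller sketch (hypothetical values; the buffer 'b' and the
+ * frame-enqueue plumbing are assumed):
+ *
+ *   vnet_buffer (b)->ip_frag.header_offset = 0;  // IP header at current data
+ *   vnet_buffer (b)->ip_frag.mtu = 1280;         // target fragment size
+ *   vnet_buffer (b)->ip_frag.flags = 0;          // no encapsulating header
+ *   vnet_buffer (b)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+ *
+ * The buffer is then enqueued to the IP6_FRAG_NODE_NAME node.
+ */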
+
+#ifndef IP_FRAG_H
+#define IP_FRAG_H
+
+#include <vnet/vnet.h>
+
+#define IP_FRAG_FLAG_IP4_HEADER 0x01 //Encapsulating IPv4 header
+#define IP_FRAG_FLAG_IP6_HEADER 0x02 //Encapsulating IPv6 header
+
+#define IP4_FRAG_NODE_NAME "ip4-frag"
+#define IP6_FRAG_NODE_NAME "ip6-frag"
+
+vlib_node_registration_t ip4_frag_node;
+vlib_node_registration_t ip6_frag_node;
+
+typedef enum {
+ IP4_FRAG_NEXT_IP4_LOOKUP,
+ IP4_FRAG_NEXT_IP6_LOOKUP,
+ IP4_FRAG_NEXT_DROP,
+ IP4_FRAG_N_NEXT
+} ip4_frag_next_t;
+
+typedef enum {
+ IP6_FRAG_NEXT_IP4_LOOKUP,
+ IP6_FRAG_NEXT_IP6_LOOKUP,
+ IP6_FRAG_NEXT_DROP,
+ IP6_FRAG_N_NEXT
+} ip6_frag_next_t;
+
+#define foreach_ip_frag_error \
+ /* Must be first. */ \
+ _(NONE, "packet fragmented") \
+ _(SMALL_PACKET, "packet smaller than MTU") \
+ _(FRAGMENT_SENT, "number of sent fragments") \
+ _(CANT_FRAGMENT_HEADER, "can't fragment header") \
+ _(DONT_FRAGMENT_SET, "can't fragment: don't-fragment flag set") \
+ _(MALFORMED, "malformed packet") \
+ _(MEMORY, "could not allocate buffer") \
+ _(UNKNOWN, "unknown error")
+
+typedef enum {
+#define _(sym,str) IP_FRAG_ERROR_##sym,
+ foreach_ip_frag_error
+#undef _
+ IP_FRAG_N_ERROR,
+ } ip_frag_error_t;
+
+#endif /* ifndef IP_FRAG_H */
diff --git a/vnet/vnet/ip/ip_init.c b/vnet/vnet/ip/ip_init.c
new file mode 100644
index 00000000000..0654daa7685
--- /dev/null
+++ b/vnet/vnet/ip/ip_init.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip_init.c: ip generic initialization
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+ip_main_t ip_main;
+
+clib_error_t *
+ip_main_init (vlib_main_t * vm)
+{
+ ip_main_t * im = &ip_main;
+ clib_error_t * error = 0;
+
+ memset (im, 0, sizeof (im[0]));
+
+ {
+ ip_protocol_info_t * pi;
+ u32 i;
+
+#define ip_protocol(n,s) \
+do { \
+ vec_add2 (im->protocol_infos, pi, 1); \
+ pi->protocol = n; \
+ pi->name = (u8 *) #s; \
+} while (0);
+
+#include "protocols.def"
+
+#undef ip_protocol
+
+ im->protocol_info_by_name = hash_create_string (0, sizeof (uword));
+ for (i = 0; i < vec_len (im->protocol_infos); i++)
+ {
+ pi = im->protocol_infos + i;
+
+ hash_set_mem (im->protocol_info_by_name, pi->name, i);
+ hash_set (im->protocol_info_by_protocol, pi->protocol, i);
+ }
+ }
+
+ {
+ tcp_udp_port_info_t * pi;
+ u32 i;
+ static char * port_names[] =
+ {
+#define ip_port(s,n) #s,
+#include "ports.def"
+#undef ip_port
+ };
+ static u16 ports[] =
+ {
+#define ip_port(s,n) n,
+#include "ports.def"
+#undef ip_port
+ };
+
+ vec_resize (im->port_infos, ARRAY_LEN (port_names));
+ im->port_info_by_name = hash_create_string (0, sizeof (uword));
+
+ for (i = 0; i < vec_len (im->port_infos); i++)
+ {
+ pi = im->port_infos + i;
+ pi->port = clib_host_to_net_u16 (ports[i]);
+ pi->name = (u8 *) port_names[i];
+ hash_set_mem (im->port_info_by_name, pi->name, i);
+ hash_set (im->port_info_by_port, pi->port, i);
+ }
+ }
+
+ if ((error = vlib_call_init_function (vm, vnet_main_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip4_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip6_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, icmp4_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, icmp6_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip6_hop_by_hop_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip4_hop_by_hop_init)))
+ return error;
+
+#if 0
+ if ((error = vlib_call_init_function (vm, tcp_udp_lookup_init)))
+ return error;
+
+#endif
+
+ if ((error = vlib_call_init_function (vm, udp_local_init)))
+ return error;
+
+#if 0
+ if ((error = vlib_call_init_function (vm, tcp_init)))
+ return error;
+#endif
+
+ if ((error = vlib_call_init_function (vm, udp_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip_classify_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, input_acl_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (ip_main_init);
diff --git a/vnet/vnet/ip/ip_input_acl.c b/vnet/vnet/ip/ip_input_acl.c
new file mode 100644
index 00000000000..75aa9ef818f
--- /dev/null
+++ b/vnet/vnet/ip/ip_input_acl.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/ip/ip.h>
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/classify/input_acl.h>
+
+typedef struct {
+ u32 sw_if_index;
+ u32 next_index;
+ u32 table_index;
+ u32 offset;
+} ip_inacl_trace_t;
+
+/* packet trace format function */
+static u8 * format_ip_inacl_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ip_inacl_trace_t * t = va_arg (*args, ip_inacl_trace_t *);
+
+ s = format (s, "INACL: sw_if_index %d, next_index %d, table %d, offset %d",
+ t->sw_if_index, t->next_index, t->table_index, t->offset);
+ return s;
+}
+
+vlib_node_registration_t ip4_inacl_node;
+vlib_node_registration_t ip6_inacl_node;
+
+#define foreach_ip_inacl_error \
+_(MISS, "input ACL misses") \
+_(HIT, "input ACL hits") \
+_(CHAIN_HIT, "input ACL hits after chain walk")
+
+typedef enum {
+#define _(sym,str) IP_INACL_ERROR_##sym,
+ foreach_ip_inacl_error
+#undef _
+ IP_INACL_N_ERROR,
+} ip_inacl_error_t;
+
+static char * ip_inacl_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ip_inacl_error
+#undef _
+};
+
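+/* Shared worker for the ip4/ip6 input-ACL nodes: a first pass computes
+ * classifier hashes and prefetches buckets; a second pass walks the
+ * (possibly chained) tables per packet, rewriting next0 and the buffer
+ * error on a hit or a terminal miss. */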
+static inline uword
+ip_inacl_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame, int is_ip4)
+{
+ u32 n_left_from, * from, * to_next;
+ acl_next_index_t next_index;
+ input_acl_main_t * am = &input_acl_main;
+ vnet_classify_main_t * vcm = am->vnet_classify_main;
+ f64 now = vlib_time_now (vm);
+ u32 hits = 0;
+ u32 misses = 0;
+ u32 chain_hits = 0;
+ input_acl_table_id_t tid;
+ vlib_node_runtime_t * error_node;
+
+ if (is_ip4)
+ {
+ tid = INPUT_ACL_TABLE_IP4;
+ error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+ }
+ else
+ {
+ tid = INPUT_ACL_TABLE_IP6;
+ error_node = vlib_node_get_runtime (vm, ip6_input_node.index);
+ }
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ /* First pass: compute hashes */
+
+ while (n_left_from > 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ u32 bi0, bi1;
+ u8 * h0, * h1;
+ u32 sw_if_index0, sw_if_index1;
+ u32 table_index0, table_index1;
+ vnet_classify_table_t * t0, * t1;
+
+ /* prefetch next iteration */
+ {
+ vlib_buffer_t * p1, * p2;
+
+ p1 = vlib_get_buffer (vm, from[1]);
+ p2 = vlib_get_buffer (vm, from[2]);
+
+ vlib_prefetch_buffer_header (p1, STORE);
+ CLIB_PREFETCH (p1->data, CLIB_CACHE_LINE_BYTES, STORE);
+ vlib_prefetch_buffer_header (p2, STORE);
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+
+ bi1 = from[1];
+ b1 = vlib_get_buffer (vm, bi1);
+ h1 = b1->data;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ table_index0 = am->classify_table_index_by_sw_if_index[tid][sw_if_index0];
+
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+ table_index1 = am->classify_table_index_by_sw_if_index[tid][sw_if_index1];
+
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ t1 = pool_elt_at_index (vcm->tables, table_index1);
+
+ vnet_buffer(b0)->l2_classify.hash =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+
+ vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash);
+
+ vnet_buffer(b1)->l2_classify.hash =
+ vnet_classify_hash_packet (t1, (u8 *) h1);
+
+ vnet_classify_prefetch_bucket (t1, vnet_buffer(b1)->l2_classify.hash);
+
+ vnet_buffer(b0)->l2_classify.table_index = table_index0;
+
+ vnet_buffer(b1)->l2_classify.table_index = table_index1;
+
+ from += 2;
+ n_left_from -= 2;
+ }
+
+ while (n_left_from > 0)
+ {
+ vlib_buffer_t * b0;
+ u32 bi0;
+ u8 * h0;
+ u32 sw_if_index0;
+ u32 table_index0;
+ vnet_classify_table_t * t0;
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ table_index0 = am->classify_table_index_by_sw_if_index[tid][sw_if_index0];
+
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+ vnet_buffer(b0)->l2_classify.hash =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+
+ vnet_buffer(b0)->l2_classify.table_index = table_index0;
+ vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash);
+
+ from++;
+ n_left_from--;
+ }
+
+ next_index = node->cached_next_index;
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ /* Not enough load/store slots to dual loop... */
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = ACL_NEXT_INDEX_DENY;
+ u32 table_index0;
+ vnet_classify_table_t * t0;
+ vnet_classify_entry_t * e0;
+ u64 hash0;
+ u8 * h0;
+ u8 error0;
+
+ /* Stride 3 seems to work best */
+ if (PREDICT_TRUE (n_left_from > 3))
+ {
+ vlib_buffer_t * p1 = vlib_get_buffer(vm, from[3]);
+ vnet_classify_table_t * tp1;
+ u32 table_index1;
+ u64 phash1;
+
+ table_index1 = vnet_buffer(p1)->l2_classify.table_index;
+
+ if (PREDICT_TRUE (table_index1 != ~0))
+ {
+ tp1 = pool_elt_at_index (vcm->tables, table_index1);
+ phash1 = vnet_buffer(p1)->l2_classify.hash;
+ vnet_classify_prefetch_entry (tp1, phash1);
+ }
+ }
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+ table_index0 = vnet_buffer(b0)->l2_classify.table_index;
+ e0 = 0;
+ t0 = 0;
+
+ vnet_get_config_data (am->vnet_config_main[tid],
+ &vnet_buffer(b0)->ip.current_config_index,
+ &next0,
+ /* # bytes of config data */ 0);
+
+ if (PREDICT_TRUE(table_index0 != ~0))
+ {
+ hash0 = vnet_buffer(b0)->l2_classify.hash;
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ e0 = vnet_classify_find_entry (t0, (u8 *) h0, hash0,
+ now);
+ if (e0)
+ {
+ vlib_buffer_advance (b0, e0->advance);
+
+ next0 = (e0->next_index < ACL_NEXT_INDEX_N_NEXT)?
+ e0->next_index:next0;
+
+ hits++;
+
+ if (is_ip4)
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ IP4_ERROR_INACL_SESSION_DENY:IP4_ERROR_NONE;
+ else
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ IP6_ERROR_INACL_SESSION_DENY:IP6_ERROR_NONE;
+ b0->error = error_node->errors[error0];
+ }
+ else
+ {
+ while (1)
+ {
+ if (PREDICT_TRUE(t0->next_table_index != ~0))
+ t0 = pool_elt_at_index (vcm->tables,
+ t0->next_table_index);
+ else
+ {
+ next0 = (t0->miss_next_index < ACL_NEXT_INDEX_N_NEXT)?
+ t0->miss_next_index:next0;
+
+ misses++;
+
+ if (is_ip4)
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ IP4_ERROR_INACL_TABLE_MISS:IP4_ERROR_NONE;
+ else
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ IP6_ERROR_INACL_TABLE_MISS:IP6_ERROR_NONE;
+ b0->error = error_node->errors[error0];
+ break;
+ }
+
+ hash0 = vnet_classify_hash_packet (t0, (u8 *) h0);
+ e0 = vnet_classify_find_entry
+ (t0, (u8 *) h0, hash0, now);
+ if (e0)
+ {
+ vlib_buffer_advance (b0, e0->advance);
+ next0 = (e0->next_index < ACL_NEXT_INDEX_N_NEXT)?
+ e0->next_index:next0;
+ hits++;
+ chain_hits++;
+
+ if (is_ip4)
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ IP4_ERROR_INACL_SESSION_DENY:IP4_ERROR_NONE;
+ else
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ IP6_ERROR_INACL_SESSION_DENY:IP6_ERROR_NONE;
+ b0->error = error_node->errors[error0];
+ break;
+ }
+ }
+ }
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ip_inacl_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->next_index = next0;
+ t->table_index = t0 ? t0 - vcm->tables : ~0;
+ t->offset = e0 ? vnet_classify_get_offset (t0, e0): ~0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, node->node_index,
+ IP_INACL_ERROR_MISS,
+ misses);
+ vlib_node_increment_counter (vm, node->node_index,
+ IP_INACL_ERROR_HIT,
+ hits);
+ vlib_node_increment_counter (vm, node->node_index,
+ IP_INACL_ERROR_CHAIN_HIT,
+ chain_hits);
+ return frame->n_vectors;
+}
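+
+/*
+ * Note: the function above is deliberately two-pass. The first pass
+ * computes classifier hashes and prefetches the corresponding hash
+ * buckets; the second pass performs the table lookups and the
+ * (speculative) next-frame enqueues, so most of the hash-table memory
+ * latency is hidden behind useful work.
+ */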
+
+static uword
+ip4_inacl (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip_inacl_inline (vm, node, frame, 1 /* is_ip4 */);
+}
+
+VLIB_REGISTER_NODE (ip4_inacl_node) = {
+ .function = ip4_inacl,
+ .name = "ip4-inacl",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip_inacl_trace,
+ .n_errors = ARRAY_LEN(ip_inacl_error_strings),
+ .error_strings = ip_inacl_error_strings,
+
+ .n_next_nodes = ACL_NEXT_INDEX_N_NEXT,
+ .next_nodes = {
+ [ACL_NEXT_INDEX_DENY] = "error-drop",
+ },
+};
+
+static uword
+ip6_inacl (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return ip_inacl_inline (vm, node, frame, 0 /* is_ip4 */);
+}
+
+VLIB_REGISTER_NODE (ip6_inacl_node) = {
+ .function = ip6_inacl,
+ .name = "ip6-inacl",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ip_inacl_trace,
+ .n_errors = ARRAY_LEN(ip_inacl_error_strings),
+ .error_strings = ip_inacl_error_strings,
+
+ .n_next_nodes = ACL_NEXT_INDEX_N_NEXT,
+ .next_nodes = {
+ [ACL_NEXT_INDEX_DENY] = "error-drop",
+ },
+};
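+
+/*
+ * Configuration sketch (hedged): the authoritative debug CLI syntax
+ * lives in classify/README and vnet_classify.c, but a session that
+ * exercises these nodes looks roughly like:
+ *
+ *   classify table mask l3 ip4 src buckets 2
+ *   classify session acl-hit-next deny table-index 0 match l3 ip4 src 10.0.0.3
+ *   set interface input acl intfc GigabitEthernet2/0/0 ip4-table 0
+ *
+ * after which packets from 10.0.0.3 take ACL_NEXT_INDEX_DENY, i.e.
+ * the "error-drop" next node registered above. Interface name is
+ * illustrative.
+ */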
+
+static clib_error_t *
+ip_inacl_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ip_inacl_init);
+
diff --git a/vnet/vnet/ip/ip_packet.h b/vnet/vnet/ip/ip_packet.h
new file mode 100644
index 00000000000..fb9a23604e1
--- /dev/null
+++ b/vnet/vnet/ip/ip_packet.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip_packet.h: packet format common between ip4 & ip6
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_packet_h
+#define included_ip_packet_h
+
+#include <vppinfra/byte_order.h>
+#include <vppinfra/error.h>
+
+typedef enum ip_protocol {
+#define ip_protocol(n,s) IP_PROTOCOL_##s = n,
+#include "protocols.def"
+#undef ip_protocol
+} ip_protocol_t;
+
+/* TCP/UDP ports. */
+typedef enum {
+#define ip_port(s,n) IP_PORT_##s = n,
+#include "ports.def"
+#undef ip_port
+} ip_port_t;
+
+/* Classifies protocols into UDP, TCP, ICMP or other. */
+typedef enum {
+ IP_BUILTIN_PROTOCOL_UDP,
+ IP_BUILTIN_PROTOCOL_TCP,
+ IP_BUILTIN_PROTOCOL_ICMP,
+ IP_BUILTIN_PROTOCOL_UNKNOWN,
+} ip_builtin_protocol_t;
+
+#define foreach_ip_builtin_multicast_group \
+ _ (1, all_hosts_on_subnet) \
+ _ (2, all_routers_on_subnet) \
+ _ (4, dvmrp) \
+ _ (5, ospf_all_routers) \
+ _ (6, ospf_designated_routers) \
+ _ (13, pim) \
+ _ (18, vrrp) \
+ _ (102, hsrp) \
+ _ (22, igmp_v3)
+
+typedef enum {
+#define _(n,f) IP_MULTICAST_GROUP_##f = n,
+ foreach_ip_builtin_multicast_group
+#undef _
+} ip_multicast_group_t;
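+
+/*
+ * The group numbers above are the low-order bits of the well-known
+ * addresses: e.g. all_hosts_on_subnet (1) is 224.0.0.1 in ip4 and
+ * ff02::1 in ip6; all_routers_on_subnet (2) is 224.0.0.2 / ff02::2.
+ */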
+
+/* IP checksum support. */
+
+/* Incremental checksum update. */
+typedef uword ip_csum_t;
+
+always_inline ip_csum_t
+ip_csum_with_carry (ip_csum_t sum, ip_csum_t x)
+{
+ ip_csum_t t = sum + x;
+ return t + (t < x);
+}
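+
+/*
+ * The (t < x) term is the one's-complement end-around carry: if the
+ * uword addition wrapped, t ends up smaller than x and the lost carry
+ * is re-added. Example: sum = ~0, x = 2 gives t = 1, and 1 < 2, so
+ * the function returns 2.
+ */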
+
+/* Update checksum changing field at even byte offset from x -> 0. */
+always_inline ip_csum_t
+ip_csum_add_even (ip_csum_t c, ip_csum_t x)
+{
+ ip_csum_t d;
+
+ d = c - x;
+
+ /* Fold in carry from high bit. */
+ d -= d > c;
+
+ ASSERT (ip_csum_with_carry (d, x) == c);
+
+ return d;
+}
+
+/* Update checksum changing field at even byte offset from 0 -> x. */
+always_inline ip_csum_t
+ip_csum_sub_even (ip_csum_t c, ip_csum_t x)
+{ return ip_csum_with_carry (c, x); }
+
+always_inline ip_csum_t
+ip_csum_update_inline (ip_csum_t sum, ip_csum_t old, ip_csum_t new,
+ u32 field_byte_offset, u32 field_n_bytes)
+{
+  /* For even 1-byte fields on big-endian hosts, and odd 1-byte fields on
+     little-endian hosts, the byte must be shifted into checksum position. */
+ if ((field_n_bytes % 2)
+ && (field_byte_offset % 2) == CLIB_ARCH_IS_LITTLE_ENDIAN)
+ {
+ old = old << 8;
+ new = new << 8;
+ }
+ sum = ip_csum_sub_even (sum, old);
+ sum = ip_csum_add_even (sum, new);
+ return sum;
+}
+
+#define ip_csum_update(sum,old,new,type,field) \
+ ip_csum_update_inline ((sum), (old), (new), \
+ STRUCT_OFFSET_OF (type, field), \
+ STRUCT_SIZE_OF (type, field))
+
+always_inline u16 ip_csum_fold (ip_csum_t c)
+{
+ /* Reduce to 16 bits. */
+#if uword_bits == 64
+ c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32);
+ c = (c & 0xffff) + (c >> 16);
+#endif
+
+ c = (c & 0xffff) + (c >> 16);
+ c = (c & 0xffff) + (c >> 16);
+
+ return c;
+}
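+
+/*
+ * Folding example: 0x12345 -> 0x2345 + 0x1 = 0x2346; the second fold
+ * absorbs a possible carry out of the first. A typical incremental
+ * update, sketched with the usual ip4 header field names, looks like:
+ *
+ *   ip_csum_t sum = ip->checksum;
+ *   sum = ip_csum_update (sum, ip->ttl, ip->ttl - 1, ip4_header_t, ttl);
+ *   ip->ttl -= 1;
+ *   ip->checksum = ip_csum_fold (sum);
+ */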
+
+/* Copy data and checksum at the same time. */
+ip_csum_t ip_csum_and_memcpy (ip_csum_t sum, void * dst, void * src, uword n_bytes);
+
+always_inline u16
+ip_csum_and_memcpy_fold (ip_csum_t sum, void * dst)
+{
+ uword n_zero;
+ ip_csum_t * dst_even;
+
+ dst_even = uword_to_pointer
+ (pointer_to_uword (dst) &~ (sizeof (sum) - 1),
+ ip_csum_t *);
+
+ if ((n_zero = dst - (void *) dst_even))
+ {
+ u8 * d8 = dst;
+ uword i;
+
+ for (i = 0; i < n_zero; i++)
+ d8[i] = 0;
+
+ sum = ip_csum_with_carry (sum, dst_even[0]);
+ }
+
+ return ip_csum_fold (sum);
+}
+
+/* Checksum routine. */
+ip_csum_t ip_incremental_checksum (ip_csum_t sum, void * data, uword n_bytes);
+
+#endif /* included_ip_packet_h */
diff --git a/vnet/vnet/ip/lookup.c b/vnet/vnet/ip/lookup.c
new file mode 100644
index 00000000000..80f0a33e731
--- /dev/null
+++ b/vnet/vnet/ip/lookup.c
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/lookup.c: ip4/6 adjacency and lookup table management
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vppinfra/math.h> /* for fabs */
+#include <vnet/ip/ip.h>
+
+static void
+ip_multipath_del_adjacency (ip_lookup_main_t * lm, u32 del_adj_index);
+
+always_inline void
+ip_poison_adjacencies (ip_adjacency_t * adj, uword n_adj)
+{
+ if (CLIB_DEBUG > 0)
+ memset (adj, 0xfe, n_adj * sizeof (adj[0]));
+}
+
+/* Create new block of given number of contiguous adjacencies. */
+ip_adjacency_t *
+ip_add_adjacency (ip_lookup_main_t * lm,
+ ip_adjacency_t * copy_adj,
+ u32 n_adj,
+ u32 * adj_index_return)
+{
+ ip_adjacency_t * adj;
+ u32 ai, i, handle;
+
+ ai = heap_alloc (lm->adjacency_heap, n_adj, handle);
+ adj = heap_elt_at_index (lm->adjacency_heap, ai);
+
+ ip_poison_adjacencies (adj, n_adj);
+
+ /* Validate adjacency counters. */
+ vlib_validate_combined_counter (&lm->adjacency_counters, ai + n_adj - 1);
+
+ for (i = 0; i < n_adj; i++)
+ {
+ /* Make sure certain fields are always initialized. */
+ adj[i].rewrite_header.sw_if_index = ~0;
+ adj[i].explicit_fib_index = ~0;
+ adj[i].mcast_group_index = ~0;
+ adj[i].classify_table_index = ~0;
+ adj[i].saved_lookup_next_index = 0;
+
+ if (copy_adj)
+ adj[i] = copy_adj[i];
+
+ adj[i].heap_handle = handle;
+ adj[i].n_adj = n_adj;
+
+ /* Zero possibly stale counters for re-used adjacencies. */
+ vlib_zero_combined_counter (&lm->adjacency_counters, ai + i);
+ }
+
+ *adj_index_return = ai;
+ return adj;
+}
+
+static void ip_del_adjacency2 (ip_lookup_main_t * lm, u32 adj_index, u32 delete_multipath_adjacency)
+{
+ ip_adjacency_t * adj;
+ uword handle;
+
+ ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 1);
+
+ adj = ip_get_adjacency (lm, adj_index);
+ handle = adj->heap_handle;
+
+ if (delete_multipath_adjacency)
+ ip_multipath_del_adjacency (lm, adj_index);
+
+ ip_poison_adjacencies (adj, adj->n_adj);
+
+ heap_dealloc (lm->adjacency_heap, handle);
+}
+
+void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index)
+{ ip_del_adjacency2 (lm, adj_index, /* delete_multipath_adjacency */ 1); }
+
+static int
+next_hop_sort_by_weight (ip_multipath_next_hop_t * n1,
+ ip_multipath_next_hop_t * n2)
+{
+ int cmp = (int) n1->weight - (int) n2->weight;
+ return (cmp == 0
+ ? (int) n1->next_hop_adj_index - (int) n2->next_hop_adj_index
+ : (cmp > 0 ? +1 : -1));
+}
+
+/* The given next-hop vector is overwritten with a normalized copy: next hops
+   are sorted by weight, and each weight becomes the number of adjacencies
+   assigned to that next hop.  Returns the number of adjacencies in the block. */
+static u32 ip_multipath_normalize_next_hops (ip_lookup_main_t * lm,
+ ip_multipath_next_hop_t * raw_next_hops,
+ ip_multipath_next_hop_t ** normalized_next_hops)
+{
+ ip_multipath_next_hop_t * nhs;
+ uword n_nhs, n_adj, n_adj_left, i;
+ f64 sum_weight, norm, error;
+
+ n_nhs = vec_len (raw_next_hops);
+ ASSERT (n_nhs > 0);
+ if (n_nhs == 0)
+ return 0;
+
+ /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */
+ nhs = *normalized_next_hops;
+ vec_validate (nhs, 2*n_nhs - 1);
+
+ /* Fast path: 1 next hop in block. */
+ n_adj = n_nhs;
+ if (n_nhs == 1)
+ {
+ nhs[0] = raw_next_hops[0];
+ nhs[0].weight = 1;
+ _vec_len (nhs) = 1;
+ goto done;
+ }
+
+ else if (n_nhs == 2)
+ {
+ int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;
+
+ /* Fast sort. */
+ nhs[0] = raw_next_hops[cmp];
+ nhs[1] = raw_next_hops[cmp ^ 1];
+
+ /* Fast path: equal cost multipath with 2 next hops. */
+ if (nhs[0].weight == nhs[1].weight)
+ {
+ nhs[0].weight = nhs[1].weight = 1;
+ _vec_len (nhs) = 2;
+ goto done;
+ }
+ }
+ else
+ {
+ memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
+ qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
+ }
+
+ /* Find total weight to normalize weights. */
+ sum_weight = 0;
+ for (i = 0; i < n_nhs; i++)
+ sum_weight += nhs[i].weight;
+
+ /* In the unlikely case that all weights are given as 0, set them all to 1. */
+ if (sum_weight == 0)
+ {
+ for (i = 0; i < n_nhs; i++)
+ nhs[i].weight = 1;
+ sum_weight = n_nhs;
+ }
+
+ /* Save copies of all next hop weights to avoid being overwritten in loop below. */
+ for (i = 0; i < n_nhs; i++)
+ nhs[n_nhs + i].weight = nhs[i].weight;
+
+ /* Try larger and larger power of 2 sized adjacency blocks until we
+ find one where traffic flows to within 1% of specified weights. */
+ for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
+ {
+ error = 0;
+
+ norm = n_adj / sum_weight;
+ n_adj_left = n_adj;
+ for (i = 0; i < n_nhs; i++)
+ {
+ f64 nf = nhs[n_nhs + i].weight * norm; /* use saved weights */
+ word n = flt_round_nearest (nf);
+
+ n = n > n_adj_left ? n_adj_left : n;
+ n_adj_left -= n;
+ error += fabs (nf - n);
+ nhs[i].weight = n;
+ }
+
+ nhs[0].weight += n_adj_left;
+
+      /* Average error per adjacency within the configured tolerance
+	 (1% by default) for this block size? */
+ if (error <= lm->multipath_next_hop_error_tolerance*n_adj)
+ {
+	  /* Shrink nhs back to n_nhs elements, dropping the saved-weight
+	     scratch copy. */
+ _vec_len (nhs) = i;
+ break;
+ }
+ }
+
+ done:
+ /* Save vector for next call. */
+ *normalized_next_hops = nhs;
+ return n_adj;
+}
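+
+/*
+ * Worked example: next hops A (weight 1) and B (weight 3) sum to 4.
+ * The first candidate block size, max_pow2 (2) = 2, rounds the shares
+ * to 1 and 1 (50/50 instead of 25/75), which misses the tolerance, so
+ * the block doubles to 4 and yields exactly { A, B, B, B }.
+ */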
+
+always_inline uword
+ip_next_hop_hash_key_from_handle (uword handle)
+{ return 1 + 2*handle; }
+
+always_inline uword
+ip_next_hop_hash_key_is_heap_handle (uword k)
+{ return k & 1; }
+
+always_inline uword
+ip_next_hop_hash_key_get_heap_handle (uword k)
+{
+ ASSERT (ip_next_hop_hash_key_is_heap_handle (k));
+ return k / 2;
+}
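+
+/*
+ * Key encoding: hash keys are either heap handles stored as
+ * 2 * handle + 1 (always odd) or, during lookups, raw vector pointers
+ * (always even, since pointers are aligned); the low bit therefore
+ * distinguishes the two cases in ip_next_hop_hash_key_get_next_hops
+ * below.
+ */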
+
+static u32
+ip_multipath_adjacency_get (ip_lookup_main_t * lm,
+ ip_multipath_next_hop_t * raw_next_hops,
+ uword create_if_non_existent)
+{
+ uword * p;
+ u32 i, j, n_adj, adj_index, adj_heap_handle;
+ ip_adjacency_t * adj, * copy_adj;
+ ip_multipath_next_hop_t * nh, * nhs;
+ ip_multipath_adjacency_t * madj;
+
+ n_adj = ip_multipath_normalize_next_hops (lm, raw_next_hops, &lm->next_hop_hash_lookup_key_normalized);
+ nhs = lm->next_hop_hash_lookup_key_normalized;
+
+ /* Basic sanity. */
+ ASSERT (n_adj >= vec_len (raw_next_hops));
+
+ /* Use normalized next hops to see if we've seen a block equivalent to this one before. */
+ p = hash_get_mem (lm->multipath_adjacency_by_next_hops, nhs);
+ if (p)
+ return p[0];
+
+ if (! create_if_non_existent)
+ return 0;
+
+ adj = ip_add_adjacency (lm, /* copy_adj */ 0, n_adj, &adj_index);
+ adj_heap_handle = adj[0].heap_handle;
+
+ /* Fill in adjacencies in block based on corresponding next hop adjacencies. */
+ i = 0;
+ vec_foreach (nh, nhs)
+ {
+ copy_adj = ip_get_adjacency (lm, nh->next_hop_adj_index);
+ for (j = 0; j < nh->weight; j++)
+ {
+ adj[i] = copy_adj[0];
+ adj[i].heap_handle = adj_heap_handle;
+ adj[i].n_adj = n_adj;
+ i++;
+ }
+ }
+
+ /* All adjacencies should have been initialized. */
+ ASSERT (i == n_adj);
+
+ vec_validate (lm->multipath_adjacencies, adj_heap_handle);
+ madj = vec_elt_at_index (lm->multipath_adjacencies, adj_heap_handle);
+
+ madj->adj_index = adj_index;
+ madj->n_adj_in_block = n_adj;
+ madj->reference_count = 0; /* caller will set to one. */
+
+ madj->normalized_next_hops.count = vec_len (nhs);
+ madj->normalized_next_hops.heap_offset
+ = heap_alloc (lm->next_hop_heap, vec_len (nhs),
+ madj->normalized_next_hops.heap_handle);
+ memcpy (lm->next_hop_heap + madj->normalized_next_hops.heap_offset,
+ nhs, vec_bytes (nhs));
+
+ hash_set (lm->multipath_adjacency_by_next_hops,
+ ip_next_hop_hash_key_from_handle (madj->normalized_next_hops.heap_handle),
+ madj - lm->multipath_adjacencies);
+
+ madj->unnormalized_next_hops.count = vec_len (raw_next_hops);
+ madj->unnormalized_next_hops.heap_offset
+ = heap_alloc (lm->next_hop_heap, vec_len (raw_next_hops),
+ madj->unnormalized_next_hops.heap_handle);
+ memcpy (lm->next_hop_heap + madj->unnormalized_next_hops.heap_offset,
+ raw_next_hops, vec_bytes (raw_next_hops));
+
+ ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0);
+
+ return adj_heap_handle;
+}
+
+/* Returns 0 for next hop not found. */
+u32
+ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm,
+ u32 is_del,
+ u32 old_mp_adj_index,
+ u32 next_hop_adj_index,
+ u32 next_hop_weight,
+ u32 * new_mp_adj_index)
+{
+ ip_multipath_adjacency_t * mp_old, * mp_new;
+ ip_multipath_next_hop_t * nh, * nhs, * hash_nhs;
+ u32 n_nhs, i_nh;
+
+ mp_new = mp_old = 0;
+ n_nhs = 0;
+ i_nh = 0;
+ nhs = 0;
+
+ /* If old multipath adjacency is valid, find requested next hop. */
+ if (old_mp_adj_index < vec_len (lm->multipath_adjacencies)
+ && lm->multipath_adjacencies[old_mp_adj_index].normalized_next_hops.count > 0)
+ {
+ mp_old = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
+
+ nhs = vec_elt_at_index (lm->next_hop_heap, mp_old->unnormalized_next_hops.heap_offset);
+ n_nhs = mp_old->unnormalized_next_hops.count;
+
+ /* Linear search: ok since n_next_hops is small. */
+ for (i_nh = 0; i_nh < n_nhs; i_nh++)
+ if (nhs[i_nh].next_hop_adj_index == next_hop_adj_index)
+ break;
+
+ /* Given next hop not found. */
+ if (i_nh >= n_nhs && is_del)
+ return 0;
+ }
+
+ hash_nhs = lm->next_hop_hash_lookup_key;
+ if (hash_nhs)
+ _vec_len (hash_nhs) = 0;
+
+ if (is_del)
+ {
+ if (n_nhs > 1)
+ {
+ /* Prepare lookup key for multipath with target next hop deleted. */
+ if (i_nh > 0)
+ vec_add (hash_nhs, nhs + 0, i_nh);
+ if (i_nh + 1 < n_nhs)
+ vec_add (hash_nhs, nhs + i_nh + 1, n_nhs - (i_nh + 1));
+ }
+ }
+ else /* it's an add. */
+ {
+ /* If next hop is already there with the same weight, we have nothing to do. */
+ if (i_nh < n_nhs && nhs[i_nh].weight == next_hop_weight)
+ {
+ new_mp_adj_index[0] = ~0;
+ goto done;
+ }
+
+ /* Copy old next hops to lookup key vector. */
+ if (n_nhs > 0)
+ vec_add (hash_nhs, nhs, n_nhs);
+
+ if (i_nh < n_nhs)
+ {
+ /* Change weight of existing next hop. */
+ nh = vec_elt_at_index (hash_nhs, i_nh);
+ }
+ else
+ {
+ /* Add a new next hop. */
+ vec_add2 (hash_nhs, nh, 1);
+ nh->next_hop_adj_index = next_hop_adj_index;
+ }
+
+ /* Set weight for added or old next hop. */
+ nh->weight = next_hop_weight;
+ }
+
+ if (vec_len (hash_nhs) > 0)
+ {
+ u32 tmp = ip_multipath_adjacency_get (lm, hash_nhs,
+ /* create_if_non_existent */ 1);
+ if (tmp != ~0)
+ mp_new = vec_elt_at_index (lm->multipath_adjacencies, tmp);
+
+ /* Fetch again since pool may have moved. */
+ if (mp_old)
+ mp_old = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index);
+ }
+
+ new_mp_adj_index[0] = mp_new ? mp_new - lm->multipath_adjacencies : ~0;
+
+ if (mp_new != mp_old)
+ {
+ if (mp_old)
+ {
+ ASSERT (mp_old->reference_count > 0);
+ mp_old->reference_count -= 1;
+ }
+ if (mp_new)
+ mp_new->reference_count += 1;
+ }
+
+ if (mp_old && mp_old->reference_count == 0)
+ ip_multipath_adjacency_free (lm, mp_old);
+
+ done:
+  /* Save key vector for next call. */
+ lm->next_hop_hash_lookup_key = hash_nhs;
+
+ return 1;
+}
+
+static void
+ip_multipath_del_adjacency (ip_lookup_main_t * lm, u32 del_adj_index)
+{
+ ip_adjacency_t * adj = ip_get_adjacency (lm, del_adj_index);
+ ip_multipath_adjacency_t * madj, * new_madj;
+ ip_multipath_next_hop_t * nhs, * hash_nhs;
+ u32 i, n_nhs, madj_index, new_madj_index;
+
+ if (adj->heap_handle >= vec_len (lm->multipath_adjacencies))
+ return;
+
+ vec_validate (lm->adjacency_remap_table, vec_len (lm->adjacency_heap) - 1);
+
+ for (madj_index = 0; madj_index < vec_len (lm->multipath_adjacencies); madj_index++)
+ {
+ madj = vec_elt_at_index (lm->multipath_adjacencies, madj_index);
+ if (madj->n_adj_in_block == 0)
+ continue;
+
+ nhs = heap_elt_at_index (lm->next_hop_heap, madj->unnormalized_next_hops.heap_offset);
+ n_nhs = madj->unnormalized_next_hops.count;
+ for (i = 0; i < n_nhs; i++)
+ if (nhs[i].next_hop_adj_index == del_adj_index)
+ break;
+
+ /* del_adj_index not found in unnormalized_next_hops? We're done. */
+ if (i >= n_nhs)
+ continue;
+
+ new_madj = 0;
+ if (n_nhs > 1)
+ {
+ hash_nhs = lm->next_hop_hash_lookup_key;
+ if (hash_nhs)
+ _vec_len (hash_nhs) = 0;
+ if (i > 0)
+ vec_add (hash_nhs, nhs + 0, i);
+ if (i + 1 < n_nhs)
+ vec_add (hash_nhs, nhs + i + 1, n_nhs - (i + 1));
+
+ new_madj_index = ip_multipath_adjacency_get (lm, hash_nhs, /* create_if_non_existent */ 1);
+
+ lm->next_hop_hash_lookup_key = hash_nhs;
+
+ if (new_madj_index == madj_index)
+ continue;
+
+ new_madj = vec_elt_at_index (lm->multipath_adjacencies, new_madj_index);
+ }
+
+ lm->adjacency_remap_table[madj->adj_index] = new_madj ? 1 + new_madj->adj_index : ~0;
+ lm->n_adjacency_remaps += 1;
+ ip_multipath_adjacency_free (lm, madj);
+ }
+}
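+
+/*
+ * Remap-table encoding, as written above: 0 means "no remap pending",
+ * 1 + new_adj_index redirects users of the old block to the new one,
+ * and ~0 means the route lost its last next hop. Consumers such as
+ * ip4_maybe_remap_adjacencies (invoked from the route CLI below) are
+ * expected to apply these entries and reset n_adjacency_remaps.
+ */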
+
+void
+ip_multipath_adjacency_free (ip_lookup_main_t * lm,
+ ip_multipath_adjacency_t * a)
+{
+ hash_unset (lm->multipath_adjacency_by_next_hops,
+ ip_next_hop_hash_key_from_handle (a->normalized_next_hops.heap_handle));
+ heap_dealloc (lm->next_hop_heap, a->normalized_next_hops.heap_handle);
+ heap_dealloc (lm->next_hop_heap, a->unnormalized_next_hops.heap_handle);
+
+ ip_del_adjacency2 (lm, a->adj_index, a->reference_count == 0);
+ memset (a, 0, sizeof (a[0]));
+}
+
+always_inline ip_multipath_next_hop_t *
+ip_next_hop_hash_key_get_next_hops (ip_lookup_main_t * lm, uword k,
+ uword * n_next_hops)
+{
+ ip_multipath_next_hop_t * nhs;
+ uword n_nhs;
+ if (ip_next_hop_hash_key_is_heap_handle (k))
+ {
+ uword handle = ip_next_hop_hash_key_get_heap_handle (k);
+ nhs = heap_elt_with_handle (lm->next_hop_heap, handle);
+ n_nhs = heap_len (lm->next_hop_heap, handle);
+ }
+ else
+ {
+ nhs = uword_to_pointer (k, ip_multipath_next_hop_t *);
+ n_nhs = vec_len (nhs);
+ }
+ *n_next_hops = n_nhs;
+ return nhs;
+}
+
+static uword
+ip_next_hop_hash_key_sum (hash_t * h, uword key0)
+{
+ ip_lookup_main_t * lm = uword_to_pointer (h->user, ip_lookup_main_t *);
+ ip_multipath_next_hop_t * k0;
+ uword n0;
+
+ k0 = ip_next_hop_hash_key_get_next_hops (lm, key0, &n0);
+ return hash_memory (k0, n0 * sizeof (k0[0]), /* seed */ n0);
+}
+
+static uword
+ip_next_hop_hash_key_equal (hash_t * h, uword key0, uword key1)
+{
+ ip_lookup_main_t * lm = uword_to_pointer (h->user, ip_lookup_main_t *);
+ ip_multipath_next_hop_t * k0, * k1;
+ uword n0, n1;
+
+ k0 = ip_next_hop_hash_key_get_next_hops (lm, key0, &n0);
+ k1 = ip_next_hop_hash_key_get_next_hops (lm, key1, &n1);
+
+ return n0 == n1 && ! memcmp (k0, k1, n0 * sizeof (k0[0]));
+}
+
+clib_error_t *
+ip_interface_address_add_del (ip_lookup_main_t * lm,
+ u32 sw_if_index,
+ void * addr_fib,
+ u32 address_length,
+ u32 is_del,
+ u32 * result_if_address_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip_interface_address_t * a, * prev, * next;
+ uword * p = mhash_get (&lm->address_to_if_address_index, addr_fib);
+
+ vec_validate_init_empty (lm->if_address_pool_index_by_sw_if_index, sw_if_index, ~0);
+ a = p ? pool_elt_at_index (lm->if_address_pool, p[0]) : 0;
+
+ /* Verify given length. */
+ if ((a && (address_length != a->address_length)) || (address_length == 0))
+ {
+ vnm->api_errno = VNET_API_ERROR_ADDRESS_LENGTH_MISMATCH;
+ return clib_error_create
+ ( "%U wrong length (expected %d) for interface %U",
+ lm->format_address_and_length, addr_fib,
+ address_length, a? a->address_length : -1,
+ format_vnet_sw_if_index_name, vnm, sw_if_index);
+ }
+
+ if (is_del)
+ {
+ if (!a)
+ {
+ vnet_sw_interface_t * si = vnet_get_sw_interface (vnm, sw_if_index);
+ vnm->api_errno = VNET_API_ERROR_ADDRESS_NOT_FOUND_FOR_INTERFACE;
+ return clib_error_create ("%U not found for interface %U",
+ lm->format_address_and_length,
+ addr_fib, address_length,
+ format_vnet_sw_interface_name, vnm, si);
+ }
+
+ if (a->prev_this_sw_interface != ~0)
+ {
+ prev = pool_elt_at_index (lm->if_address_pool, a->prev_this_sw_interface);
+ prev->next_this_sw_interface = a->next_this_sw_interface;
+ }
+ if (a->next_this_sw_interface != ~0)
+ {
+ next = pool_elt_at_index (lm->if_address_pool, a->next_this_sw_interface);
+ next->prev_this_sw_interface = a->prev_this_sw_interface;
+
+ if(a->prev_this_sw_interface == ~0)
+ lm->if_address_pool_index_by_sw_if_index[sw_if_index] = a->next_this_sw_interface;
+ }
+
+ if ((a->next_this_sw_interface == ~0) && (a->prev_this_sw_interface == ~0))
+ lm->if_address_pool_index_by_sw_if_index[sw_if_index] = ~0;
+
+ mhash_unset (&lm->address_to_if_address_index, addr_fib,
+ /* old_value */ 0);
+ pool_put (lm->if_address_pool, a);
+
+ if (result_if_address_index)
+ *result_if_address_index = ~0;
+ }
+
+ else if (! a)
+ {
+ u32 pi; /* previous index */
+ u32 ai;
+ u32 hi; /* head index */
+
+ pool_get (lm->if_address_pool, a);
+ memset (a, ~0, sizeof (a[0]));
+ ai = a - lm->if_address_pool;
+
+ hi = pi = lm->if_address_pool_index_by_sw_if_index[sw_if_index];
+ prev = 0;
+ while (pi != (u32)~0)
+ {
+ prev = pool_elt_at_index(lm->if_address_pool, pi);
+ pi = prev->next_this_sw_interface;
+ }
+ pi = prev ? prev - lm->if_address_pool : (u32)~0;
+
+ a->address_key = mhash_set (&lm->address_to_if_address_index,
+ addr_fib, ai, /* old_value */ 0);
+ a->address_length = address_length;
+ a->sw_if_index = sw_if_index;
+ a->flags = 0;
+ a->prev_this_sw_interface = pi;
+ a->next_this_sw_interface = ~0;
+ if (prev)
+ prev->next_this_sw_interface = ai;
+
+ lm->if_address_pool_index_by_sw_if_index[sw_if_index] =
+ (hi != ~0) ? hi : ai;
+ if (result_if_address_index)
+ *result_if_address_index = ai;
+ }
+ else
+ {
+ if (result_if_address_index)
+ *result_if_address_index = a - lm->if_address_pool;
+ }
+
+ return /* no error */ 0;
+}
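+
+/*
+ * The per-interface address list built above is walked via the head
+ * index and the next_this_sw_interface links, e.g.:
+ *
+ *   u32 ai = lm->if_address_pool_index_by_sw_if_index[sw_if_index];
+ *   while (ai != ~0)
+ *     {
+ *       ip_interface_address_t * a =
+ *         pool_elt_at_index (lm->if_address_pool, ai);
+ *       ... visit a ...
+ *       ai = a->next_this_sw_interface;
+ *     }
+ */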
+
+void serialize_vec_ip_adjacency (serialize_main_t * m, va_list * va)
+{
+ ip_adjacency_t * a = va_arg (*va, ip_adjacency_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++)
+ {
+ serialize_integer (m, a[i].heap_handle, sizeof (a[i].heap_handle));
+ serialize_integer (m, a[i].n_adj, sizeof (a[i].n_adj));
+ serialize_integer (m, a[i].lookup_next_index, sizeof (a[i].lookup_next_index_as_int));
+ switch (a[i].lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_LOCAL:
+ serialize_integer (m, a[i].if_address_index, sizeof (a[i].if_address_index));
+ break;
+
+ case IP_LOOKUP_NEXT_ARP:
+ serialize_integer (m, a[i].if_address_index, sizeof (a[i].if_address_index));
+ serialize_integer (m, a[i].rewrite_header.sw_if_index, sizeof (a[i].rewrite_header.sw_if_index));
+ break;
+
+ case IP_LOOKUP_NEXT_REWRITE:
+ serialize (m, serialize_vnet_rewrite, &a[i].rewrite_header, sizeof (a[i].rewrite_data));
+ break;
+
+ default:
+ /* nothing else to serialize. */
+ break;
+ }
+ }
+}
+
+void unserialize_vec_ip_adjacency (serialize_main_t * m, va_list * va)
+{
+ ip_adjacency_t * a = va_arg (*va, ip_adjacency_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ ip_poison_adjacencies (a, n);
+ for (i = 0; i < n; i++)
+ {
+ unserialize_integer (m, &a[i].heap_handle, sizeof (a[i].heap_handle));
+ unserialize_integer (m, &a[i].n_adj, sizeof (a[i].n_adj));
+ unserialize_integer (m, &a[i].lookup_next_index_as_int, sizeof (a[i].lookup_next_index_as_int));
+ switch (a[i].lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_LOCAL:
+ unserialize_integer (m, &a[i].if_address_index, sizeof (a[i].if_address_index));
+ break;
+
+ case IP_LOOKUP_NEXT_ARP:
+ unserialize_integer (m, &a[i].if_address_index, sizeof (a[i].if_address_index));
+ unserialize_integer (m, &a[i].rewrite_header.sw_if_index, sizeof (a[i].rewrite_header.sw_if_index));
+ break;
+
+ case IP_LOOKUP_NEXT_REWRITE:
+ unserialize (m, unserialize_vnet_rewrite, &a[i].rewrite_header, sizeof (a[i].rewrite_data));
+ break;
+
+ default:
+ /* nothing else to unserialize. */
+ break;
+ }
+ }
+}
+
+static void serialize_vec_ip_multipath_next_hop (serialize_main_t * m, va_list * va)
+{
+ ip_multipath_next_hop_t * nh = va_arg (*va, ip_multipath_next_hop_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++)
+ {
+ serialize_integer (m, nh[i].next_hop_adj_index, sizeof (nh[i].next_hop_adj_index));
+ serialize_integer (m, nh[i].weight, sizeof (nh[i].weight));
+ }
+}
+
+static void unserialize_vec_ip_multipath_next_hop (serialize_main_t * m, va_list * va)
+{
+ ip_multipath_next_hop_t * nh = va_arg (*va, ip_multipath_next_hop_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++)
+ {
+ unserialize_integer (m, &nh[i].next_hop_adj_index, sizeof (nh[i].next_hop_adj_index));
+ unserialize_integer (m, &nh[i].weight, sizeof (nh[i].weight));
+ }
+}
+
+static void serialize_vec_ip_multipath_adjacency (serialize_main_t * m, va_list * va)
+{
+ ip_multipath_adjacency_t * a = va_arg (*va, ip_multipath_adjacency_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++)
+ {
+#define foreach_ip_multipath_adjacency_field \
+ _ (adj_index) _ (n_adj_in_block) _ (reference_count) \
+ _ (normalized_next_hops.count) \
+ _ (normalized_next_hops.heap_offset) \
+ _ (normalized_next_hops.heap_handle) \
+ _ (unnormalized_next_hops.count) \
+ _ (unnormalized_next_hops.heap_offset) \
+ _ (unnormalized_next_hops.heap_handle)
+
+#define _(f) serialize_integer (m, a[i].f, sizeof (a[i].f));
+ foreach_ip_multipath_adjacency_field;
+#undef _
+ }
+}
+
+static void unserialize_vec_ip_multipath_adjacency (serialize_main_t * m, va_list * va)
+{
+ ip_multipath_adjacency_t * a = va_arg (*va, ip_multipath_adjacency_t *);
+ u32 n = va_arg (*va, u32);
+ u32 i;
+ for (i = 0; i < n; i++)
+ {
+#define _(f) unserialize_integer (m, &a[i].f, sizeof (a[i].f));
+ foreach_ip_multipath_adjacency_field;
+#undef _
+ }
+}
+
+void serialize_ip_lookup_main (serialize_main_t * m, va_list * va)
+{
+ ip_lookup_main_t * lm = va_arg (*va, ip_lookup_main_t *);
+
+ /* If this isn't true you need to call e.g. ip4_maybe_remap_adjacencies
+ to make it true. */
+ ASSERT (lm->n_adjacency_remaps == 0);
+
+ serialize (m, serialize_heap, lm->adjacency_heap, serialize_vec_ip_adjacency);
+
+ serialize (m, serialize_heap, lm->next_hop_heap, serialize_vec_ip_multipath_next_hop);
+ vec_serialize (m, lm->multipath_adjacencies, serialize_vec_ip_multipath_adjacency);
+
+ /* Adjacency counters (FIXME disabled for now). */
+ if (0)
+ serialize (m, serialize_vlib_combined_counter_main, &lm->adjacency_counters, /* incremental */ 0);
+}
+
+void unserialize_ip_lookup_main (serialize_main_t * m, va_list * va)
+{
+ ip_lookup_main_t * lm = va_arg (*va, ip_lookup_main_t *);
+
+ unserialize (m, unserialize_heap, &lm->adjacency_heap, unserialize_vec_ip_adjacency);
+ unserialize (m, unserialize_heap, &lm->next_hop_heap, unserialize_vec_ip_multipath_next_hop);
+ vec_unserialize (m, &lm->multipath_adjacencies, unserialize_vec_ip_multipath_adjacency);
+
+ /* Build hash table from unserialized data. */
+ {
+ ip_multipath_adjacency_t * a;
+
+ vec_foreach (a, lm->multipath_adjacencies)
+ {
+ if (a->n_adj_in_block > 0 && a->reference_count > 0)
+ hash_set (lm->multipath_adjacency_by_next_hops,
+ ip_next_hop_hash_key_from_handle (a->normalized_next_hops.heap_handle),
+ a - lm->multipath_adjacencies);
+ }
+ }
+
+ /* Validate adjacency counters. */
+ vlib_validate_combined_counter (&lm->adjacency_counters,
+ vec_len (lm->adjacency_heap) - 1);
+
+ /* Adjacency counters (FIXME disabled for now). */
+ if (0)
+ unserialize (m, unserialize_vlib_combined_counter_main, &lm->adjacency_counters, /* incremental */ 0);
+}
+
+void ip_lookup_init (ip_lookup_main_t * lm, u32 is_ip6)
+{
+ ip_adjacency_t * adj;
+
+ /* Hand-craft special miss adjacency to use when nothing matches in the
+ routing table. Same for drop adjacency. */
+ adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, &lm->miss_adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_MISS;
+ ASSERT (lm->miss_adj_index == IP_LOOKUP_MISS_ADJ_INDEX);
+
+ adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, &lm->drop_adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_DROP;
+
+ adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, &lm->local_adj_index);
+ adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+ adj->if_address_index = ~0;
+
+ if (! lm->fib_result_n_bytes)
+ lm->fib_result_n_bytes = sizeof (uword);
+
+ lm->multipath_adjacency_by_next_hops
+ = hash_create2 (/* elts */ 0,
+ /* user */ pointer_to_uword (lm),
+ /* value_bytes */ sizeof (uword),
+ ip_next_hop_hash_key_sum,
+ ip_next_hop_hash_key_equal,
+ /* format pair/arg */
+ 0, 0);
+
+ /* 1% max error tolerance for multipath. */
+ lm->multipath_next_hop_error_tolerance = .01;
+
+ lm->is_ip6 = is_ip6;
+ if (is_ip6)
+ {
+ lm->format_address_and_length = format_ip6_address_and_length;
+ mhash_init (&lm->address_to_if_address_index, sizeof (uword),
+ sizeof (ip6_address_fib_t));
+ }
+ else
+ {
+ lm->format_address_and_length = format_ip4_address_and_length;
+ mhash_init (&lm->address_to_if_address_index, sizeof (uword),
+ sizeof (ip4_address_fib_t));
+ }
+
+ {
+ int i;
+
+ /* Setup all IP protocols to be punted and builtin-unknown. */
+ for (i = 0; i < 256; i++)
+ {
+ lm->local_next_by_ip_protocol[i] = IP_LOCAL_NEXT_PUNT;
+ lm->builtin_protocol_by_ip_protocol[i] = IP_BUILTIN_PROTOCOL_UNKNOWN;
+ }
+#if 0
+ /* Eliot's TCP doesn't actually work */
+ lm->local_next_by_ip_protocol[IP_PROTOCOL_TCP] = IP_LOCAL_NEXT_TCP_LOOKUP;
+ lm->builtin_protocol_by_ip_protocol[IP_PROTOCOL_TCP] =
+ IP_BUILTIN_PROTOCOL_TCP;
+#endif
+
+ lm->local_next_by_ip_protocol[IP_PROTOCOL_UDP] = IP_LOCAL_NEXT_UDP_LOOKUP;
+ lm->local_next_by_ip_protocol[is_ip6 ? IP_PROTOCOL_ICMP6 : IP_PROTOCOL_ICMP] = IP_LOCAL_NEXT_ICMP;
+ lm->builtin_protocol_by_ip_protocol[IP_PROTOCOL_UDP] = IP_BUILTIN_PROTOCOL_UDP;
+ lm->builtin_protocol_by_ip_protocol[is_ip6 ? IP_PROTOCOL_ICMP6 : IP_PROTOCOL_ICMP] = IP_BUILTIN_PROTOCOL_ICMP;
+ }
+}
+
+u8 * format_ip_flow_hash_config (u8 * s, va_list * args)
+{
+ u32 flow_hash_config = va_arg (*args, u32);
+
+#define _(n,v) if (flow_hash_config & v) s = format (s, "%s ", #n);
+ foreach_flow_hash_bit;
+#undef _
+
+ return s;
+}
+
+u8 * format_ip_lookup_next (u8 * s, va_list * args)
+{
+ ip_lookup_next_t n = va_arg (*args, ip_lookup_next_t);
+ char * t = 0;
+
+ switch (n)
+ {
+ default:
+ s = format (s, "unknown %d", n);
+ return s;
+
+ case IP_LOOKUP_NEXT_MISS: t = "miss"; break;
+ case IP_LOOKUP_NEXT_DROP: t = "drop"; break;
+ case IP_LOOKUP_NEXT_PUNT: t = "punt"; break;
+ case IP_LOOKUP_NEXT_LOCAL: t = "local"; break;
+ case IP_LOOKUP_NEXT_ARP: t = "arp"; break;
+ case IP_LOOKUP_NEXT_CLASSIFY: t = "classify"; break;
+ case IP_LOOKUP_NEXT_MAP: t = "map"; break;
+ case IP_LOOKUP_NEXT_MAP_T: t = "map-t"; break;
+ case IP_LOOKUP_NEXT_SIXRD: t = "sixrd"; break;
+ case IP_LOOKUP_NEXT_REWRITE:
+ break;
+ }
+
+ if (t)
+ vec_add (s, t, strlen (t));
+
+ return s;
+}
+
+static u8 * format_ip_interface_address (u8 * s, va_list * args)
+{
+ ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
+ u32 if_address_index = va_arg (*args, u32);
+ ip_interface_address_t * ia = pool_elt_at_index (lm->if_address_pool, if_address_index);
+ void * a = ip_interface_address_get_address (lm, ia);
+
+ if (lm->is_ip6)
+ return format (s, "%U", format_ip6_address_and_length, a, ia->address_length);
+ else
+ return format (s, "%U", format_ip4_address_and_length, a, ia->address_length);
+}
+
+u8 * format_ip_adjacency (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
+ u32 adj_index = va_arg (*args, u32);
+ ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
+
+ switch (adj->lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_REWRITE:
+ s = format (s, "%U",
+ format_vnet_rewrite,
+ vnm->vlib_main, &adj->rewrite_header, sizeof (adj->rewrite_data));
+ break;
+
+ default:
+ s = format (s, "%U", format_ip_lookup_next, adj->lookup_next_index);
+ if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
+ s = format (s, " %U",
+ format_vnet_sw_interface_name,
+ vnm,
+ vnet_get_sw_interface (vnm, adj->rewrite_header.sw_if_index));
+ switch (adj->lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_ARP:
+ case IP_LOOKUP_NEXT_LOCAL:
+ if (adj->if_address_index != ~0)
+ s = format (s, " %U", format_ip_interface_address, lm, adj->if_address_index);
+ break;
+
+ case IP_LOOKUP_NEXT_CLASSIFY:
+ s = format (s, " table %d", adj->classify_table_index);
+
+ default:
+ break;
+ }
+ break;
+ }
+ if (adj->explicit_fib_index != ~0 && adj->explicit_fib_index != 0)
+ s = format (s, " lookup fib index %d", adj->explicit_fib_index);
+
+ return s;
+}
+
+u8 * format_ip_adjacency_packet_data (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *);
+ u32 adj_index = va_arg (*args, u32);
+ u8 * packet_data = va_arg (*args, u8 *);
+ u32 n_packet_data_bytes = va_arg (*args, u32);
+ ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index);
+
+ switch (adj->lookup_next_index)
+ {
+ case IP_LOOKUP_NEXT_REWRITE:
+ s = format (s, "%U",
+ format_vnet_rewrite_header,
+ vnm->vlib_main, &adj->rewrite_header, packet_data, n_packet_data_bytes);
+ break;
+
+ default:
+ break;
+ }
+
+ return s;
+}
+
+static uword unformat_ip_lookup_next (unformat_input_t * input, va_list * args)
+{
+ ip_lookup_next_t * result = va_arg (*args, ip_lookup_next_t *);
+ ip_lookup_next_t n;
+
+ if (unformat (input, "drop"))
+ n = IP_LOOKUP_NEXT_DROP;
+
+ else if (unformat (input, "punt"))
+ n = IP_LOOKUP_NEXT_PUNT;
+
+ else if (unformat (input, "local"))
+ n = IP_LOOKUP_NEXT_LOCAL;
+
+ else if (unformat (input, "arp"))
+ n = IP_LOOKUP_NEXT_ARP;
+
+ else if (unformat (input, "classify"))
+ n = IP_LOOKUP_NEXT_CLASSIFY;
+
+ else
+ return 0;
+
+ *result = n;
+ return 1;
+}
+
+static uword unformat_ip_adjacency (unformat_input_t * input, va_list * args)
+{
+ vlib_main_t * vm = va_arg (*args, vlib_main_t *);
+ ip_adjacency_t * adj = va_arg (*args, ip_adjacency_t *);
+ u32 node_index = va_arg (*args, u32);
+ vnet_main_t * vnm = vnet_get_main();
+ u32 sw_if_index, is_ip6;
+ ip46_address_t a46;
+ ip_lookup_next_t next;
+
+ is_ip6 = node_index == ip6_rewrite_node.index;
+ adj->rewrite_header.node_index = node_index;
+ adj->explicit_fib_index = ~0;
+
+ if (unformat (input, "arp %U %U",
+ unformat_vnet_sw_interface, vnm, &sw_if_index,
+ unformat_ip46_address, &a46, is_ip6))
+ {
+ ip_lookup_main_t * lm = is_ip6 ? &ip6_main.lookup_main : &ip4_main.lookup_main;
+ ip_adjacency_t * a_adj;
+ u32 adj_index;
+
+ if (is_ip6)
+ adj_index = ip6_fib_lookup (&ip6_main, sw_if_index, &a46.ip6);
+ else
+ adj_index = ip4_fib_lookup (&ip4_main, sw_if_index, &a46.ip4);
+
+ a_adj = ip_get_adjacency (lm, adj_index);
+
+ if (a_adj->rewrite_header.sw_if_index != sw_if_index)
+ return 0;
+
+ if (is_ip6)
+ ip6_adjacency_set_interface_route (vnm, adj, sw_if_index, a_adj->if_address_index);
+ else
+ ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a_adj->if_address_index);
+ }
+
+ else if (unformat_user (input, unformat_ip_lookup_next, &next))
+ {
+ adj->lookup_next_index = next;
+ adj->if_address_index = ~0;
+ if (next == IP_LOOKUP_NEXT_LOCAL)
+ (void) unformat (input, "%d", &adj->if_address_index);
+ else if (next == IP_LOOKUP_NEXT_CLASSIFY)
+ if (!unformat (input, "%d", &adj->classify_table_index))
+ {
+ clib_warning ("classify adj must specify table index");
+ return 0;
+ }
+ }
+
+ else if (unformat_user (input,
+ unformat_vnet_rewrite,
+ vm, &adj->rewrite_header, sizeof (adj->rewrite_data)))
+ adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+
+ else
+ return 0;
+
+ return 1;
+}
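+
+/*
+ * Adjacency syntax accepted above, one of:
+ *
+ *   arp <interface> <next-hop-address>
+ *   drop | punt | local [if-address-index] | classify <table-index>
+ *   <rewrite specification> (unformat_vnet_rewrite)
+ */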
+
+clib_error_t *
+vnet_ip_route_cmd (vlib_main_t * vm, unformat_input_t * main_input, vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 table_id, is_del;
+ u32 weight, * weights = 0;
+ u32 * table_ids = 0;
+ u32 sw_if_index, * sw_if_indices = 0;
+ ip4_address_t ip4_addr, * ip4_dst_addresses = 0, * ip4_via_next_hops = 0;
+ ip6_address_t ip6_addr, * ip6_dst_addresses = 0, * ip6_via_next_hops = 0;
+ u32 dst_address_length, * dst_address_lengths = 0;
+ ip_adjacency_t parse_adj, * add_adj = 0;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ f64 count;
+ u32 outer_table_id;
+
+ is_del = 0;
+ table_id = 0;
+ count = 1;
+
+ /* Get a line of input. */
+ if (! unformat_user (main_input, unformat_line_input, line_input))
+ return 0;
+
+ memset(&parse_adj, 0, sizeof (parse_adj));
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "table %d", &table_id))
+ ;
+ else if (unformat (line_input, "del"))
+ is_del = 1;
+ else if (unformat (line_input, "add"))
+ is_del = 0;
+ else if (unformat (line_input, "count %f", &count))
+ ;
+
+ else if (unformat (line_input, "%U/%d",
+ unformat_ip4_address, &ip4_addr,
+ &dst_address_length))
+ {
+ vec_add1 (ip4_dst_addresses, ip4_addr);
+ vec_add1 (dst_address_lengths, dst_address_length);
+ }
+
+ else if (unformat (line_input, "%U/%d",
+ unformat_ip6_address, &ip6_addr,
+ &dst_address_length))
+ {
+ vec_add1 (ip6_dst_addresses, ip6_addr);
+ vec_add1 (dst_address_lengths, dst_address_length);
+ }
+
+ else if (unformat (line_input, "via %U %U weight %u",
+ unformat_ip4_address, &ip4_addr,
+ unformat_vnet_sw_interface, vnm, &sw_if_index,
+ &weight))
+ {
+ vec_add1 (ip4_via_next_hops, ip4_addr);
+ vec_add1 (sw_if_indices, sw_if_index);
+ vec_add1 (weights, weight);
+ vec_add1 (table_ids, (u32)~0);
+ }
+
+ else if (unformat (line_input, "via %U %U weight %u",
+ unformat_ip6_address, &ip6_addr,
+ unformat_vnet_sw_interface, vnm, &sw_if_index,
+ &weight))
+ {
+ vec_add1 (ip6_via_next_hops, ip6_addr);
+ vec_add1 (sw_if_indices, sw_if_index);
+ vec_add1 (weights, weight);
+ vec_add1 (table_ids, (u32)~0);
+ }
+
+ else if (unformat (line_input, "via %U %U",
+ unformat_ip4_address, &ip4_addr,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ vec_add1 (ip4_via_next_hops, ip4_addr);
+ vec_add1 (sw_if_indices, sw_if_index);
+ vec_add1 (weights, 1);
+ vec_add1 (table_ids, (u32)~0);
+ }
+
+ else if (unformat (line_input, "via %U %U",
+ unformat_ip6_address, &ip6_addr,
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ vec_add1 (ip6_via_next_hops, ip6_addr);
+ vec_add1 (sw_if_indices, sw_if_index);
+ vec_add1 (weights, 1);
+ vec_add1 (table_ids, (u32)~0);
+ }
+ else if (unformat (line_input, "via %U",
+ unformat_ip4_address, &ip4_addr))
+ {
+ vec_add1 (ip4_via_next_hops, ip4_addr);
+ vec_add1 (sw_if_indices, (u32)~0);
+ vec_add1 (weights, 1);
+ vec_add1 (table_ids, table_id);
+ }
+ else if (unformat (line_input, "via %U",
+ unformat_ip6_address, &ip6_addr))
+ {
+ vec_add1 (ip6_via_next_hops, ip6_addr);
+ vec_add1 (sw_if_indices, (u32)~0);
+ vec_add1 (weights, 1);
+ vec_add1 (table_ids, (u32)table_id);
+ }
+
+ else if (vec_len (ip4_dst_addresses) > 0
+ && unformat (line_input, "via %U",
+ unformat_ip_adjacency, vm, &parse_adj, ip4_rewrite_node.index))
+ vec_add1 (add_adj, parse_adj);
+
+ else if (vec_len (ip6_dst_addresses) > 0
+ && unformat (line_input, "via %U",
+ unformat_ip_adjacency, vm, &parse_adj, ip6_rewrite_node.index))
+ vec_add1 (add_adj, parse_adj);
+ else if (unformat (line_input, "lookup in table %d", &outer_table_id))
+ {
+ uword * p;
+
+ if (vec_len (ip4_dst_addresses) > 0)
+ p = hash_get (ip4_main.fib_index_by_table_id, outer_table_id);
+ else
+ p = hash_get (ip6_main.fib_index_by_table_id, outer_table_id);
+
+ if (p == 0)
+ {
+ error = clib_error_return (0, "Nonexistent outer table id %d",
+ outer_table_id);
+ goto done;
+ }
+
+ parse_adj.lookup_next_index = IP_LOOKUP_NEXT_LOCAL;
+ parse_adj.explicit_fib_index = p[0];
+ vec_add1 (add_adj, parse_adj);
+ }
+ else
+ {
+ error = unformat_parse_error (line_input);
+ goto done;
+ }
+ }
+
+ unformat_free (line_input);
+
+ if (vec_len (ip4_dst_addresses) + vec_len (ip6_dst_addresses) == 0)
+ {
+ error = clib_error_return (0, "expected ip4/ip6 destination address/length.");
+ goto done;
+ }
+
+ if (vec_len (ip4_dst_addresses) > 0 && vec_len (ip6_dst_addresses) > 0)
+ {
+ error = clib_error_return (0, "mixed ip4/ip6 address/length.");
+ goto done;
+ }
+
+ if (vec_len (ip4_dst_addresses) > 0 && vec_len (ip6_via_next_hops) > 0)
+ {
+ error = clib_error_return (0, "ip4 destinations with ip6 next hops.");
+ goto done;
+ }
+
+ if (vec_len (ip6_dst_addresses) > 0 && vec_len (ip4_via_next_hops) > 0)
+ {
+ error = clib_error_return (0, "ip6 destinations with ip4 next hops.");
+ goto done;
+ }
+
+ if (! is_del && vec_len (add_adj) + vec_len (weights) == 0)
+ {
+ error = clib_error_return (0, "no next hops or adjacencies to add.");
+ goto done;
+ }
+
+ if (vec_len(ip4_via_next_hops))
+ {
+ if (sw_if_indices[0] == (u32)~0)
+ {
+ u32 ai;
+ uword * p;
+ u32 fib_index;
+ ip_adjacency_t *nh_adj;
+
+ p = hash_get (ip4_main.fib_index_by_table_id, table_ids[0]);
+ if (p == 0)
+ {
+ error = clib_error_return (0, "Nonexistent FIB id %d",
+ table_ids[0]);
+ goto done;
+ }
+
+ fib_index = p[0];
+
+ ai = ip4_fib_lookup_with_table (&ip4_main,
+ fib_index,
+ ip4_via_next_hops,
+ 1 /* disable default route */);
+ if (ai == 0)
+ {
+ error = clib_error_return (0, "next hop %U not in FIB",
+ format_ip4_address,
+ ip4_via_next_hops);
+ goto done;
+ }
+ nh_adj = ip_get_adjacency (&ip4_main.lookup_main, ai);
+ vec_add1 (add_adj, nh_adj[0]);
+ }
+ }
+ if (vec_len(ip6_via_next_hops))
+ {
+ if (sw_if_indices[0] == (u32)~0)
+ {
+ u32 ai;
+ uword * p;
+ u32 fib_index;
+ ip_adjacency_t *nh_adj;
+
+ p = hash_get (ip6_main.fib_index_by_table_id, table_ids[0]);
+ if (p == 0)
+ {
+ error = clib_error_return (0, "Nonexistent FIB id %d",
+ table_ids[0]);
+ goto done;
+ }
+
+ fib_index = p[0];
+ ai = ip6_fib_lookup_with_table (&ip6_main,
+ fib_index,
+ ip6_via_next_hops);
+ if (ai == 0)
+ {
+ error = clib_error_return (0, "next hop %U not in FIB",
+ format_ip6_address,
+ ip6_via_next_hops);
+ goto done;
+ }
+ nh_adj = ip_get_adjacency (&ip6_main.lookup_main, ai);
+ vec_add1 (add_adj, nh_adj[0]);
+ }
+ }
+
+ {
+ int i;
+ ip4_main_t * im4 = &ip4_main;
+ ip6_main_t * im6 = &ip6_main;
+
+ for (i = 0; i < vec_len (ip4_dst_addresses); i++)
+ {
+ ip4_add_del_route_args_t a;
+
+ memset (&a, 0, sizeof (a));
+ a.flags = IP4_ROUTE_FLAG_TABLE_ID;
+ a.table_index_or_table_id = table_id;
+ a.dst_address = ip4_dst_addresses[i];
+ a.dst_address_length = dst_address_lengths[i];
+ a.adj_index = ~0;
+
+ if (is_del)
+ {
+ if (vec_len (ip4_via_next_hops) == 0)
+ {
+ uword * dst_hash, * dst_result;
+ u32 dst_address_u32;
+ ip4_fib_t * fib;
+
+ fib = find_ip4_fib_by_table_index_or_id (im4, table_id,
+ 0 /* by table id */);
+
+ a.flags |= IP4_ROUTE_FLAG_DEL;
+ dst_address_u32 = a.dst_address.as_u32
+ & im4->fib_masks[a.dst_address_length];
+
+ dst_hash =
+ fib->adj_index_by_dst_address[a.dst_address_length];
+ dst_result = hash_get (dst_hash, dst_address_u32);
+ if (dst_result)
+ a.adj_index = dst_result[0];
+ else
+ {
+ clib_warning ("%U/%d not in FIB",
+ format_ip4_address, &a.dst_address,
+ a.dst_address_length);
+ continue;
+ }
+
+ ip4_add_del_route (im4, &a);
+ ip4_maybe_remap_adjacencies (im4, table_id,
+ IP4_ROUTE_FLAG_TABLE_ID);
+ }
+ else
+ {
+ u32 i, j, n, f, incr;
+ ip4_address_t dst = a.dst_address;
+ f64 t[2];
+ n = count;
+ t[0] = vlib_time_now (vm);
+ incr = 1<<(32 - a.dst_address_length);
+ for (i = 0; i < n; i++)
+ {
+ f = i + 1 < n ? IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0;
+ a.dst_address = dst;
+ for (j = 0; j < vec_len (ip4_via_next_hops); j++)
+ {
+ if (table_ids[j] != (u32)~0)
+ {
+ uword * p = hash_get (im4->fib_index_by_table_id,
+ table_ids[j]);
+ if (p == 0)
+ {
+ clib_warning ("no such FIB table %d",
+ table_ids[j]);
+ continue;
+ }
+ table_ids[j] = p[0];
+ }
+
+ ip4_add_del_route_next_hop (im4,
+ IP4_ROUTE_FLAG_DEL | f,
+ &a.dst_address,
+ a.dst_address_length,
+ &ip4_via_next_hops[j],
+ sw_if_indices[j],
+ weights[j], (u32)~0,
+ table_ids[j] /* fib index */);
+ }
+ dst.as_u32 = clib_host_to_net_u32 (incr + clib_net_to_host_u32 (dst.as_u32));
+ }
+ t[1] = vlib_time_now (vm);
+ if (count > 1)
+ vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0]));
+ }
+ }
+ else
+ {
+ if (vec_len (add_adj) > 0)
+ {
+ a.flags |= IP4_ROUTE_FLAG_ADD;
+ a.add_adj = add_adj;
+ a.n_add_adj = vec_len (add_adj);
+
+ ip4_add_del_route (im4, &a);
+ }
+ else if (vec_len (ip4_via_next_hops) > 0)
+ {
+ u32 i, j, n, f, incr;
+ ip4_address_t dst = a.dst_address;
+ f64 t[2];
+ n = count;
+ t[0] = vlib_time_now (vm);
+ incr = 1<<(32 - a.dst_address_length);
+ for (i = 0; i < n; i++)
+ {
+ f = i + 1 < n ? IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0;
+ a.dst_address = dst;
+ for (j = 0; j < vec_len (ip4_via_next_hops); j++)
+ {
+ if (table_ids[j] != (u32)~0)
+ {
+ uword * p = hash_get (im4->fib_index_by_table_id,
+ table_ids[j]);
+ if (p == 0)
+ {
+ clib_warning ("no such FIB table %d",
+ table_ids[j]);
+ continue;
+ }
+ table_ids[j] = p[0];
+ }
+ ip4_add_del_route_next_hop (im4,
+ IP4_ROUTE_FLAG_ADD | f,
+ &a.dst_address,
+ a.dst_address_length,
+ &ip4_via_next_hops[j],
+ sw_if_indices[j],
+ weights[j], (u32)~0,
+ table_ids[j] /* fib index */);
+ }
+ dst.as_u32 = clib_host_to_net_u32 (incr + clib_net_to_host_u32 (dst.as_u32));
+ }
+ t[1] = vlib_time_now (vm);
+ if (count > 1)
+ vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0]));
+ }
+ }
+ }
+
+ for (i = 0; i < vec_len (ip6_dst_addresses); i++)
+ {
+ ip6_add_del_route_args_t a;
+
+ memset (&a, 0, sizeof (a));
+ a.flags = IP6_ROUTE_FLAG_TABLE_ID;
+ a.table_index_or_table_id = table_id;
+ a.dst_address = ip6_dst_addresses[i];
+ a.dst_address_length = dst_address_lengths[i];
+ a.adj_index = ~0;
+
+ if (is_del)
+ {
+ if (vec_len (ip6_via_next_hops) == 0)
+ {
+ BVT(clib_bihash_kv) kv, value;
+ ip6_address_t dst_address;
+ ip6_fib_t * fib;
+
+ fib = find_ip6_fib_by_table_index_or_id (im6, table_id,
+ 0 /* by table id */);
+
+	      a.flags |= IP6_ROUTE_FLAG_DEL;
+
+ dst_address = ip6_dst_addresses[i];
+
+		  ip6_address_mask (&dst_address,
+				    &im6->fib_masks[a.dst_address_length]);
+
+ kv.key[0] = dst_address.as_u64[0];
+ kv.key[1] = dst_address.as_u64[1];
+ kv.key[2] = ((u64)(fib - im6->fibs)<<32)
+ | a.dst_address_length;
+
+ if (BV(clib_bihash_search)(&im6->ip6_lookup_table,
+ &kv, &value) == 0)
+ a.adj_index = value.value;
+ else
+ {
+ clib_warning ("%U/%d not in FIB",
+ format_ip6_address, &a.dst_address,
+ a.dst_address_length);
+ continue;
+ }
+
+ ip6_add_del_route (im6, &a);
+ ip6_maybe_remap_adjacencies (im6, table_id,
+ IP6_ROUTE_FLAG_TABLE_ID);
+ }
+ else
+ {
+ u32 i;
+ for (i = 0; i < vec_len (ip6_via_next_hops); i++)
+ {
+ ip6_add_del_route_next_hop (im6,
+ IP6_ROUTE_FLAG_DEL,
+ &a.dst_address,
+ a.dst_address_length,
+ &ip6_via_next_hops[i],
+ sw_if_indices[i],
+ weights[i], (u32)~0,
+ table_ids[i] /* fib index */);
+ }
+ }
+ }
+ else
+ {
+ if (vec_len (add_adj) > 0)
+ {
+ a.flags |= IP6_ROUTE_FLAG_ADD;
+ a.add_adj = add_adj;
+ a.n_add_adj = vec_len (add_adj);
+
+ ip6_add_del_route (im6, &a);
+ }
+ else if (vec_len (ip6_via_next_hops) > 0)
+ {
+ u32 i;
+ for (i = 0; i < vec_len (ip6_via_next_hops); i++)
+ {
+ ip6_add_del_route_next_hop (im6,
+ IP6_ROUTE_FLAG_ADD,
+ &a.dst_address,
+ a.dst_address_length,
+ &ip6_via_next_hops[i],
+ sw_if_indices[i],
+ weights[i], (u32)~0,
+ table_ids[i]);
+ }
+ }
+ }
+ }
+ }
+
+ done:
+ vec_free (add_adj);
+ vec_free (weights);
+ vec_free (dst_address_lengths);
+ vec_free (ip4_dst_addresses);
+ vec_free (ip6_dst_addresses);
+ vec_free (ip4_via_next_hops);
+ vec_free (ip6_via_next_hops);
+ return error;
+}
+
+VLIB_CLI_COMMAND (vlib_cli_ip_command, static) = {
+ .path = "ip",
+ .short_help = "Internet protocol (IP) commands",
+};
+
+VLIB_CLI_COMMAND (vlib_cli_show_ip_command, static) = {
+ .path = "show ip",
+ .short_help = "Internet protocol (IP) show commands",
+};
+
+VLIB_CLI_COMMAND (vlib_cli_show_ip4_command, static) = {
+ .path = "show ip4",
+ .short_help = "Internet protocol version 4 (IP4) show commands",
+};
+
+VLIB_CLI_COMMAND (vlib_cli_show_ip6_command, static) = {
+ .path = "show ip6",
+ .short_help = "Internet protocol version 6 (IP6) show commands",
+};
+
+VLIB_CLI_COMMAND (ip_route_command, static) = {
+ .path = "ip route",
+ .short_help = "Add/delete IP routes",
+ .function = vnet_ip_route_cmd,
+};
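+
+/*
+ * Examples matching the grammar parsed in vnet_ip_route_cmd:
+ *
+ *   ip route add 6.0.0.0/8 via 6.0.0.1 GigabitEthernet2/0/0
+ *   ip route add 6.0.0.0/8 via 6.0.0.1 GigabitEthernet2/0/0 weight 3
+ *   ip route add table 7 8.8.8.8/32 lookup in table 0
+ *   ip route add count 1000 10.0.0.0/24 via 6.0.0.1 GigabitEthernet2/0/0
+ *   ip route del 6.0.0.0/8 via 6.0.0.1 GigabitEthernet2/0/0
+ *
+ * Interface names are illustrative; "count" bulk-adds consecutive
+ * prefixes and prints a routes/sec figure.
+ */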
+
+/*
+ * The next two routines address a longstanding script hemorrhoid.
+ * Probing a v4 or v6 neighbor needs to appear to be synchronous,
+ * or dependent route-adds will simply fail.
+ */
+static clib_error_t *
+ip6_probe_neighbor_wait (vlib_main_t *vm, ip6_address_t * a, u32 sw_if_index,
+ int retry_count)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * e;
+ int i;
+ int resolved = 0;
+ uword event_type;
+ uword *event_data = 0;
+
+ ASSERT (vlib_in_process_context(vm));
+
+ if (retry_count > 0)
+ vnet_register_ip6_neighbor_resolution_event
+ (vnm, a, vlib_get_current_process (vm)->node_runtime.node_index,
+ 1 /* event */, 0 /* data */);
+
+ for (i = 0; i < retry_count; i++)
+ {
+ /* The interface may be down, etc. */
+ e = ip6_probe_neighbor (vm, a, sw_if_index);
+
+ if (e)
+ return e;
+
+ vlib_process_wait_for_event_or_clock (vm, 1.0);
+ event_type = vlib_process_get_events (vm, &event_data);
+ switch (event_type)
+ {
+ case 1: /* resolved... */
+ vlib_cli_output (vm, "Resolved %U",
+ format_ip6_address, a);
+ resolved = 1;
+ goto done;
+
+ case ~0: /* timeout */
+ break;
+
+ default:
+ clib_warning ("unknown event_type %d", event_type);
+ }
+ }
+
+ done:
+  vec_free (event_data);
+
+ if (!resolved)
+ return clib_error_return (0, "Resolution failed for %U",
+ format_ip6_address, a);
+ return 0;
+}
+
+static clib_error_t *
+ip4_probe_neighbor_wait (vlib_main_t *vm, ip4_address_t * a, u32 sw_if_index,
+ int retry_count)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * e;
+ int i;
+ int resolved = 0;
+ uword event_type;
+ uword *event_data = 0;
+
+ ASSERT (vlib_in_process_context(vm));
+
+ if (retry_count > 0)
+ vnet_register_ip4_arp_resolution_event
+ (vnm, a, vlib_get_current_process (vm)->node_runtime.node_index,
+ 1 /* event */, 0 /* data */);
+
+ for (i = 0; i < retry_count; i++)
+ {
+ /* The interface may be down, etc. */
+ e = ip4_probe_neighbor (vm, a, sw_if_index);
+
+ if (e)
+ return e;
+
+ vlib_process_wait_for_event_or_clock (vm, 1.0);
+ event_type = vlib_process_get_events (vm, &event_data);
+ switch (event_type)
+ {
+ case 1: /* resolved... */
+ vlib_cli_output (vm, "Resolved %U",
+ format_ip4_address, a);
+ resolved = 1;
+ goto done;
+
+ case ~0: /* timeout */
+ break;
+
+ default:
+ clib_warning ("unknown event_type %d", event_type);
+ }
+ }
+
+ done:
+  vec_free (event_data);
+
+ if (!resolved)
+ return clib_error_return (0, "Resolution failed for %U",
+ format_ip4_address, a);
+ return 0;
+}
+
+static clib_error_t *
+probe_neighbor_address (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t a4;
+ ip6_address_t a6;
+ clib_error_t * error = 0;
+ u32 sw_if_index = ~0;
+ int retry_count = 3;
+ int is_ip4 = 1;
+ int address_set = 0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat_user (line_input, unformat_vnet_sw_interface, vnm,
+ &sw_if_index))
+ ;
+ else if (unformat (line_input, "retry %d", &retry_count))
+ ;
+
+ else if (unformat (line_input, "%U", unformat_ip4_address, &a4))
+ address_set++;
+ else if (unformat (line_input, "%U", unformat_ip6_address, &a6))
+ {
+ address_set++;
+ is_ip4 = 0;
+ }
+ else
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "Interface required, not set.");
+ if (address_set == 0)
+ return clib_error_return (0, "ip address required, not set.");
+ if (address_set > 1)
+ return clib_error_return (0, "Multiple ip addresses not supported.");
+
+ if (is_ip4)
+ error = ip4_probe_neighbor_wait (vm, &a4, sw_if_index, retry_count);
+ else
+ error = ip6_probe_neighbor_wait (vm, &a6, sw_if_index, retry_count);
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (ip_probe_neighbor_command, static) = {
+ .path = "ip probe-neighbor",
+ .function = probe_neighbor_address,
+ .short_help = "ip probe-neighbor <intfc> <ip4-addr> | <ip6-addr> [retry nn]",
+};
+
+typedef CLIB_PACKED (struct {
+ ip4_address_t address;
+
+ u32 address_length : 6;
+
+ u32 index : 26;
+}) ip4_route_t;
+
+static clib_error_t *
+ip4_show_fib (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_main_t * im4 = &ip4_main;
+ ip4_route_t * routes, * r;
+ ip4_fib_t * fib;
+ ip_lookup_main_t * lm = &im4->lookup_main;
+ uword * results, i;
+ int verbose, matching, mtrie, include_empty_fibs;
+ ip4_address_t matching_address;
+ u8 clear = 0;
+ int table_id = -1;
+
+ routes = 0;
+ results = 0;
+ verbose = 1;
+ include_empty_fibs = 0;
+ matching = 0;
+ mtrie = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "brief") || unformat (input, "summary")
+ || unformat (input, "sum"))
+ verbose = 0;
+
+ else if (unformat (input, "mtrie"))
+ mtrie = 1;
+
+ else if (unformat (input, "include-empty"))
+ include_empty_fibs = 1;
+
+ else if (unformat (input, "%U", unformat_ip4_address, &matching_address))
+ matching = 1;
+
+ else if (unformat (input, "clear"))
+ clear = 1;
+
+ else if (unformat (input, "table %d", &table_id))
+ ;
+ else
+ break;
+ }
+
+ vec_foreach (fib, im4->fibs)
+ {
+ int fib_not_empty;
+
+ fib_not_empty = 0;
+ for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
+ {
+ uword * hash = fib->adj_index_by_dst_address[i];
+ uword n_elts = hash_elts (hash);
+ if (n_elts)
+ {
+ fib_not_empty = 1;
+ break;
+ }
+ }
+
+ if (fib_not_empty == 0 && include_empty_fibs == 0)
+ continue;
+
+ if (table_id >= 0 && table_id != (int)fib->table_id)
+ continue;
+
+ if (include_empty_fibs)
+ vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U",
+ fib->table_id, fib - im4->fibs,
+ format_ip_flow_hash_config, fib->flow_hash_config);
+
+ /* Show summary? */
+ if (! verbose)
+ {
+ if (include_empty_fibs == 0)
+ vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U",
+ fib->table_id, fib - im4->fibs,
+ format_ip_flow_hash_config, fib->flow_hash_config);
+ vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count");
+ for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
+ {
+ uword * hash = fib->adj_index_by_dst_address[i];
+ uword n_elts = hash_elts (hash);
+ if (n_elts > 0)
+ vlib_cli_output (vm, "%20d%16d", i, n_elts);
+ }
+ continue;
+ }
+
+ if (routes)
+ _vec_len (routes) = 0;
+ if (results)
+ _vec_len (results) = 0;
+
+ for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++)
+ {
+ uword * hash = fib->adj_index_by_dst_address[i];
+ hash_pair_t * p;
+ ip4_route_t x;
+
+ x.address_length = i;
+
+ if (matching)
+ {
+ x.address.as_u32 = matching_address.as_u32 & im4->fib_masks[i];
+ p = hash_get_pair (hash, x.address.as_u32);
+ if (p)
+ {
+ if (lm->fib_result_n_words > 1)
+ {
+ x.index = vec_len (results);
+ vec_add (results, p->value, lm->fib_result_n_words);
+ }
+ else
+ x.index = p->value[0];
+ vec_add1 (routes, x);
+ }
+ }
+ else
+ {
+ hash_foreach_pair (p, hash, ({
+ x.address.data_u32 = p->key;
+ if (lm->fib_result_n_words > 1)
+ {
+ x.index = vec_len (results);
+ vec_add (results, p->value, lm->fib_result_n_words);
+ }
+ else
+ x.index = p->value[0];
+
+ vec_add1 (routes, x);
+ }));
+ }
+ }
+
+ vec_sort (routes, r1, r2,
+ ({ int cmp = ip4_address_compare (&r1->address, &r2->address);
+ cmp ? cmp : ((int) r1->address_length - (int) r2->address_length); }));
+      if (vec_len (routes))
+	{
+	  if (include_empty_fibs == 0)
+	    vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U",
+			     fib->table_id, fib - im4->fibs,
+			     format_ip_flow_hash_config, fib->flow_hash_config);
+	  if (mtrie)
+	    vlib_cli_output (vm, "%U", format_ip4_fib_mtrie, &fib->mtrie);
+	  vlib_cli_output (vm, "%=20s%=16s%=16s%=16s",
+			   "Destination", "Packets", "Bytes", "Adjacency");
+	}
+ vec_foreach (r, routes)
+ {
+ vlib_counter_t c, sum;
+ uword i, j, n_left, n_nhs, adj_index, * result = 0;
+ ip_adjacency_t * adj;
+ ip_multipath_next_hop_t * nhs, tmp_nhs[1];
+
+ adj_index = r->index;
+ if (lm->fib_result_n_words > 1)
+ {
+ result = vec_elt_at_index (results, adj_index);
+ adj_index = result[0];
+ }
+
+ adj = ip_get_adjacency (lm, adj_index);
+ if (adj->n_adj == 1)
+ {
+ nhs = &tmp_nhs[0];
+ nhs[0].next_hop_adj_index = ~0; /* not used */
+ nhs[0].weight = 1;
+ n_nhs = 1;
+ }
+ else
+ {
+ ip_multipath_adjacency_t * madj;
+ madj = vec_elt_at_index (lm->multipath_adjacencies, adj->heap_handle);
+ nhs = heap_elt_at_index (lm->next_hop_heap, madj->normalized_next_hops.heap_offset);
+ n_nhs = madj->normalized_next_hops.count;
+ }
+
+ n_left = nhs[0].weight;
+ vlib_counter_zero (&sum);
+ for (i = j = 0; i < adj->n_adj; i++)
+ {
+ n_left -= 1;
+ vlib_get_combined_counter (&lm->adjacency_counters,
+ adj_index + i, &c);
+ if (clear)
+ vlib_zero_combined_counter (&lm->adjacency_counters,
+ adj_index + i);
+ vlib_counter_add (&sum, &c);
+ if (n_left == 0)
+ {
+ u8 * msg = 0;
+ uword indent;
+
+ if (j == 0)
+ msg = format (msg, "%-20U",
+ format_ip4_address_and_length,
+ r->address.data, r->address_length);
+ else
+ msg = format (msg, "%U", format_white_space, 20);
+
+ msg = format (msg, "%16Ld%16Ld ", sum.packets, sum.bytes);
+
+ indent = vec_len (msg);
+ msg = format (msg, "weight %d, index %d\n%U%U",
+ nhs[j].weight, adj_index + i,
+ format_white_space, indent,
+ format_ip_adjacency,
+ vnm, lm, adj_index + i);
+
+ vlib_cli_output (vm, "%v", msg);
+ vec_free (msg);
+
+ if (result && lm->format_fib_result)
+ vlib_cli_output (vm, "%20s%U", "",
+ lm->format_fib_result, vm, lm, result,
+ i + 1 - nhs[j].weight,
+ nhs[j].weight);
+
+ j++;
+ if (j < n_nhs)
+ {
+ n_left = nhs[j].weight;
+ vlib_counter_zero (&sum);
+ }
+ }
+ }
+ }
+ }
+
+ vec_free (routes);
+ vec_free (results);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ip4_show_fib_command, static) = {
+ .path = "show ip fib",
+ .short_help = "show ip fib [mtrie] [summary] [table <n>] [<ip4-addr>] [clear] [include-empty]",
+ .function = ip4_show_fib,
+};
+
+typedef struct {
+ ip6_address_t address;
+
+ u32 address_length;
+
+ u32 index;
+} ip6_route_t;
+
+typedef struct {
+ u32 fib_index;
+ ip6_route_t ** routep;
+} add_routes_in_fib_arg_t;
+
+static void add_routes_in_fib (BVT(clib_bihash_kv) * kvp, void *arg)
+{
+ add_routes_in_fib_arg_t * ap = arg;
+
+  if ((kvp->key[2]>>32) == ap->fib_index)
+ {
+ ip6_address_t *addr;
+ ip6_route_t * r;
+ addr = (ip6_address_t *) kvp;
+ vec_add2 (*ap->routep, r, 1);
+ r->address = addr[0];
+ r->address_length = kvp->key[2] & 0xFF;
+ r->index = kvp->value;
+ }
+}
+
+typedef struct {
+ u32 fib_index;
+ u64 count_by_prefix_length[129];
+} count_routes_in_fib_at_prefix_length_arg_t;
+
+static void count_routes_in_fib_at_prefix_length
+(BVT(clib_bihash_kv) * kvp, void *arg)
+{
+ count_routes_in_fib_at_prefix_length_arg_t * ap = arg;
+ int mask_width;
+
+ if ((kvp->key[2]>>32) != ap->fib_index)
+ return;
+
+ mask_width = kvp->key[2] & 0xFF;
+
+ ap->count_by_prefix_length[mask_width]++;
+}
+
+
+static clib_error_t *
+ip6_show_fib (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ ip6_main_t * im6 = &ip6_main;
+ ip6_route_t * routes, * r;
+ ip6_fib_t * fib;
+ ip_lookup_main_t * lm = &im6->lookup_main;
+ uword * results;
+ int verbose;
+ BVT(clib_bihash) * h = &im6->ip6_lookup_table;
+ __attribute__((unused)) u8 clear = 0;
+ add_routes_in_fib_arg_t _a, *a=&_a;
+ count_routes_in_fib_at_prefix_length_arg_t _ca, *ca = &_ca;
+
+ routes = 0;
+ results = 0;
+ verbose = 1;
+ if (unformat (input, "brief") || unformat (input, "summary")
+ || unformat (input, "sum"))
+ verbose = 0;
+
+ if (unformat (input, "clear"))
+ clear = 1;
+
+ vlib_cli_output (vm, "FIB lookup table: %d buckets, %lld MB heap",
+ im6->lookup_table_nbuckets, im6->lookup_table_size>>20);
+ vlib_cli_output (vm, "%U", format_mheap, h->mheap, 0 /*verbose*/);
+ vlib_cli_output (vm, " ");
+
+ vec_foreach (fib, im6->fibs)
+ {
+ vlib_cli_output (vm, "VRF %d, fib_index %d, flow hash: %U",
+ fib->table_id, fib - im6->fibs,
+ format_ip_flow_hash_config, fib->flow_hash_config);
+
+ /* Show summary? */
+ if (! verbose)
+ {
+ int len;
+ vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count");
+
+ memset (ca, 0, sizeof(*ca));
+ ca->fib_index = fib - im6->fibs;
+
+ BV(clib_bihash_foreach_key_value_pair)
+ (h, count_routes_in_fib_at_prefix_length, ca);
+
+ for (len = 128; len >= 0; len--)
+ {
+ if (ca->count_by_prefix_length[len])
+ vlib_cli_output (vm, "%=20d%=16lld",
+ len, ca->count_by_prefix_length[len]);
+ }
+ continue;
+ }
+
+ if (routes)
+ _vec_len (routes) = 0;
+ if (results)
+ _vec_len (results) = 0;
+
+ a->fib_index = fib - im6->fibs;
+ a->routep = &routes;
+
+ BV(clib_bihash_foreach_key_value_pair)(h, add_routes_in_fib, a);
+
+ vec_sort (routes, r1, r2,
+ ({ int cmp = ip6_address_compare (&r1->address, &r2->address);
+ cmp ? cmp : ((int) r1->address_length - (int) r2->address_length); }));
+
+ vlib_cli_output (vm, "%=45s%=16s%=16s%=16s",
+ "Destination", "Packets", "Bytes", "Adjacency");
+ vec_foreach (r, routes)
+ {
+ vlib_counter_t c, sum;
+ uword i, j, n_left, n_nhs, adj_index, * result = 0;
+ ip_adjacency_t * adj;
+ ip_multipath_next_hop_t * nhs, tmp_nhs[1];
+
+ adj_index = r->index;
+ if (lm->fib_result_n_words > 1)
+ {
+ result = vec_elt_at_index (results, adj_index);
+ adj_index = result[0];
+ }
+
+ adj = ip_get_adjacency (lm, adj_index);
+ if (adj->n_adj == 1)
+ {
+ nhs = &tmp_nhs[0];
+ nhs[0].next_hop_adj_index = ~0; /* not used */
+ nhs[0].weight = 1;
+ n_nhs = 1;
+ }
+ else
+ {
+ ip_multipath_adjacency_t * madj;
+ madj = vec_elt_at_index (lm->multipath_adjacencies, adj->heap_handle);
+ nhs = heap_elt_at_index (lm->next_hop_heap, madj->normalized_next_hops.heap_offset);
+ n_nhs = madj->normalized_next_hops.count;
+ }
+
+ n_left = nhs[0].weight;
+ vlib_counter_zero (&sum);
+ for (i = j = 0; i < adj->n_adj; i++)
+ {
+ n_left -= 1;
+ vlib_get_combined_counter (&lm->adjacency_counters,
+ adj_index + i, &c);
+ if (clear)
+ vlib_zero_combined_counter (&lm->adjacency_counters,
+ adj_index + i);
+ vlib_counter_add (&sum, &c);
+ if (n_left == 0)
+ {
+ u8 * msg = 0;
+ uword indent;
+
+ if (j == 0)
+ msg = format (msg, "%-45U",
+ format_ip6_address_and_length,
+ r->address.as_u8, r->address_length);
+ else
+		msg = format (msg, "%U", format_white_space, 45);
+
+ msg = format (msg, "%16Ld%16Ld ", sum.packets, sum.bytes);
+
+ indent = vec_len (msg);
+ msg = format (msg, "weight %d, index %d\n%U%U",
+ nhs[j].weight, adj_index + i,
+ format_white_space, indent,
+ format_ip_adjacency,
+ vnm, lm, adj_index + i);
+
+ vlib_cli_output (vm, "%v", msg);
+ vec_free (msg);
+
+ j++;
+ if (j < n_nhs)
+ {
+ n_left = nhs[j].weight;
+ vlib_counter_zero (&sum);
+ }
+ }
+ }
+
+ if (result && lm->format_fib_result)
+ vlib_cli_output (vm, "%20s%U", "", lm->format_fib_result, vm, lm, result, 0);
+ }
+ vlib_cli_output (vm, " ");
+ }
+
+ vec_free (routes);
+ vec_free (results);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ip6_show_fib_command, static) = {
+ .path = "show ip6 fib",
+ .short_help = "show ip6 fib [summary] [clear]",
+ .function = ip6_show_fib,
+};
diff --git a/vnet/vnet/ip/lookup.h b/vnet/vnet/ip/lookup.h
new file mode 100644
index 00000000000..e4e5acfece3
--- /dev/null
+++ b/vnet/vnet/ip/lookup.h
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/ip_lookup.h: ip (4 or 6) lookup structures, adjacencies, ...
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ip_lookup_h
+#define included_ip_lookup_h
+
+#include <vnet/vnet.h>
+#include <vlib/buffer.h>
+
+/* Next index stored in adjacency. */
+typedef enum {
+ /* Packet does not match any route in table. */
+ IP_LOOKUP_NEXT_MISS,
+
+ /* Adjacency says to drop or punt this packet. */
+ IP_LOOKUP_NEXT_DROP,
+ IP_LOOKUP_NEXT_PUNT,
+
+ /* This packet is for one of our own IP addresses. */
+ IP_LOOKUP_NEXT_LOCAL,
+
+ /* This packet matches an "interface route" and packets
+ need to be passed to ARP to find rewrite string for
+ this destination. */
+ IP_LOOKUP_NEXT_ARP,
+
+ /* This packet is to be rewritten and forwarded to the next
+ processing node. This is typically the output interface but
+ might be another node for further output processing. */
+ IP_LOOKUP_NEXT_REWRITE,
+
+ /* This packet needs to be classified */
+ IP_LOOKUP_NEXT_CLASSIFY,
+
+ /* This packet needs to go to MAP - RFC7596, RFC7597 */
+ IP_LOOKUP_NEXT_MAP,
+
+ /* This packet needs to go to MAP with Translation - RFC7599 */
+ IP_LOOKUP_NEXT_MAP_T,
+
+  /* This packet needs to go to 6RD (RFC5969) */
+ IP_LOOKUP_NEXT_SIXRD,
+
+ /* Hop-by-hop header handling */
+ IP_LOOKUP_NEXT_HOP_BY_HOP,
+ IP_LOOKUP_NEXT_ADD_HOP_BY_HOP,
+ IP_LOOKUP_NEXT_POP_HOP_BY_HOP,
+
+ IP_LOOKUP_N_NEXT,
+} ip_lookup_next_t;
+
+/* Flow hash configuration */
+#define IP_FLOW_HASH_SRC_ADDR (1<<0)
+#define IP_FLOW_HASH_DST_ADDR (1<<1)
+#define IP_FLOW_HASH_PROTO (1<<2)
+#define IP_FLOW_HASH_SRC_PORT (1<<3)
+#define IP_FLOW_HASH_DST_PORT (1<<4)
+#define IP_FLOW_HASH_REVERSE_SRC_DST (1<<5)
+
+/* Default: 5-tuple without the "reverse" bit */
+#define IP_FLOW_HASH_DEFAULT (0x1F)
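+
+/* Example (values follow from the bit definitions above): hashing on
+   addresses only would be (IP_FLOW_HASH_SRC_ADDR | IP_FLOW_HASH_DST_ADDR),
+   i.e. 0x3; the default 0x1F sets all five bits except
+   IP_FLOW_HASH_REVERSE_SRC_DST. */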
+
+#define foreach_flow_hash_bit \
+_(src, IP_FLOW_HASH_SRC_ADDR) \
+_(dst, IP_FLOW_HASH_DST_ADDR) \
+_(sport, IP_FLOW_HASH_SRC_PORT) \
+_(dport, IP_FLOW_HASH_DST_PORT) \
+_(proto, IP_FLOW_HASH_PROTO) \
+_(reverse, IP_FLOW_HASH_REVERSE_SRC_DST)
+
+/* IP unicast adjacency. */
+typedef struct {
+ /* Handle for this adjacency in adjacency heap. */
+ u32 heap_handle;
+
+ /* Interface address index for this local/arp adjacency. */
+ u32 if_address_index;
+
+  /* Number of adjacencies in block.  Greater than 1 means multipath;
+ otherwise equal to 1. */
+ u16 n_adj;
+
+ /* Next hop after ip4-lookup. */
+ union {
+ ip_lookup_next_t lookup_next_index : 16;
+ u16 lookup_next_index_as_int;
+ };
+
+ /* Force re-lookup in a different FIB. ~0 => normal behavior */
+ i16 explicit_fib_index;
+ u16 mcast_group_index;
+
+ /* When classifying, start here */
+ u16 classify_table_index;
+  /* Saved lookup next index, so a feature (e.g. ip6 ioam) can interpose
+     itself on this subgraph arc with the least possible overhead */
+ u16 saved_lookup_next_index;
+
+ vnet_declare_rewrite (VLIB_BUFFER_PRE_DATA_SIZE - 5*sizeof(u32));
+} ip_adjacency_t;
+
+/* Index into adjacency table. */
+typedef u32 ip_adjacency_index_t;
+
+typedef struct {
+ /* Directly connected next-hop adjacency index. */
+ u32 next_hop_adj_index;
+
+ /* Path weight for this adjacency. */
+ u32 weight;
+} ip_multipath_next_hop_t;
+
+typedef struct {
+ /* Adjacency index of first index in block. */
+ u32 adj_index;
+
+ /* Power of 2 size of adjacency block. */
+ u32 n_adj_in_block;
+
+ /* Number of prefixes that point to this adjacency. */
+ u32 reference_count;
+
+ /* Normalized next hops are used as hash keys: they are sorted by weight
+ and weights are chosen so they add up to 1 << log2_n_adj_in_block (with
+ zero-weighted next hops being deleted).
+     Unnormalized next hops are saved so that the control plane has a record of exactly
+ what the RIB told it. */
+ struct {
+ /* Number of hops in the multipath. */
+ u32 count;
+
+ /* Offset into next hop heap for this block. */
+ u32 heap_offset;
+
+    /* Heap handle used, for example, to free the block when we're done with it. */
+ u32 heap_handle;
+ } normalized_next_hops, unnormalized_next_hops;
+} ip_multipath_adjacency_t;
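+
+/*
+ * Illustrative example (not from the source): next hops A (weight 1) and
+ * B (weight 3) normalize into a block of n_adj_in_block = 4 adjacencies,
+ * one copy of A's and three of B's, so indexing the block by flow hash
+ * modulo 4 yields the requested 1:3 split.
+ */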
+
+/* IP multicast adjacency. */
+typedef struct {
+ /* Handle for this adjacency in adjacency heap. */
+ u32 heap_handle;
+
+  /* Number of adjacencies in block. */
+ u32 n_adj;
+
+ /* Rewrite string. */
+ vnet_declare_rewrite (64 - 2*sizeof(u32));
+} ip_multicast_rewrite_t;
+
+typedef struct {
+ /* ip4-multicast-rewrite next index. */
+ u32 next_index;
+
+ u8 n_rewrite_bytes;
+
+ u8 rewrite_string[64 - 1*sizeof(u32) - 1*sizeof(u8)];
+} ip_multicast_rewrite_string_t;
+
+typedef struct {
+ ip_multicast_rewrite_t * rewrite_heap;
+
+ ip_multicast_rewrite_string_t * rewrite_strings;
+
+  /* Negative values are rewrite string indices; values >= 0 are
+     sw_if_indices.  Sorted; used as hash keys. */
+ i32 ** adjacency_id_vector;
+
+ uword * adjacency_by_id_vector;
+} ip_multicast_lookup_main_t;
+
+typedef struct {
+ /* Key for mhash; in fact, just a byte offset into mhash key vector. */
+ u32 address_key;
+
+ /* Interface which has this address. */
+ u32 sw_if_index;
+
+ /* Adjacency for neighbor probe (ARP) for this interface address. */
+ u32 neighbor_probe_adj_index;
+
+ /* Address (prefix) length for this interface. */
+ u16 address_length;
+
+ /* Will be used for something eventually. Primary vs. secondary? */
+ u16 flags;
+
+ /* Next and previous pointers for doubly linked list of
+ addresses per software interface. */
+ u32 next_this_sw_interface;
+ u32 prev_this_sw_interface;
+} ip_interface_address_t;
+
+typedef enum {
+ IP_LOCAL_NEXT_DROP,
+ IP_LOCAL_NEXT_PUNT,
+ // IP_LOCAL_NEXT_TCP_LOOKUP,
+ IP_LOCAL_NEXT_UDP_LOOKUP,
+ IP_LOCAL_NEXT_ICMP,
+ IP_LOCAL_N_NEXT,
+} ip_local_next_t;
+
+struct ip_lookup_main_t;
+
+typedef void (* ip_add_del_adjacency_callback_t) (struct ip_lookup_main_t * lm,
+ u32 adj_index,
+ ip_adjacency_t * adj,
+ u32 is_del);
+
+typedef struct {
+ vnet_config_main_t config_main;
+
+ u32 * config_index_by_sw_if_index;
+} ip_config_main_t;
+
+typedef struct ip_lookup_main_t {
+ /* Adjacency heap. */
+ ip_adjacency_t * adjacency_heap;
+
+ /* Adjacency packet/byte counters indexed by adjacency index. */
+ vlib_combined_counter_main_t adjacency_counters;
+
+ /* Heap of (next hop, weight) blocks. Sorted by next hop. */
+ ip_multipath_next_hop_t * next_hop_heap;
+
+ /* Indexed by heap_handle from ip_adjacency_t. */
+ ip_multipath_adjacency_t * multipath_adjacencies;
+
+ /* Temporary vectors for looking up next hops in hash. */
+ ip_multipath_next_hop_t * next_hop_hash_lookup_key;
+ ip_multipath_next_hop_t * next_hop_hash_lookup_key_normalized;
+
+ /* Hash table mapping normalized next hops and weights
+ to multipath adjacency index. */
+ uword * multipath_adjacency_by_next_hops;
+
+ u32 * adjacency_remap_table;
+ u32 n_adjacency_remaps;
+
+ /* If average error per adjacency is less than this threshold adjacency block
+ size is accepted. */
+ f64 multipath_next_hop_error_tolerance;
+
+ /* Adjacency index for routing table misses, local punts, and drops. */
+ u32 miss_adj_index, drop_adj_index, local_adj_index;
+
+ /* Miss adjacency is always first in adjacency table. */
+#define IP_LOOKUP_MISS_ADJ_INDEX 0
+
+ ip_add_del_adjacency_callback_t * add_del_adjacency_callbacks;
+
+ /* Pool of addresses that are assigned to interfaces. */
+ ip_interface_address_t * if_address_pool;
+
+ /* Hash table mapping address to index in interface address pool. */
+ mhash_t address_to_if_address_index;
+
+ /* Head of doubly linked list of interface addresses for each software interface.
+ ~0 means this interface has no address. */
+ u32 * if_address_pool_index_by_sw_if_index;
+
+ /* First table index to use for this interface, ~0 => none */
+ u32 * classify_table_index_by_sw_if_index;
+
+ /* rx/tx interface/feature configuration. */
+ ip_config_main_t rx_config_mains[VNET_N_CAST], tx_config_main;
+
+ /* Number of bytes in a fib result. Must be at least
+ sizeof (uword). First word is always adjacency index. */
+ u32 fib_result_n_bytes, fib_result_n_words;
+
+ format_function_t * format_fib_result;
+
+ /* 1 for ip6; 0 for ip4. */
+ u32 is_ip6;
+
+ /* Either format_ip4_address_and_length or format_ip6_address_and_length. */
+ format_function_t * format_address_and_length;
+
+ /* Table mapping ip protocol to ip[46]-local node next index. */
+ u8 local_next_by_ip_protocol[256];
+
+ /* IP_BUILTIN_PROTOCOL_{TCP,UDP,ICMP,OTHER} by protocol in IP header. */
+ u8 builtin_protocol_by_ip_protocol[256];
+} ip_lookup_main_t;
+
+always_inline ip_adjacency_t *
+ip_get_adjacency (ip_lookup_main_t * lm,
+ u32 adj_index)
+{
+ ip_adjacency_t * adj;
+
+ adj = heap_elt_at_index (lm->adjacency_heap, adj_index);
+
+ ASSERT (! heap_is_free_handle (lm->adjacency_heap, adj->heap_handle));
+
+ return adj;
+}
+
+#define ip_prefetch_adjacency(lm,adj_index,type) \
+do { \
+ ip_adjacency_t * _adj = (lm)->adjacency_heap + (adj_index); \
+ CLIB_PREFETCH (_adj, sizeof (_adj[0]), type); \
+} while (0)
+
+always_inline void
+ip_call_add_del_adjacency_callbacks (ip_lookup_main_t * lm, u32 adj_index, u32 is_del)
+{
+ ip_adjacency_t * adj;
+ uword i;
+ adj = ip_get_adjacency (lm, adj_index);
+ for (i = 0; i < vec_len (lm->add_del_adjacency_callbacks); i++)
+ lm->add_del_adjacency_callbacks[i] (lm, adj_index, adj, is_del);
+}
+
+/* Create new block of given number of contiguous adjacencies. */
+ip_adjacency_t *
+ip_add_adjacency (ip_lookup_main_t * lm,
+ ip_adjacency_t * adj,
+ u32 n_adj,
+ u32 * adj_index_result);
+
+void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index);
+
+void
+ip_multipath_adjacency_free (ip_lookup_main_t * lm,
+ ip_multipath_adjacency_t * a);
+
+u32
+ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm,
+ u32 is_del,
+ u32 old_mp_adj_index,
+ u32 next_hop_adj_index,
+ u32 next_hop_weight,
+ u32 * new_mp_adj_index);
+
+clib_error_t *
+ip_interface_address_add_del (ip_lookup_main_t * lm,
+ u32 sw_if_index,
+ void * address,
+ u32 address_length,
+ u32 is_del,
+ u32 * result_index);
+
+always_inline ip_interface_address_t *
+ip_get_interface_address (ip_lookup_main_t * lm, void * addr_fib)
+{
+ uword * p = mhash_get (&lm->address_to_if_address_index, addr_fib);
+ return p ? pool_elt_at_index (lm->if_address_pool, p[0]) : 0;
+}
+
+always_inline void *
+ip_interface_address_get_address (ip_lookup_main_t * lm, ip_interface_address_t * a)
+{ return mhash_key_to_mem (&lm->address_to_if_address_index, a->address_key); }
+
+always_inline ip_interface_address_t *
+ip_interface_address_for_packet (ip_lookup_main_t * lm, vlib_buffer_t * b, u32 sw_if_index)
+{
+ ip_adjacency_t * adj;
+ u32 if_address_index;
+
+ adj = ip_get_adjacency (lm, vnet_buffer (b)->ip.adj_index[VLIB_TX]);
+
+ ASSERT (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP
+ || adj->lookup_next_index == IP_LOOKUP_NEXT_LOCAL);
+ if_address_index = adj->if_address_index;
+ if_address_index = (if_address_index == ~0 ?
+ vec_elt (lm->if_address_pool_index_by_sw_if_index, sw_if_index)
+ : if_address_index);
+
+ return pool_elt_at_index (lm->if_address_pool, if_address_index);
+}
+
+#define foreach_ip_interface_address(lm,a,sw_if_index,loop,body) \
+do { \
+ vnet_main_t *_vnm = vnet_get_main(); \
+ u32 _sw_if_index = sw_if_index; \
+ vnet_sw_interface_t *_swif; \
+ _swif = vnet_get_sw_interface (_vnm, _sw_if_index); \
+ \
+ /* \
+ * Loop => honor unnumbered interface addressing. \
+ */ \
+ if (loop && _swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED) \
+ _sw_if_index = _swif->unnumbered_sw_if_index; \
+ u32 _ia = \
+ (vec_len((lm)->if_address_pool_index_by_sw_if_index) \
+ > (_sw_if_index)) \
+ ? vec_elt ((lm)->if_address_pool_index_by_sw_if_index, \
+ (_sw_if_index)) : (u32)~0; \
+ ip_interface_address_t * _a; \
+ while (_ia != ~0) \
+ { \
+ _a = pool_elt_at_index ((lm)->if_address_pool, _ia); \
+ _ia = _a->next_this_sw_interface; \
+ (a) = _a; \
+ body; \
+ } \
+} while (0)
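+
+/*
+ * Usage sketch (variable names illustrative); passing 1 for 'loop'
+ * honors unnumbered-interface addressing:
+ *
+ *   ip_interface_address_t * ia;
+ *   foreach_ip_interface_address (lm, ia, sw_if_index, 1,
+ *   ({
+ *     ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+ *     ...
+ *   }));
+ */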
+
+void ip_lookup_init (ip_lookup_main_t * lm, u32 ip_lookup_node_index);
+
+serialize_function_t serialize_ip_lookup_main, unserialize_ip_lookup_main;
+serialize_function_t serialize_vec_ip_adjacency, unserialize_vec_ip_adjacency;
+
+#endif /* included_ip_lookup_h */
diff --git a/vnet/vnet/ip/ports.def b/vnet/vnet/ip/ports.def
new file mode 100644
index 00000000000..cdb754f5b2e
--- /dev/null
+++ b/vnet/vnet/ip/ports.def
@@ -0,0 +1,757 @@
+/*
+ * ip/ports.def: tcp/udp port definitions
+ *
+ * Eliot Dresselhaus
+ * August, 2005
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+PORT NUMBERS
+
+(last updated 18 October 2005)
+
+The port numbers are divided into three ranges: the Well Known Ports,
+the Registered Ports, and the Dynamic and/or Private Ports.
+
+The Well Known Ports are those from 0 through 1023.
+
+The Registered Ports are those from 1024 through 49151.
+
+The Dynamic and/or Private Ports are those from 49152 through 65535.
+
+
+************************************************************************
+* PLEASE NOTE THE FOLLOWING: *
+* *
+* 1. UNASSIGNED PORT NUMBERS SHOULD NOT BE USED. THE IANA WILL ASSIGN *
+* THE NUMBER FOR THE PORT AFTER YOUR APPLICATION HAS BEEN APPROVED. *
+* *
+* 2. ASSIGNMENT OF A PORT NUMBER DOES NOT IN ANY WAY IMPLY AN *
+* ENDORSEMENT OF AN APPLICATION OR PRODUCT, AND THE FACT THAT NETWORK *
+* TRAFFIC IS FLOWING TO OR FROM A REGISTERED PORT DOES NOT MEAN THAT *
+* IT IS "GOOD" TRAFFIC. FIREWALL AND SYSTEM ADMINISTRATORS SHOULD *
+* CHOOSE HOW TO CONFIGURE THEIR SYSTEMS BASED ON THEIR KNOWLEDGE OF *
+* THE TRAFFIC IN QUESTION, NOT WHETHER THERE IS A PORT NUMBER *
+* REGISTERED OR NOT. *
+************************************************************************
+
+
+WELL KNOWN PORT NUMBERS
+
+The Well Known Ports are assigned by the IANA and on most systems can
+only be used by system (or root) processes or by programs executed by
+privileged users.
+
+Ports are used in the TCP [RFC793] to name the ends of logical
+connections which carry long term conversations. For the purpose of
+providing services to unknown callers, a service contact port is
+defined. This list specifies the port used by the server process as
+its contact port. The contact port is sometimes called the
+"well-known port".
+
+To the extent possible, these same port assignments are used with the
+UDP [RFC768].
+
+The range for assigned ports managed by the IANA is 0-1023.
+*/
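+
+/*
+ * This file is an X-macro list: a consumer defines ip_port (name, number)
+ * before including it.  A sketch, assuming a consumer along these lines:
+ *
+ *   #define ip_port(s,n) IP_PORT_##s = n,
+ *   typedef enum {
+ *   #include <vnet/ip/ports.def>
+ *   } ip_port_t;
+ *   #undef ip_port
+ */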
+ip_port (TCPMUX, 1)
+ip_port (COMPRESS_NET_MANAGEMENT, 2)
+ip_port (COMPRESS_NET, 3)
+ip_port (RJE, 5)
+ip_port (ECHO, 7)
+ip_port (DISCARD, 9)
+ip_port (SYSTAT, 11)
+ip_port (DAYTIME, 13)
+ip_port (QOTD, 17)
+ip_port (MSP, 18)
+ip_port (CHARGEN, 19)
+ip_port (FTP_DATA, 20)
+ip_port (FTP, 21)
+ip_port (SSH, 22)
+ip_port (TELNET, 23)
+ip_port (SMTP, 25)
+ip_port (NSW_FE, 27)
+ip_port (MSG_ICP, 29)
+ip_port (MSG_AUTH, 31)
+ip_port (DSP, 33)
+ip_port (TIME, 37)
+ip_port (RAP, 38)
+ip_port (RLP, 39)
+ip_port (GRAPHICS, 41)
+ip_port (NAME, 42)
+ip_port (NAMESERVER, 42)
+ip_port (NICNAME, 43)
+ip_port (MPM_FLAGS, 44)
+ip_port (MPM, 45)
+ip_port (MPM_SND, 46)
+ip_port (NI_FTP, 47)
+ip_port (AUDITD, 48)
+ip_port (TACACS, 49)
+ip_port (RE_MAIL_CK, 50)
+ip_port (LA_MAINT, 51)
+ip_port (XNS_TIME, 52)
+ip_port (DNS, 53)
+ip_port (XNS_CH, 54)
+ip_port (ISI_GL, 55)
+ip_port (XNS_AUTH, 56)
+ip_port (XNS_MAIL, 58)
+ip_port (NI_MAIL, 61)
+ip_port (ACAS, 62)
+ip_port (WHOIS_PLUS_PLUS, 63)
+ip_port (COVIA, 64)
+ip_port (TACACS_DS, 65)
+ip_port (ORACLE_SQL_NET, 66)
+ip_port (BOOTPS, 67)
+ip_port (BOOTPC, 68)
+ip_port (TFTP, 69)
+ip_port (GOPHER, 70)
+ip_port (NETRJS_1, 71)
+ip_port (NETRJS_2, 72)
+ip_port (NETRJS_3, 73)
+ip_port (NETRJS_4, 74)
+ip_port (DEOS, 76)
+ip_port (VETTCP, 78)
+ip_port (FINGER, 79)
+ip_port (WWW, 80)
+ip_port (HOSTS2_NS, 81)
+ip_port (XFER, 82)
+ip_port (MIT_ML_DEV, 83)
+ip_port (CTF, 84)
+ip_port (MIT_ML_DEV1, 85)
+ip_port (MFCOBOL, 86)
+ip_port (KERBEROS, 88)
+ip_port (SU_MIT_TG, 89)
+ip_port (DNSIX, 90)
+ip_port (MIT_DOV, 91)
+ip_port (NPP, 92)
+ip_port (DCP, 93)
+ip_port (OBJCALL, 94)
+ip_port (SUPDUP, 95)
+ip_port (DIXIE, 96)
+ip_port (SWIFT_RVF, 97)
+ip_port (TACNEWS, 98)
+ip_port (METAGRAM, 99)
+ip_port (NEWACCT, 100)
+ip_port (HOSTNAME, 101)
+ip_port (ISO_TSAP, 102)
+ip_port (GPPITNP, 103)
+ip_port (ACR_NEMA, 104)
+ip_port (CSO, 105)
+ip_port (CSNET_NS, 105)
+ip_port (3COM_TSMUX, 106)
+ip_port (RTELNET, 107)
+ip_port (SNAGAS, 108)
+ip_port (POP2, 109)
+ip_port (POP3, 110)
+ip_port (SUNRPC, 111)
+ip_port (MCIDAS, 112)
+ip_port (IDENT, 113)
+ip_port (SFTP, 115)
+ip_port (ANSANOTIFY, 116)
+ip_port (UUCP_PATH, 117)
+ip_port (SQLSERV, 118)
+ip_port (NNTP, 119)
+ip_port (CFDPTKT, 120)
+ip_port (ERPC, 121)
+ip_port (SMAKYNET, 122)
+ip_port (NTP, 123)
+ip_port (ANSATRADER, 124)
+ip_port (LOCUS_MAP, 125)
+ip_port (NXEDIT, 126)
+ip_port (LOCUS_CON, 127)
+ip_port (GSS_XLICEN, 128)
+ip_port (PWDGEN, 129)
+ip_port (CISCO_FNA, 130)
+ip_port (CISCO_TNA, 131)
+ip_port (CISCO_SYS, 132)
+ip_port (STATSRV, 133)
+ip_port (INGRES_NET, 134)
+ip_port (EPMAP, 135)
+ip_port (PROFILE, 136)
+ip_port (NETBIOS_NS, 137)
+ip_port (NETBIOS_DGM, 138)
+ip_port (NETBIOS_SSN, 139)
+ip_port (EMFIS_DATA, 140)
+ip_port (EMFIS_CNTL, 141)
+ip_port (BL_IDM, 142)
+ip_port (IMAP, 143)
+ip_port (UMA, 144)
+ip_port (UAAC, 145)
+ip_port (ISO_TP0, 146)
+ip_port (ISO_IP, 147)
+ip_port (JARGON, 148)
+ip_port (AED_512, 149)
+ip_port (SQL_NET, 150)
+ip_port (HEMS, 151)
+ip_port (BFTP, 152)
+ip_port (SGMP, 153)
+ip_port (NETSC_PROD, 154)
+ip_port (NETSC_DEV, 155)
+ip_port (SQLSRV, 156)
+ip_port (KNET_CMP, 157)
+ip_port (PCMAIL_SRV, 158)
+ip_port (NSS_ROUTING, 159)
+ip_port (SGMP_TRAPS, 160)
+ip_port (SNMP, 161)
+ip_port (SNMPTRAP, 162)
+ip_port (CMIP_MAN, 163)
+ip_port (CMIP_AGENT, 164)
+ip_port (XNS_COURIER, 165)
+ip_port (S_NET, 166)
+ip_port (NAMP, 167)
+ip_port (RSVD, 168)
+ip_port (SEND, 169)
+ip_port (PRINT_SRV, 170)
+ip_port (MULTIPLEX, 171)
+ip_port (CL1, 172)
+ip_port (XYPLEX_MUX, 173)
+ip_port (MAILQ, 174)
+ip_port (VMNET, 175)
+ip_port (GENRAD_MUX, 176)
+ip_port (XDMCP, 177)
+ip_port (NEXTSTEP, 178)
+ip_port (BGP, 179)
+ip_port (RIS, 180)
+ip_port (UNIFY, 181)
+ip_port (AUDIT, 182)
+ip_port (OCBINDER, 183)
+ip_port (OCSERVER, 184)
+ip_port (REMOTE_KIS, 185)
+ip_port (KIS, 186)
+ip_port (ACI, 187)
+ip_port (MUMPS, 188)
+ip_port (QFT, 189)
+ip_port (GACP, 190)
+ip_port (PROSPERO, 191)
+ip_port (OSU_NMS, 192)
+ip_port (SRMP, 193)
+ip_port (IRC, 194)
+ip_port (DN6_NLM_AUD, 195)
+ip_port (DN6_SMM_RED, 196)
+ip_port (DLS, 197)
+ip_port (DLS_MON, 198)
+ip_port (SMUX, 199)
+ip_port (SRC, 200)
+ip_port (AT_RTMP, 201)
+ip_port (AT_NBP, 202)
+ip_port (AT_3, 203)
+ip_port (AT_ECHO, 204)
+ip_port (AT_5, 205)
+ip_port (AT_ZIS, 206)
+ip_port (AT_7, 207)
+ip_port (AT_8, 208)
+ip_port (QMTP, 209)
+ip_port (Z39_50, 210)
+ip_port (TI914CG, 211)
+ip_port (ANET, 212)
+ip_port (IPX, 213)
+ip_port (VMPWSCS, 214)
+ip_port (SOFTPC, 215)
+ip_port (CAILIC, 216)
+ip_port (DBASE, 217)
+ip_port (MPP, 218)
+ip_port (UARPS, 219)
+ip_port (IMAP3, 220)
+ip_port (FLN_SPX, 221)
+ip_port (RSH_SPX, 222)
+ip_port (CDC, 223)
+ip_port (MASQDIALER, 224)
+ip_port (DIRECT, 242)
+ip_port (SUR_MEAS, 243)
+ip_port (INBUSINESS, 244)
+ip_port (LINK, 245)
+ip_port (DSP3270, 246)
+ip_port (SUBNTBCST_TFTP, 247)
+ip_port (BHFHS, 248)
+ip_port (RAP1, 256)
+ip_port (SET, 257)
+ip_port (YAK_CHAT, 258)
+ip_port (ESRO_GEN, 259)
+ip_port (OPENPORT, 260)
+ip_port (NSIIOPS, 261)
+ip_port (ARCISDMS, 262)
+ip_port (HDAP, 263)
+ip_port (BGMP, 264)
+ip_port (X_BONE_CTL, 265)
+ip_port (SST, 266)
+ip_port (TD_SERVICE, 267)
+ip_port (TD_REPLICA, 268)
+ip_port (HTTP_MGMT, 280)
+ip_port (PERSONAL_LINK, 281)
+ip_port (CABLEPORT_AX, 282)
+ip_port (RESCAP, 283)
+ip_port (CORERJD, 284)
+ip_port (FXP, 286)
+ip_port (K_BLOCK, 287)
+ip_port (NOVASTORBAKCUP, 308)
+ip_port (ENTRUSTTIME, 309)
+ip_port (BHMDS, 310)
+ip_port (ASIP_WEBADMIN, 311)
+ip_port (VSLMP, 312)
+ip_port (MAGENTA_LOGIC, 313)
+ip_port (OPALIS_ROBOT, 314)
+ip_port (DPSI, 315)
+ip_port (DECAUTH, 316)
+ip_port (ZANNET, 317)
+ip_port (PKIX_TIMESTAMP, 318)
+ip_port (PTP_EVENT, 319)
+ip_port (PTP_GENERAL, 320)
+ip_port (PIP, 321)
+ip_port (RTSPS, 322)
+ip_port (TEXAR, 333)
+ip_port (PDAP, 344)
+ip_port (PAWSERV, 345)
+ip_port (ZSERV, 346)
+ip_port (FATSERV, 347)
+ip_port (CSI_SGWP, 348)
+ip_port (MFTP, 349)
+ip_port (MATIP_TYPE_A, 350)
+ip_port (MATIP_TYPE_B, 351)
+ip_port (BHOETTY, 351)
+ip_port (DTAG_STE_SB, 352)
+ip_port (BHOEDAP4, 352)
+ip_port (NDSAUTH, 353)
+ip_port (BH611, 354)
+ip_port (DATEX_ASN, 355)
+ip_port (CLOANTO_NET_1, 356)
+ip_port (BHEVENT, 357)
+ip_port (SHRINKWRAP, 358)
+ip_port (NSRMP, 359)
+ip_port (SCOI2ODIALOG, 360)
+ip_port (SEMANTIX, 361)
+ip_port (SRSSEND, 362)
+ip_port (RSVP_TUNNEL, 363)
+ip_port (AURORA_CMGR, 364)
+ip_port (DTK, 365)
+ip_port (ODMR, 366)
+ip_port (MORTGAGEWARE, 367)
+ip_port (QBIKGDP, 368)
+ip_port (RPC2PORTMAP, 369)
+ip_port (CODAAUTH2, 370)
+ip_port (CLEARCASE, 371)
+ip_port (ULISTPROC, 372)
+ip_port (LEGENT_1, 373)
+ip_port (LEGENT_2, 374)
+ip_port (HASSLE, 375)
+ip_port (NIP, 376)
+ip_port (TNETOS, 377)
+ip_port (DSETOS, 378)
+ip_port (IS99C, 379)
+ip_port (IS99S, 380)
+ip_port (HP_COLLECTOR, 381)
+ip_port (HP_MANAGED_NODE, 382)
+ip_port (HP_ALARM_MGR, 383)
+ip_port (ARNS, 384)
+ip_port (IBM_APP, 385)
+ip_port (ASA, 386)
+ip_port (AURP, 387)
+ip_port (UNIDATA_LDM, 388)
+ip_port (LDAP, 389)
+ip_port (UIS, 390)
+ip_port (SYNOTICS_RELAY, 391)
+ip_port (SYNOTICS_BROKER, 392)
+ip_port (META5, 393)
+ip_port (EMBL_NDT, 394)
+ip_port (NETCP, 395)
+ip_port (NETWARE_IP, 396)
+ip_port (MPTN, 397)
+ip_port (KRYPTOLAN, 398)
+ip_port (ISO_TSAP_C2, 399)
+ip_port (WORK_SOL, 400)
+ip_port (UPS, 401)
+ip_port (GENIE, 402)
+ip_port (DECAP, 403)
+ip_port (NCED, 404)
+ip_port (NCLD, 405)
+ip_port (IMSP, 406)
+ip_port (TIMBUKTU, 407)
+ip_port (PRM_SM, 408)
+ip_port (PRM_NM, 409)
+ip_port (DECLADEBUG, 410)
+ip_port (RMT, 411)
+ip_port (SYNOPTICS_TRAP, 412)
+ip_port (SMSP, 413)
+ip_port (INFOSEEK, 414)
+ip_port (BNET, 415)
+ip_port (SILVERPLATTER, 416)
+ip_port (ONMUX, 417)
+ip_port (HYPER_G, 418)
+ip_port (ARIEL1, 419)
+ip_port (SMPTE, 420)
+ip_port (ARIEL2, 421)
+ip_port (ARIEL3, 422)
+ip_port (OPC_JOB_START, 423)
+ip_port (OPC_JOB_TRACK, 424)
+ip_port (ICAD_EL, 425)
+ip_port (SMARTSDP, 426)
+ip_port (SVRLOC, 427)
+ip_port (OCS_CMU, 428)
+ip_port (OCS_AMU, 429)
+ip_port (UTMPSD, 430)
+ip_port (UTMPCD, 431)
+ip_port (IASD, 432)
+ip_port (NNSP, 433)
+ip_port (MOBILEIP_AGENT, 434)
+ip_port (MOBILIP_MN, 435)
+ip_port (DNA_CML, 436)
+ip_port (COMSCM, 437)
+ip_port (DSFGW, 438)
+ip_port (DASP, 439)
+ip_port (SGCP, 440)
+ip_port (DECVMS_SYSMGT, 441)
+ip_port (CVC_HOSTD, 442)
+ip_port (HTTPS, 443)
+ip_port (SNPP, 444)
+ip_port (MICROSOFT_DS, 445)
+ip_port (DDM_RDB, 446)
+ip_port (DDM_DFM, 447)
+ip_port (DDM_SSL, 448)
+ip_port (AS_SERVERMAP, 449)
+ip_port (TSERVER, 450)
+ip_port (SFS_SMP_NET, 451)
+ip_port (SFS_CONFIG, 452)
+ip_port (CREATIVESERVER, 453)
+ip_port (CONTENTSERVER, 454)
+ip_port (CREATIVEPARTNR, 455)
+ip_port (MACON_TCP, 456)
+ip_port (SCOHELP, 457)
+ip_port (APPLEQTC, 458)
+ip_port (AMPR_RCMD, 459)
+ip_port (SKRONK, 460)
+ip_port (DATASURFSRV, 461)
+ip_port (DATASURFSRVSEC, 462)
+ip_port (ALPES, 463)
+ip_port (KPASSWD, 464)
+ip_port (URD, 465)
+ip_port (DIGITAL_VRC, 466)
+ip_port (MYLEX_MAPD, 467)
+ip_port (PHOTURIS, 468)
+ip_port (RCP, 469)
+ip_port (SCX_PROXY, 470)
+ip_port (MONDEX, 471)
+ip_port (LJK_LOGIN, 472)
+ip_port (HYBRID_POP, 473)
+ip_port (TN_TL_W1, 474)
+ip_port (TCPNETHASPSRV, 475)
+ip_port (TN_TL_FD1, 476)
+ip_port (SS7NS, 477)
+ip_port (SPSC, 478)
+ip_port (IAFSERVER, 479)
+ip_port (IAFDBASE, 480)
+ip_port (PH, 481)
+ip_port (BGS_NSI, 482)
+ip_port (ULPNET, 483)
+ip_port (INTEGRA_SME, 484)
+ip_port (POWERBURST, 485)
+ip_port (AVIAN, 486)
+ip_port (SAFT, 487)
+ip_port (GSS_HTTP, 488)
+ip_port (NEST_PROTOCOL, 489)
+ip_port (MICOM_PFS, 490)
+ip_port (GO_LOGIN, 491)
+ip_port (TICF_1, 492)
+ip_port (TICF_2, 493)
+ip_port (POV_RAY, 494)
+ip_port (INTECOURIER, 495)
+ip_port (PIM_RP_DISC, 496)
+ip_port (DANTZ, 497)
+ip_port (SIAM, 498)
+ip_port (ISO_ILL, 499)
+ip_port (ISAKMP, 500)
+ip_port (STMF, 501)
+ip_port (ASA_APPL_PROTO, 502)
+ip_port (INTRINSA, 503)
+ip_port (CITADEL, 504)
+ip_port (MAILBOX_LM, 505)
+ip_port (OHIMSRV, 506)
+ip_port (CRS, 507)
+ip_port (XVTTP, 508)
+ip_port (SNARE, 509)
+ip_port (FCP, 510)
+ip_port (PASSGO, 511)
+ip_port (EXEC, 512)
+ip_port (LOGIN, 513)
+ip_port (SHELL, 514)
+ip_port (PRINTER, 515)
+ip_port (VIDEOTEX, 516)
+ip_port (TALK, 517)
+ip_port (NTALK, 518)
+ip_port (UTIME, 519)
+ip_port (EFS, 520)
+ip_port (RIPNG, 521)
+ip_port (ULP, 522)
+ip_port (IBM_DB2, 523)
+ip_port (NCP, 524)
+ip_port (TIMED, 525)
+ip_port (TEMPO, 526)
+ip_port (STX, 527)
+ip_port (CUSTIX, 528)
+ip_port (IRC_SERV, 529)
+ip_port (COURIER, 530)
+ip_port (CONFERENCE, 531)
+ip_port (NETNEWS, 532)
+ip_port (NETWALL, 533)
+ip_port (MM_ADMIN, 534)
+ip_port (IIOP, 535)
+ip_port (OPALIS_RDV, 536)
+ip_port (NMSP, 537)
+ip_port (GDOMAP, 538)
+ip_port (APERTUS_LDP, 539)
+ip_port (UUCP, 540)
+ip_port (UUCP_RLOGIN, 541)
+ip_port (COMMERCE, 542)
+ip_port (KLOGIN, 543)
+ip_port (KSHELL, 544)
+ip_port (APPLEQTCSRVR, 545)
+ip_port (DHCPV6_CLIENT, 546)
+ip_port (DHCPV6_SERVER, 547)
+ip_port (AFPOVERTCP, 548)
+ip_port (IDFP, 549)
+ip_port (NEW_RWHO, 550)
+ip_port (CYBERCASH, 551)
+ip_port (DEVSHR_NTS, 552)
+ip_port (PIRP, 553)
+ip_port (RTSP, 554)
+ip_port (DSF, 555)
+ip_port (REMOTEFS, 556)
+ip_port (OPENVMS_SYSIPC, 557)
+ip_port (SDNSKMP, 558)
+ip_port (TEEDTAP, 559)
+ip_port (RMONITOR, 560)
+ip_port (MONITOR, 561)
+ip_port (CHSHELL, 562)
+ip_port (NNTPS, 563)
+ip_port (9PFS, 564)
+ip_port (WHOAMI, 565)
+ip_port (STREETTALK, 566)
+ip_port (BANYAN_RPC, 567)
+ip_port (MS_SHUTTLE, 568)
+ip_port (MS_ROME, 569)
+ip_port (METER, 570)
+ip_port (METER1, 571)
+ip_port (SONAR, 572)
+ip_port (BANYAN_VIP, 573)
+ip_port (FTP_AGENT, 574)
+ip_port (VEMMI, 575)
+ip_port (IPCD, 576)
+ip_port (VNAS, 577)
+ip_port (IPDD, 578)
+ip_port (DECBSRV, 579)
+ip_port (SNTP_HEARTBEAT, 580)
+ip_port (BDP, 581)
+ip_port (SCC_SECURITY, 582)
+ip_port (PHILIPS_VC, 583)
+ip_port (KEYSERVER, 584)
+ip_port (IMAP4_SSL, 585)
+ip_port (PASSWORD_CHG, 586)
+ip_port (SUBMISSION, 587)
+ip_port (CAL, 588)
+ip_port (EYELINK, 589)
+ip_port (TNS_CML, 590)
+ip_port (HTTP_ALT, 591)
+ip_port (EUDORA_SET, 592)
+ip_port (HTTP_RPC_EPMAP, 593)
+ip_port (TPIP, 594)
+ip_port (CAB_PROTOCOL, 595)
+ip_port (SMSD, 596)
+ip_port (PTCNAMESERVICE, 597)
+ip_port (SCO_WEBSRVRMG3, 598)
+ip_port (ACP, 599)
+ip_port (IPCSERVER, 600)
+ip_port (SYSLOG_CONN, 601)
+ip_port (XMLRPC_BEEP, 602)
+ip_port (IDXP, 603)
+ip_port (TUNNEL, 604)
+ip_port (SOAP_BEEP, 605)
+ip_port (URM, 606)
+ip_port (NQS, 607)
+ip_port (SIFT_UFT, 608)
+ip_port (NPMP_TRAP, 609)
+ip_port (NPMP_LOCAL, 610)
+ip_port (NPMP_GUI, 611)
+ip_port (HMMP_IND, 612)
+ip_port (HMMP_OP, 613)
+ip_port (SSHELL, 614)
+ip_port (SCO_INETMGR, 615)
+ip_port (SCO_SYSMGR, 616)
+ip_port (SCO_DTMGR, 617)
+ip_port (DEI_ICDA, 618)
+ip_port (COMPAQ_EVM, 619)
+ip_port (SCO_WEBSRVRMGR, 620)
+ip_port (ESCP_IP, 621)
+ip_port (COLLABORATOR, 622)
+ip_port (ASF_RMCP, 623)
+ip_port (CRYPTOADMIN, 624)
+ip_port (DEC_DLM, 625)
+ip_port (ASIA, 626)
+ip_port (PASSGO_TIVOLI, 627)
+ip_port (QMQP, 628)
+ip_port (3COM_AMP3, 629)
+ip_port (RDA, 630)
+ip_port (IPP, 631)
+ip_port (BMPP, 632)
+ip_port (SERVSTAT, 633)
+ip_port (GINAD, 634)
+ip_port (RLZDBASE, 635)
+ip_port (LDAPS, 636)
+ip_port (LANSERVER, 637)
+ip_port (MCNS_SEC, 638)
+ip_port (MSDP, 639)
+ip_port (ENTRUST_SPS, 640)
+ip_port (REPCMD, 641)
+ip_port (ESRO_EMSDP, 642)
+ip_port (SANITY, 643)
+ip_port (DWR, 644)
+ip_port (PSSC, 645)
+ip_port (LDP, 646)
+ip_port (DHCP_FAILOVER, 647)
+ip_port (RRP, 648)
+ip_port (CADVIEW_3D, 649)
+ip_port (OBEX, 650)
+ip_port (IEEE_MMS, 651)
+ip_port (HELLO_PORT, 652)
+ip_port (REPSCMD, 653)
+ip_port (AODV, 654)
+ip_port (TINC, 655)
+ip_port (SPMP, 656)
+ip_port (RMC, 657)
+ip_port (TENFOLD, 658)
+ip_port (MAC_SRVR_ADMIN, 660)
+ip_port (HAP, 661)
+ip_port (PFTP, 662)
+ip_port (PURENOISE, 663)
+ip_port (ASF_SECURE_RMCP, 664)
+ip_port (SUN_DR, 665)
+ip_port (MDQS, 666)
+ip_port (DOOM, 666)
+ip_port (DISCLOSE, 667)
+ip_port (MECOMM, 668)
+ip_port (MEREGISTER, 669)
+ip_port (VACDSM_SWS, 670)
+ip_port (VACDSM_APP, 671)
+ip_port (VPPS_QUA, 672)
+ip_port (CIMPLEX, 673)
+ip_port (ACAP, 674)
+ip_port (DCTP, 675)
+ip_port (VPPS_VIA, 676)
+ip_port (VPP, 677)
+ip_port (GGF_NCP, 678)
+ip_port (MRM, 679)
+ip_port (ENTRUST_AAAS, 680)
+ip_port (ENTRUST_AAMS, 681)
+ip_port (XFR, 682)
+ip_port (CORBA_IIOP, 683)
+ip_port (CORBA_IIOP_SSL, 684)
+ip_port (MDC_PORTMAPPER, 685)
+ip_port (HCP_WISMAR, 686)
+ip_port (ASIPREGISTRY, 687)
+ip_port (REALM_RUSD, 688)
+ip_port (NMAP, 689)
+ip_port (VATP, 690)
+ip_port (MSEXCH_ROUTING, 691)
+ip_port (HYPERWAVE_ISP, 692)
+ip_port (CONNENDP, 693)
+ip_port (HA_CLUSTER, 694)
+ip_port (IEEE_MMS_SSL, 695)
+ip_port (RUSHD, 696)
+ip_port (UUIDGEN, 697)
+ip_port (OLSR, 698)
+ip_port (ACCESSNETWORK, 699)
+ip_port (EPP, 700)
+ip_port (LMP, 701)
+ip_port (IRIS_BEEP, 702)
+ip_port (ELCSD, 704)
+ip_port (AGENTX, 705)
+ip_port (SILC, 706)
+ip_port (BORLAND_DSJ, 707)
+ip_port (ENTRUST_KMSH, 709)
+ip_port (ENTRUST_ASH, 710)
+ip_port (CISCO_TDP, 711)
+ip_port (TBRPF, 712)
+ip_port (NETVIEWDM1, 729)
+ip_port (NETVIEWDM2, 730)
+ip_port (NETVIEWDM3, 731)
+ip_port (NETGW, 741)
+ip_port (NETRCS, 742)
+ip_port (FLEXLM, 744)
+ip_port (FUJITSU_DEV, 747)
+ip_port (RIS_CM, 748)
+ip_port (KERBEROS_ADM, 749)
+ip_port (RFILE, 750)
+ip_port (PUMP, 751)
+ip_port (QRH, 752)
+ip_port (RRH, 753)
+ip_port (TELL, 754)
+ip_port (NLOGIN, 758)
+ip_port (CON, 759)
+ip_port (NS, 760)
+ip_port (RXE, 761)
+ip_port (QUOTAD, 762)
+ip_port (CYCLESERV, 763)
+ip_port (OMSERV, 764)
+ip_port (WEBSTER, 765)
+ip_port (PHONEBOOK, 767)
+ip_port (VID, 769)
+ip_port (CADLOCK, 770)
+ip_port (RTIP, 771)
+ip_port (CYCLESERV2, 772)
+ip_port (SUBMIT, 773)
+ip_port (RPASSWD, 774)
+ip_port (ENTOMB, 775)
+ip_port (WPAGES, 776)
+ip_port (MULTILING_HTTP, 777)
+ip_port (WPGS, 780)
+ip_port (MDBS_DAEMON, 800)
+ip_port (DEVICE, 801)
+ip_port (FCP_UDP, 810)
+ip_port (ITM_MCELL_S, 828)
+ip_port (PKIX_3_CA_RA, 829)
+ip_port (DHCP_FAILOVER2, 847)
+ip_port (GDOI, 848)
+ip_port (ISCSI, 860)
+ip_port (RSYNC, 873)
+ip_port (ICLCNET_LOCATE, 886)
+ip_port (ICLCNET_SVINFO, 887)
+ip_port (ACCESSBUILDER, 888)
+ip_port (CDDBP, 888)
+ip_port (OMGINITIALREFS, 900)
+ip_port (SMPNAMERES, 901)
+ip_port (IDEAFARM_CHAT, 902)
+ip_port (IDEAFARM_CATCH, 903)
+ip_port (XACT_BACKUP, 911)
+ip_port (APEX_MESH, 912)
+ip_port (APEX_EDGE, 913)
+ip_port (FTPS_DATA, 989)
+ip_port (FTPS, 990)
+ip_port (NAS, 991)
+ip_port (TELNETS, 992)
+ip_port (IMAPS, 993)
+ip_port (IRCS, 994)
+ip_port (POP3S, 995)
+ip_port (VSINET, 996)
+ip_port (MAITRD, 997)
+ip_port (BUSBOY, 998)
+ip_port (GARCON, 999)
+ip_port (PUPROUTER, 999)
+ip_port (CADLOCK2, 1000)
+ip_port (SURF, 1010)
+
diff --git a/vnet/vnet/ip/protocols.def b/vnet/vnet/ip/protocols.def
new file mode 100644
index 00000000000..77fab31da05
--- /dev/null
+++ b/vnet/vnet/ip/protocols.def
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Emacs editing mode -*-C-*-
+
+From http://www.iana.org/assignments/protocol-numbers
+
+PROTOCOL NUMBERS
+
+(last updated 18 October 2004)
+
+In the Internet Protocol version 4 (IPv4) [RFC791] there is a field,
+called "Protocol", to identify the next level protocol.  This is an
+8-bit field.  In Internet Protocol version 6 (IPv6) [RFC1883] this field
+is called the "Next Header" field.
+*/
+ip_protocol (0, IP6_HOP_BY_HOP_OPTIONS)
+ip_protocol (1, ICMP)
+ip_protocol (2, IGMP)
+ip_protocol (3, GGP)
+ip_protocol (4, IP_IN_IP)
+ip_protocol (5, ST)
+ip_protocol (6, TCP)
+ip_protocol (7, CBT)
+ip_protocol (8, EGP)
+ip_protocol (9, IGP)
+ip_protocol (10, BBN_RCC_MON)
+ip_protocol (11, NVP_II)
+ip_protocol (12, PUP)
+ip_protocol (13, ARGUS)
+ip_protocol (14, EMCON)
+ip_protocol (15, XNET)
+ip_protocol (16, CHAOS)
+ip_protocol (17, UDP)
+ip_protocol (18, MUX)
+ip_protocol (19, DCN_MEAS)
+ip_protocol (20, HMP)
+ip_protocol (21, PRM)
+ip_protocol (22, XNS_IDP)
+ip_protocol (23, TRUNK_1)
+ip_protocol (24, TRUNK_2)
+ip_protocol (25, LEAF_1)
+ip_protocol (26, LEAF_2)
+ip_protocol (27, RDP)
+ip_protocol (28, IRTP)
+ip_protocol (29, ISO_TP4)
+ip_protocol (30, NETBLT)
+ip_protocol (31, MFE_NSP)
+ip_protocol (32, MERIT_INP)
+ip_protocol (33, SEP)
+ip_protocol (34, 3PC)
+ip_protocol (35, IDPR)
+ip_protocol (36, XTP)
+ip_protocol (37, DDP)
+ip_protocol (38, IDPR_CMTP)
+ip_protocol (39, TP)
+ip_protocol (40, IL)
+ip_protocol (41, IPV6)
+ip_protocol (42, SDRP)
+ip_protocol (43, IPV6_ROUTE)
+ip_protocol (44, IPV6_FRAGMENTATION)
+ip_protocol (45, IDRP)
+ip_protocol (46, RSVP)
+ip_protocol (47, GRE)
+ip_protocol (48, MHRP)
+ip_protocol (49, BNA)
+ip_protocol (50, IPSEC_ESP)
+ip_protocol (51, IPSEC_AH)
+ip_protocol (52, I_NLSP)
+ip_protocol (53, SWIPE)
+ip_protocol (54, NARP)
+ip_protocol (55, MOBILE)
+ip_protocol (56, TLSP)
+ip_protocol (57, SKIP)
+ip_protocol (58, ICMP6)
+ip_protocol (59, IP6_NONXT)
+ip_protocol (60, IP6_DESTINATION_OPTIONS)
+ip_protocol (62, CFTP)
+ip_protocol (64, SAT_EXPAK)
+ip_protocol (65, KRYPTOLAN)
+ip_protocol (66, RVD)
+ip_protocol (67, IPPC)
+ip_protocol (69, SAT_MON)
+ip_protocol (70, VISA)
+ip_protocol (71, IPCV)
+ip_protocol (72, CPNX)
+ip_protocol (73, CPHB)
+ip_protocol (74, WSN)
+ip_protocol (75, PVP)
+ip_protocol (76, BR_SAT_MON)
+ip_protocol (77, SUN_ND)
+ip_protocol (78, WB_MON)
+ip_protocol (79, WB_EXPAK)
+ip_protocol (80, ISO_IP)
+ip_protocol (81, VMTP)
+ip_protocol (82, SECURE_VMTP)
+ip_protocol (83, VINES)
+ip_protocol (84, TTP)
+ip_protocol (85, NSFNET_IGP)
+ip_protocol (86, DGP)
+ip_protocol (87, TCF)
+ip_protocol (88, EIGRP)
+ip_protocol (89, OSPF)
+ip_protocol (90, SPRITE_RPC)
+ip_protocol (91, LARP)
+ip_protocol (92, MTP)
+ip_protocol (93, AX)
+ip_protocol (94, IPIP)
+ip_protocol (95, MICP)
+ip_protocol (96, SCC_SP)
+ip_protocol (97, ETHERIP)
+ip_protocol (98, ENCAP)
+ip_protocol (100, GMTP)
+ip_protocol (101, IFMP)
+ip_protocol (102, PNNI)
+ip_protocol (103, PIM)
+ip_protocol (104, ARIS)
+ip_protocol (105, SCPS)
+ip_protocol (106, QNX)
+ip_protocol (107, A)
+ip_protocol (108, IPCOMP)
+ip_protocol (109, SNP)
+ip_protocol (110, COMPAQ_PEER)
+ip_protocol (111, IPX_IN_IP)
+ip_protocol (112, VRRP)
+ip_protocol (113, PGM)
+ip_protocol (115, L2TP)
+ip_protocol (116, DDX)
+ip_protocol (117, IATP)
+ip_protocol (118, STP)
+ip_protocol (119, SRP)
+ip_protocol (120, UTI)
+ip_protocol (121, SMP)
+ip_protocol (122, SM)
+ip_protocol (123, PTP)
+ip_protocol (124, ISIS)
+ip_protocol (125, FIRE)
+ip_protocol (126, CRTP)
+ip_protocol (127, CRUDP)
+ip_protocol (128, SSCOPMCE)
+ip_protocol (129, IPLT)
+ip_protocol (130, SPS)
+ip_protocol (131, PIPE)
+ip_protocol (132, SCTP)
+ip_protocol (133, FC)
+ip_protocol (134, RSVP_E2E_IGNORE)
+ip_protocol (135, MOBILITY)
+ip_protocol (136, UDP_LITE)
+ip_protocol (137, MPLS_IN_IP)
+ip_protocol (255, RESERVED)
+
diff --git a/vnet/vnet/ip/tcp.c b/vnet/vnet/ip/tcp.c
new file mode 100644
index 00000000000..53f82f1c5b9
--- /dev/null
+++ b/vnet/vnet/ip/tcp.c
@@ -0,0 +1,2983 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp.c: tcp protocol
+ *
+ * Copyright (c) 2011 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/ip/tcp.h>
+#include <math.h>
+
+static u8 my_zero_mask_table[256] = {
+ [0xf0] = (1 << 1),
+ [0x0f] = (1 << 0),
+ [0xff] = (1 << 0) | (1 << 1),
+};
+
+static_always_inline u32 my_zero_mask (u32 x)
+{
+ return ((my_zero_mask_table[(x >> 0) & 0xff] << 0)
+ | (my_zero_mask_table[(x >> 8) & 0xff] << 2));
+}
+
+static u8 my_first_set_table[256] = {
+ [0x00] = 4,
+ [0xf0] = 1,
+ [0x0f] = 0,
+ [0xff] = 0,
+};
+
+static_always_inline u32 my_first_set (u32 zero_mask)
+{
+ u8 r0 = my_first_set_table[(zero_mask >> 0) & 0xff];
+ u8 r1 = 2 + my_first_set_table[(zero_mask >> 8) & 0xff];
+ return r0 != 4 ? r0 : r1;
+}
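+
+/*
+ * Worked example: the match helpers below produce 4 mask bits per 32-bit
+ * lane, so a match in lane 2 only yields the mask 0x0f00.  Then
+ * my_first_set (0x0f00) = 2 + my_first_set_table[0x0f] = 2, while an
+ * empty mask returns a value >= 4 ("no match"); my_zero_mask (0x0f00)
+ * collapses the same byte mask to the per-lane bitmask (1 << 2).
+ */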
+
+static_always_inline void
+ip4_tcp_udp_address_x4_set_from_headers (ip4_tcp_udp_address_x4_t * a,
+ ip4_header_t * ip,
+ tcp_header_t * tcp,
+ u32 i)
+{
+ a->src.as_ip4_address[i] = ip->src_address;
+ a->dst.as_ip4_address[i] = ip->dst_address;
+ a->ports.as_ports[i].as_u32 = tcp->ports.src_and_dst;
+}
+
+static_always_inline void
+ip4_tcp_udp_address_x4_copy_and_invalidate (ip4_tcp_udp_address_x4_t * dst,
+ ip4_tcp_udp_address_x4_t * src,
+ u32 dst_i, u32 src_i)
+{
+#define _(d,s) d = s; s = 0;
+ _ (dst->src.as_ip4_address[dst_i].as_u32, src->src.as_ip4_address[src_i].as_u32);
+ _ (dst->dst.as_ip4_address[dst_i].as_u32, src->dst.as_ip4_address[src_i].as_u32);
+ _ (dst->ports.as_ports[dst_i].as_u32, src->ports.as_ports[src_i].as_u32);
+#undef _
+}
+
+static_always_inline void
+ip4_tcp_udp_address_x4_invalidate (ip4_tcp_udp_address_x4_t * a, u32 i)
+{
+ a->src.as_ip4_address[i].as_u32 = 0;
+ a->dst.as_ip4_address[i].as_u32 = 0;
+ a->ports.as_ports[i].as_u32 = 0;
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_is_valid (ip4_tcp_udp_address_x4_t * a, u32 i)
+{
+ return !(a->src.as_ip4_address[i].as_u32 == 0
+ && a->dst.as_ip4_address[i].as_u32 == 0
+ && a->ports.as_ports[i].as_u32 == 0);
+}
+
+#ifdef TCP_HAVE_VEC128
+static_always_inline uword
+ip4_tcp_udp_address_x4_match_helper (ip4_tcp_udp_address_x4_t * ax4,
+ u32x4 src, u32x4 dst, u32x4 ports)
+{
+ u32x4 r;
+ u32 m;
+
+ r = u32x4_is_equal (src, ax4->src.as_u32x4);
+ r &= u32x4_is_equal (dst, ax4->dst.as_u32x4);
+ r &= u32x4_is_equal (ports, ax4->ports.as_u32x4);
+
+ /* At this point r will be either all zeros (if nothing matched)
+ or have 32 1s in the position that did match. */
+ m = u8x16_compare_byte_mask ((u8x16) r);
+
+ return m;
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_match (ip4_tcp_udp_address_x4_t * ax4,
+ ip4_header_t * ip,
+ tcp_header_t * tcp)
+{
+ u32x4 src = u32x4_splat (ip->src_address.as_u32);
+ u32x4 dst = u32x4_splat (ip->dst_address.as_u32);
+ u32x4 ports = u32x4_splat (tcp->ports.src_and_dst);
+ return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, src, dst, ports));
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_first_empty (ip4_tcp_udp_address_x4_t * ax4)
+{
+ u32x4 zero = {0};
+ return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, zero, zero, zero));
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_empty_mask (ip4_tcp_udp_address_x4_t * ax4)
+{
+ u32x4 zero = {0};
+ return my_zero_mask (ip4_tcp_udp_address_x4_match_helper (ax4, zero, zero, zero));
+}
+#else /* TCP_HAVE_VEC128 */
+static_always_inline uword
+ip4_tcp_udp_address_x4_match_helper (ip4_tcp_udp_address_x4_t * ax4,
+ u32 src, u32 dst, u32 ports)
+{
+ u32 r0, r1, r2, r3;
+
+#define _(i) \
+ r##i = (src == ax4->src.as_ip4_address[i].as_u32 \
+ && dst == ax4->dst.as_ip4_address[i].as_u32 \
+ && ports == ax4->ports.as_ports[i].as_u32)
+
+ _ (0);
+ _ (1);
+ _ (2);
+ _ (3);
+
+#undef _
+
+ return (((r0 ? 0xf : 0x0) << 0)
+ | ((r1 ? 0xf : 0x0) << 4)
+ | ((r2 ? 0xf : 0x0) << 8)
+ | ((r3 ? 0xf : 0x0) << 12));
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_match (ip4_tcp_udp_address_x4_t * ax4,
+ ip4_header_t * ip,
+ tcp_header_t * tcp)
+{
+ return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4,
+ ip->src_address.as_u32,
+ ip->dst_address.as_u32,
+ tcp->ports.src_and_dst));
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_first_empty (ip4_tcp_udp_address_x4_t * ax4)
+{
+ return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, 0, 0, 0));
+}
+
+static_always_inline uword
+ip4_tcp_udp_address_x4_empty_mask (ip4_tcp_udp_address_x4_t * ax4)
+{
+ return my_zero_mask (ip4_tcp_udp_address_x4_match_helper (ax4, 0, 0, 0));
+}
+#endif
+
+static u8 * format_ip4_tcp_udp_address_x4 (u8 * s, va_list * va)
+{
+ ip4_tcp_udp_address_x4_t * a = va_arg (*va, ip4_tcp_udp_address_x4_t *);
+ u32 ai = va_arg (*va, u32);
+ ASSERT (ai < 4);
+
+ s = format (s, "%U:%d -> %U:%d",
+ format_ip4_address, &a->src.as_ip4_address[ai],
+ clib_net_to_host_u16 (a->ports.as_ports[ai].src),
+ format_ip4_address, &a->dst.as_ip4_address[ai],
+ clib_net_to_host_u16 (a->ports.as_ports[ai].dst));
+
+ return s;
+}
+
+static_always_inline void
+ip6_tcp_udp_address_x4_set_from_headers (ip6_tcp_udp_address_x4_t * a,
+ ip6_header_t * ip,
+ tcp_header_t * tcp,
+ u32 i)
+{
+ a->src.as_u32[0][i] = ip->src_address.as_u32[0];
+ a->src.as_u32[1][i] = ip->src_address.as_u32[1];
+ a->src.as_u32[2][i] = ip->src_address.as_u32[2];
+ a->src.as_u32[3][i] = ip->src_address.as_u32[3];
+ a->dst.as_u32[0][i] = ip->dst_address.as_u32[0];
+ a->dst.as_u32[1][i] = ip->dst_address.as_u32[1];
+ a->dst.as_u32[2][i] = ip->dst_address.as_u32[2];
+ a->dst.as_u32[3][i] = ip->dst_address.as_u32[3];
+ a->ports.as_ports[i].as_u32 = tcp->ports.src_and_dst;
+}
+
+static_always_inline void
+ip6_tcp_udp_address_x4_copy_and_invalidate (ip6_tcp_udp_address_x4_t * dst,
+ ip6_tcp_udp_address_x4_t * src,
+ u32 dst_i, u32 src_i)
+{
+#define _(d,s) d = s; s = 0;
+ _ (dst->src.as_u32[0][dst_i], src->src.as_u32[0][src_i]);
+ _ (dst->src.as_u32[1][dst_i], src->src.as_u32[1][src_i]);
+ _ (dst->src.as_u32[2][dst_i], src->src.as_u32[2][src_i]);
+ _ (dst->src.as_u32[3][dst_i], src->src.as_u32[3][src_i]);
+ _ (dst->dst.as_u32[0][dst_i], src->dst.as_u32[0][src_i]);
+ _ (dst->dst.as_u32[1][dst_i], src->dst.as_u32[1][src_i]);
+ _ (dst->dst.as_u32[2][dst_i], src->dst.as_u32[2][src_i]);
+ _ (dst->dst.as_u32[3][dst_i], src->dst.as_u32[3][src_i]);
+ _ (dst->ports.as_ports[dst_i].as_u32, src->ports.as_ports[src_i].as_u32);
+#undef _
+}
+
+static_always_inline void
+ip6_tcp_udp_address_x4_invalidate (ip6_tcp_udp_address_x4_t * a, u32 i)
+{
+ a->src.as_u32[0][i] = 0;
+ a->src.as_u32[1][i] = 0;
+ a->src.as_u32[2][i] = 0;
+ a->src.as_u32[3][i] = 0;
+ a->dst.as_u32[0][i] = 0;
+ a->dst.as_u32[1][i] = 0;
+ a->dst.as_u32[2][i] = 0;
+ a->dst.as_u32[3][i] = 0;
+ a->ports.as_ports[i].as_u32 = 0;
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_is_valid (ip6_tcp_udp_address_x4_t * a, u32 i)
+{
+ return !(a->src.as_u32[0][i] == 0
+ && a->src.as_u32[1][i] == 0
+ && a->src.as_u32[2][i] == 0
+ && a->src.as_u32[3][i] == 0
+ && a->dst.as_u32[0][i] == 0
+ && a->dst.as_u32[1][i] == 0
+ && a->dst.as_u32[2][i] == 0
+ && a->dst.as_u32[3][i] == 0
+ && a->ports.as_ports[i].as_u32 == 0);
+}
+
+#ifdef TCP_HAVE_VEC128
+static_always_inline uword
+ip6_tcp_udp_address_x4_match_helper (ip6_tcp_udp_address_x4_t * ax4,
+ u32x4 src0, u32x4 src1, u32x4 src2, u32x4 src3,
+ u32x4 dst0, u32x4 dst1, u32x4 dst2, u32x4 dst3,
+ u32x4 ports)
+{
+ u32x4 r;
+ u32 m;
+
+ r = u32x4_is_equal (src0, ax4->src.as_u32x4[0]);
+ r &= u32x4_is_equal (src1, ax4->src.as_u32x4[1]);
+ r &= u32x4_is_equal (src2, ax4->src.as_u32x4[2]);
+ r &= u32x4_is_equal (src3, ax4->src.as_u32x4[3]);
+ r &= u32x4_is_equal (dst0, ax4->dst.as_u32x4[0]);
+ r &= u32x4_is_equal (dst1, ax4->dst.as_u32x4[1]);
+ r &= u32x4_is_equal (dst2, ax4->dst.as_u32x4[2]);
+ r &= u32x4_is_equal (dst3, ax4->dst.as_u32x4[3]);
+ r &= u32x4_is_equal (ports, ax4->ports.as_u32x4);
+
+  /* At this point r is all-ones in each 32-bit lane that matched and
+     all zeros elsewhere; the byte mask m then has 4 bits set per
+     matching lane. */
+ m = u8x16_compare_byte_mask ((u8x16) r);
+
+ return m;
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_match (ip6_tcp_udp_address_x4_t * ax4,
+ ip6_header_t * ip,
+ tcp_header_t * tcp)
+{
+ u32x4 src0 = u32x4_splat (ip->src_address.as_u32[0]);
+ u32x4 src1 = u32x4_splat (ip->src_address.as_u32[1]);
+ u32x4 src2 = u32x4_splat (ip->src_address.as_u32[2]);
+ u32x4 src3 = u32x4_splat (ip->src_address.as_u32[3]);
+ u32x4 dst0 = u32x4_splat (ip->dst_address.as_u32[0]);
+ u32x4 dst1 = u32x4_splat (ip->dst_address.as_u32[1]);
+ u32x4 dst2 = u32x4_splat (ip->dst_address.as_u32[2]);
+ u32x4 dst3 = u32x4_splat (ip->dst_address.as_u32[3]);
+ u32x4 ports = u32x4_splat (tcp->ports.src_and_dst);
+ return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4,
+ src0, src1, src2, src3,
+ dst0, dst1, dst2, dst3,
+ ports));
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_first_empty (ip6_tcp_udp_address_x4_t * ax4)
+{
+ u32x4 zero = {0};
+ return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4,
+ zero, zero, zero, zero,
+ zero, zero, zero, zero,
+ zero));
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_empty_mask (ip6_tcp_udp_address_x4_t * ax4)
+{
+ u32x4 zero = {0};
+ return my_zero_mask (ip6_tcp_udp_address_x4_match_helper (ax4,
+ zero, zero, zero, zero,
+ zero, zero, zero, zero,
+ zero));
+}
+#else /* TCP_HAVE_VEC128 */
+static_always_inline uword
+ip6_tcp_udp_address_x4_match_helper (ip6_tcp_udp_address_x4_t * ax4,
+ u32 src0, u32 src1, u32 src2, u32 src3,
+ u32 dst0, u32 dst1, u32 dst2, u32 dst3,
+ u32 ports)
+{
+ u32 r0, r1, r2, r3;
+
+/* Note: address words are stored transposed, as_u32[word][slot]
+   (see ip6_tcp_udp_address_x4_set_from_headers), so slot i is
+   matched against word w via as_u32[w][i]. */
+#define _(i) \
+  r##i = (src0 == ax4->src.as_u32[0][i] \
+	  && src1 == ax4->src.as_u32[1][i] \
+	  && src2 == ax4->src.as_u32[2][i] \
+	  && src3 == ax4->src.as_u32[3][i] \
+	  && dst0 == ax4->dst.as_u32[0][i] \
+	  && dst1 == ax4->dst.as_u32[1][i] \
+	  && dst2 == ax4->dst.as_u32[2][i] \
+	  && dst3 == ax4->dst.as_u32[3][i] \
+ && ports == ax4->ports.as_ports[i].as_u32)
+
+ _ (0);
+ _ (1);
+ _ (2);
+ _ (3);
+
+#undef _
+
+ return (((r0 ? 0xf : 0x0) << 0)
+ | ((r1 ? 0xf : 0x0) << 4)
+ | ((r2 ? 0xf : 0x0) << 8)
+ | ((r3 ? 0xf : 0x0) << 12));
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_match (ip6_tcp_udp_address_x4_t * ax4,
+ ip6_header_t * ip,
+ tcp_header_t * tcp)
+{
+ u32 src0 = ip->src_address.as_u32[0];
+ u32 src1 = ip->src_address.as_u32[1];
+ u32 src2 = ip->src_address.as_u32[2];
+ u32 src3 = ip->src_address.as_u32[3];
+ u32 dst0 = ip->dst_address.as_u32[0];
+ u32 dst1 = ip->dst_address.as_u32[1];
+ u32 dst2 = ip->dst_address.as_u32[2];
+ u32 dst3 = ip->dst_address.as_u32[3];
+ u32 ports = tcp->ports.src_and_dst;
+ return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4,
+ src0, src1, src2, src3,
+ dst0, dst1, dst2, dst3,
+ ports));
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_first_empty (ip6_tcp_udp_address_x4_t * ax4)
+{
+ return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0));
+}
+
+static_always_inline uword
+ip6_tcp_udp_address_x4_empty_mask (ip6_tcp_udp_address_x4_t * ax4)
+{
+ return my_zero_mask (ip6_tcp_udp_address_x4_match_helper (ax4,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0));
+}
+#endif /* ! TCP_HAVE_VEC128 */
+
+static u8 * format_ip6_tcp_udp_address_x4 (u8 * s, va_list * va)
+{
+ ip6_tcp_udp_address_x4_t * a = va_arg (*va, ip6_tcp_udp_address_x4_t *);
+ u32 i, ai = va_arg (*va, u32);
+ ip6_address_t src, dst;
+
+ ASSERT (ai < 4);
+ for (i = 0; i < 4; i++)
+ {
+ src.as_u32[i] = a->src.as_u32[i][ai];
+ dst.as_u32[i] = a->dst.as_u32[i][ai];
+ }
+
+ s = format (s, "%U:%d -> %U:%d",
+ format_ip6_address, &src,
+ clib_net_to_host_u16 (a->ports.as_ports[ai].src),
+ format_ip6_address, &dst,
+ clib_net_to_host_u16 (a->ports.as_ports[ai].dst));
+
+ return s;
+}
+
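+/* Return the index (0-3) of the oldest of 4 time stamps.  Ages are
+   computed as (now - time_stamp), which stays correct across tick
+   counter wraparound; the oldest of each pair is found branch-free,
+   then the two winners are compared. */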
+static_always_inline u32
+find_oldest_timestamp_x4 (u32 * time_stamps, u32 now)
+{
+ u32 dt0, dt_min0, i_min0;
+ u32 dt1, dt_min1, i_min1;
+
+ i_min0 = i_min1 = 0;
+ dt_min0 = now - time_stamps[0];
+ dt_min1 = now - time_stamps[2];
+ dt0 = now - time_stamps[1];
+ dt1 = now - time_stamps[3];
+
+ i_min0 += dt0 > dt_min0;
+ i_min1 += dt1 > dt_min1;
+
+ dt_min0 = i_min0 > 0 ? dt0 : dt_min0;
+ dt_min1 = i_min1 > 0 ? dt1 : dt_min1;
+
+ return dt_min0 > dt_min1 ? i_min0 : (2 + i_min1);
+}
+
+static_always_inline uword
+tcp_round_trip_time_stats_is_valid (tcp_round_trip_time_stats_t * s)
+{ return s->count > 0; }
+
+static_always_inline void
+tcp_round_trip_time_stats_compute (tcp_round_trip_time_stats_t * s, f64 * r)
+{
+ f64 ave, rms;
+ ASSERT (s->count > 0);
+ ave = s->sum / s->count;
+ rms = sqrt (s->sum2 / s->count - ave*ave);
+ r[0] = ave;
+ r[1] = rms;
+}
+
+typedef struct {
+ tcp_option_type_t type : 8;
+ u8 length;
+ u32 my_time_stamp, his_time_stamp;
+} __attribute__ ((packed)) tcp_time_stamp_option_t;
+
+typedef struct {
+ tcp_header_t header;
+
+ struct {
+ struct {
+ tcp_option_type_t type : 8;
+ u8 length;
+ u16 value;
+ } mss;
+
+ struct {
+ tcp_option_type_t type : 8;
+ u8 length;
+ u8 value;
+ } __attribute__ ((packed)) window_scale;
+
+ u8 nops[3];
+
+ tcp_time_stamp_option_t time_stamp;
+ } __attribute__ ((packed)) options;
+} __attribute__ ((packed)) tcp_syn_packet_t;
+
+typedef struct {
+ tcp_header_t header;
+
+ struct {
+ u8 nops[2];
+
+ tcp_time_stamp_option_t time_stamp;
+ } options;
+} __attribute__ ((packed)) tcp_ack_packet_t;
+
+typedef struct {
+ ip4_header_t ip4;
+ tcp_syn_packet_t tcp;
+} ip4_tcp_syn_packet_t;
+
+typedef struct {
+ ip4_header_t ip4;
+ tcp_ack_packet_t tcp;
+} ip4_tcp_ack_packet_t;
+
+typedef struct {
+ ip6_header_t ip6;
+ tcp_syn_packet_t tcp;
+} ip6_tcp_syn_packet_t;
+
+typedef struct {
+ ip6_header_t ip6;
+ tcp_ack_packet_t tcp;
+} ip6_tcp_ack_packet_t;
+
+static_always_inline void
+ip4_tcp_packet_init (ip4_header_t * ip, u32 n_bytes)
+{
+ ip->ip_version_and_header_length = 0x45;
+
+ ip->tos = ip4_main.host_config.tos;
+ ip->ttl = ip4_main.host_config.ttl;
+
+ /* No need to set fragment ID due to DF bit. */
+ ip->flags_and_fragment_offset = clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT);
+
+ ip->protocol = IP_PROTOCOL_TCP;
+
+ ip->length = clib_host_to_net_u16 (n_bytes);
+
+ ip->checksum = ip4_header_checksum (ip);
+}
+
+static_always_inline void
+ip6_tcp_packet_init (ip6_header_t * ip, u32 n_bytes)
+{
+  ip->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28);
+
+  ip->payload_length = clib_host_to_net_u16 (n_bytes - sizeof (ip[0]));
+
+  /* Next-header must be TCP; the checksum seeds computed later also
+     fold this field into the pseudo-header sum. */
+  ip->protocol = IP_PROTOCOL_TCP;
+
+  ip->hop_limit = ip6_main.host_config.ttl;
+}
+
+static_always_inline u32
+tcp_time_now (tcp_main_t * tm, tcp_timer_type_t t)
+{
+ ASSERT (t < ARRAY_LEN (tm->log2_clocks_per_tick));
+ return clib_cpu_time_now () >> tm->log2_clocks_per_tick[t];
+}
+
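+/* Pick, for each timer, the power-of-two number of CPU clocks per
+   tick nearest the configured tick period:
+   log2_clocks_per_tick = round (log2 (tick_seconds / seconds_per_clock)),
+   so tcp_time_now reduces to a single shift of the cycle counter.
+   For example (assuming a 2.5 GHz clock, purely illustrative), a
+   100 ms tick gives round (log2 (0.1 / 4e-10)) = 28. */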
+static void
+tcp_time_init (vlib_main_t * vm, tcp_main_t * tm)
+{
+ int i;
+ f64 log2 = .69314718055994530941;
+
+ for (i = 0; i < ARRAY_LEN (tm->log2_clocks_per_tick); i++)
+ {
+ static f64 t[] = {
+#define _(f,r) r,
+ foreach_tcp_timer
+#undef _
+ };
+ tm->log2_clocks_per_tick[i] =
+ flt_round_nearest (log (t[i] / vm->clib_time.seconds_per_clock) / log2);
+ tm->secs_per_tick[i] = vm->clib_time.seconds_per_clock * (1 << tm->log2_clocks_per_tick[i]);
+ }
+}
+
+tcp_main_t tcp_main;
+
+typedef enum {
+ TCP_LOOKUP_NEXT_DROP,
+ TCP_LOOKUP_NEXT_PUNT,
+ TCP_LOOKUP_NEXT_LISTEN_SYN,
+ TCP_LOOKUP_NEXT_LISTEN_ACK,
+ TCP_LOOKUP_NEXT_CONNECT_SYN_ACK,
+ TCP_LOOKUP_NEXT_ESTABLISHED,
+ TCP_LOOKUP_N_NEXT,
+} tcp_lookup_next_t;
+
+#define foreach_tcp_error \
+ _ (NONE, "no error") \
+ _ (LOOKUP_DROPS, "lookup drops") \
+ _ (LISTEN_RESPONSES, "listen responses sent") \
+ _ (CONNECTS_SENT, "connects sent") \
+ _ (LISTENS_ESTABLISHED, "listens connected") \
+ _ (UNEXPECTED_SEQ_NUMBER, "unexpected sequence number drops") \
+ _ (UNEXPECTED_ACK_NUMBER, "unexpected acknowledgment number drops") \
+ _ (CONNECTS_ESTABLISHED, "connects established") \
+ _ (NO_LISTENER_FOR_PORT, "no listener for port") \
+ _ (WRONG_LOCAL_ADDRESS_FOR_PORT, "wrong local address for port") \
+ _ (ACKS_SENT, "acks sent for established connections") \
+ _ (NO_DATA, "acks with no data") \
+ _ (FINS_RECEIVED, "fins received") \
+ _ (SEGMENT_AFTER_FIN, "segments dropped after fin received") \
+ _ (CONNECTIONS_CLOSED, "connections closed")
+
+typedef enum {
+#define _(sym,str) TCP_ERROR_##sym,
+ foreach_tcp_error
+#undef _
+ TCP_N_ERROR,
+} tcp_error_t;
+
+#ifdef TCP_HAVE_VEC128
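+/* Helpers for computing two hashes at once: lane 0 of each vector
+   carries the mini-connection hash stream, lane 1 the established-
+   connection stream (see u32x4_get0 (c0) / u32x4_get (c0, 1) in the
+   lookup below). */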
+static_always_inline u32x4 u32x4_splat_x2 (u32 x)
+{
+ u32x4 r = u32x4_set0 (x);
+ return u32x4_interleave_lo (r, r);
+}
+
+static_always_inline u32x4 u32x4_set_x2 (u32 x, u32 y)
+{
+ u32x4 r0 = u32x4_set0 (x);
+ u32x4 r1 = u32x4_set0 (y);
+ return u32x4_interleave_lo (r0, r1);
+}
+
+/* FIXME */
+#define u32x4_get(x,i) \
+ __builtin_ia32_vec_ext_v4si ((i32x4) (x), (int) (i))
+#else /* TCP_HAVE_VEC128 */
+#endif /* TCP_HAVE_VEC128 */
+
+/* Dispatching on tcp/udp listeners (by dst port)
+ and tcp/udp connections (by src/dst address/port). */
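+/* Two parallel 4-way set-associative tables are consulted per packet:
+   a mini-connection table for embryonic connections, whose buckets
+   carry time stamps so the oldest slot can be evicted, and an
+   established-connection table whose buckets fill at the first empty
+   slot.  Both bucket indices come from one hash over the src/dst
+   address/port tuple; overflow of a full established bucket is not
+   yet implemented (see the ASSERTs below). */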
+static_always_inline uword
+ip46_tcp_lookup (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ uword is_ip6)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next, mini_now;
+ vlib_node_runtime_t * error_node = node;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next = node->cached_next_index;
+ mini_now = tcp_time_now (tm, TCP_TIMER_mini_connection);
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip60;
+ ip4_header_t * ip40;
+ tcp_header_t * tcp0;
+ u32 bi0, imin0, iest0, li0;
+ tcp_connection_state_t state0;
+ u8 error0, next0;
+ u8 min_match0, est_match0, is_min_match0, is_est_match0;
+ u8 min_oldest0, est_first_empty0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+
+#ifdef TCP_HAVE_VEC128
+ {
+ u32x4 a0, b0, c0;
+
+ a0 = tm->connection_hash_seeds[is_ip6][0].as_u32x4;
+ b0 = tm->connection_hash_seeds[is_ip6][1].as_u32x4;
+ c0 = tm->connection_hash_seeds[is_ip6][2].as_u32x4;
+
+ if (is_ip6)
+ {
+ ip60 = vlib_buffer_get_current (p0);
+ tcp0 = ip6_next_header (ip60);
+
+ a0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[0]);
+ b0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[1]);
+ c0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[2]);
+
+ hash_v3_mix_u32x (a0, b0, c0);
+
+ a0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[3]);
+ b0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[0]);
+ c0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[1]);
+
+ hash_v3_mix_u32x (a0, b0, c0);
+
+ a0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[2]);
+ b0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[3]);
+ c0 ^= u32x4_splat_x2 (tcp0->ports.src_and_dst);
+ }
+ else
+ {
+ ip40 = vlib_buffer_get_current (p0);
+ tcp0 = ip4_next_header (ip40);
+
+ a0 ^= u32x4_splat_x2 (ip40->src_address.as_u32);
+ b0 ^= u32x4_splat_x2 (ip40->dst_address.as_u32);
+ c0 ^= u32x4_splat_x2 (tcp0->ports.src_and_dst);
+ }
+
+ hash_v3_finalize_u32x (a0, b0, c0);
+
+ c0 &= tm->connection_hash_masks[is_ip6].as_u32x4;
+
+ imin0 = u32x4_get0 (c0);
+ iest0 = u32x4_get (c0, 1);
+ }
+#else
+ {
+ u32 a00, a01, b00, b01, c00, c01;
+
+ a00 = tm->connection_hash_seeds[is_ip6][0].as_u32[0];
+ a01 = tm->connection_hash_seeds[is_ip6][0].as_u32[1];
+ b00 = tm->connection_hash_seeds[is_ip6][1].as_u32[0];
+ b01 = tm->connection_hash_seeds[is_ip6][1].as_u32[1];
+ c00 = tm->connection_hash_seeds[is_ip6][2].as_u32[0];
+ c01 = tm->connection_hash_seeds[is_ip6][2].as_u32[1];
+
+ if (is_ip6)
+ {
+ ip60 = vlib_buffer_get_current (p0);
+ tcp0 = ip6_next_header (ip60);
+
+ a00 ^= ip60->src_address.as_u32[0];
+ a01 ^= ip60->src_address.as_u32[0];
+ b00 ^= ip60->src_address.as_u32[1];
+ b01 ^= ip60->src_address.as_u32[1];
+ c00 ^= ip60->src_address.as_u32[2];
+ c01 ^= ip60->src_address.as_u32[2];
+
+ hash_v3_mix32 (a00, b00, c00);
+ hash_v3_mix32 (a01, b01, c01);
+
+ a00 ^= ip60->src_address.as_u32[3];
+ a01 ^= ip60->src_address.as_u32[3];
+ b00 ^= ip60->dst_address.as_u32[0];
+ b01 ^= ip60->dst_address.as_u32[0];
+ c00 ^= ip60->dst_address.as_u32[1];
+ c01 ^= ip60->dst_address.as_u32[1];
+
+ hash_v3_mix32 (a00, b00, c00);
+ hash_v3_mix32 (a01, b01, c01);
+
+ a00 ^= ip60->dst_address.as_u32[2];
+ a01 ^= ip60->dst_address.as_u32[2];
+ b00 ^= ip60->dst_address.as_u32[3];
+ b01 ^= ip60->dst_address.as_u32[3];
+ c00 ^= tcp0->ports.src_and_dst;
+ c01 ^= tcp0->ports.src_and_dst;
+ }
+ else
+ {
+ ip40 = vlib_buffer_get_current (p0);
+ tcp0 = ip4_next_header (ip40);
+
+ a00 ^= ip40->src_address.as_u32;
+ a01 ^= ip40->src_address.as_u32;
+ b00 ^= ip40->dst_address.as_u32;
+ b01 ^= ip40->dst_address.as_u32;
+ c00 ^= tcp0->ports.src_and_dst;
+ c01 ^= tcp0->ports.src_and_dst;
+ }
+
+ hash_v3_finalize32 (a00, b00, c00);
+ hash_v3_finalize32 (a01, b01, c01);
+
+ c00 &= tm->connection_hash_masks[is_ip6].as_u32[0];
+ c01 &= tm->connection_hash_masks[is_ip6].as_u32[1];
+
+ imin0 = c00;
+ iest0 = c01;
+ }
+#endif
+
+ if (is_ip6)
+ {
+ ip6_tcp_udp_address_x4_and_timestamps_t * mina0;
+ ip6_tcp_udp_address_x4_t * esta0;
+
+ mina0 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin0);
+ esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest0);
+
+ min_match0 = ip6_tcp_udp_address_x4_match (&mina0->address_x4, ip60, tcp0);
+ est_match0 = ip6_tcp_udp_address_x4_match (esta0, ip60, tcp0);
+
+ min_oldest0 = find_oldest_timestamp_x4 (mina0->time_stamps, mini_now);
+ est_first_empty0 = ip6_tcp_udp_address_x4_first_empty (esta0);
+
+	      if (PREDICT_FALSE (est_match0 >= 4 && est_first_empty0 >= 4 && min_match0 >= 4))
+ {
+ /* Lookup in overflow hash. */
+ ASSERT (0);
+ }
+ }
+ else
+ {
+ ip4_tcp_udp_address_x4_and_timestamps_t * mina0;
+ ip4_tcp_udp_address_x4_t * esta0;
+
+ mina0 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin0);
+ esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest0);
+
+ min_match0 = ip4_tcp_udp_address_x4_match (&mina0->address_x4, ip40, tcp0);
+ est_match0 = ip4_tcp_udp_address_x4_match (esta0, ip40, tcp0);
+
+ min_oldest0 = find_oldest_timestamp_x4 (mina0->time_stamps, mini_now);
+ est_first_empty0 = ip4_tcp_udp_address_x4_first_empty (esta0);
+
+	      if (PREDICT_FALSE (est_match0 >= 4 && est_first_empty0 >= 4 && min_match0 >= 4))
+ {
+ /* Lookup in overflow hash. */
+ ASSERT (0);
+ }
+ }
+
+ is_min_match0 = min_match0 < 4;
+ is_est_match0 = est_match0 < 4;
+
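+	  /* Convert bucket index to connection index (4 slots per
+	     bucket): use the matching slot when found, otherwise the
+	     oldest mini slot (eviction candidate) or the first empty
+	     established slot (insertion candidate). */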
+ imin0 = 4 * imin0 + (is_min_match0 ? min_match0 : min_oldest0);
+ iest0 = 4 * iest0 + (is_est_match0 ? est_match0 : est_first_empty0);
+
+ /* Should simultaneously not match both in mini and established connection tables. */
+ ASSERT (! (is_min_match0 && is_est_match0));
+
+ {
+ tcp_mini_connection_t * min0;
+ tcp_connection_t * est0;
+ tcp_sequence_pair_t * seq_pair0;
+ u8 flags0;
+
+ min0 = vec_elt_at_index (tm46->mini_connections, imin0);
+ est0 = vec_elt_at_index (tm46->established_connections, iest0);
+
+ if (min_match0 < 4)
+ {
+ ASSERT (min0->state != TCP_CONNECTION_STATE_unused);
+ ASSERT (min0->state != TCP_CONNECTION_STATE_established);
+ }
+
+ seq_pair0 = is_min_match0 ? &min0->sequence_numbers : &est0->sequence_numbers;
+
+ state0 = is_min_match0 ? min0->state : TCP_CONNECTION_STATE_unused;
+ state0 = is_est_match0 ? TCP_CONNECTION_STATE_established : state0;
+
+ vnet_buffer (p0)->ip.tcp.established_connection_index = iest0;
+ vnet_buffer (p0)->ip.tcp.mini_connection_index = imin0;
+ vnet_buffer (p0)->ip.tcp.listener_index = li0 = tm->listener_index_by_dst_port[tcp0->ports.dst];
+
+ flags0 = tcp0->flags & (TCP_FLAG_SYN | TCP_FLAG_ACK | TCP_FLAG_RST | TCP_FLAG_FIN);
+
+ next0 = tm->disposition_by_state_and_flags[state0][flags0].next;
+ error0 = tm->disposition_by_state_and_flags[state0][flags0].error;
+
+ next0 = li0 != 0 ? next0 : TCP_LOOKUP_NEXT_PUNT;
+ error0 = li0 != 0 ? error0 : TCP_ERROR_NO_LISTENER_FOR_PORT;
+ }
+
+ p0->error = error_node->errors[error0];
+
+ if (PREDICT_FALSE (next0 != next))
+ {
+ to_next -= 1;
+ n_left_to_next += 1;
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+
+ next = next0;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ /* FIXME */ ;
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_tcp_lookup (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_lookup (vm, node, frame, /* is_ip6 */ 0); }
+
+static uword
+ip6_tcp_lookup (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_lookup (vm, node, frame, /* is_ip6 */ 1); }
+
+static void
+ip46_size_hash_tables (ip46_tcp_main_t * m)
+{
+ m->mini_connection_hash_mask = pow2_mask (m->log2_n_mini_connection_hash_elts);
+ vec_validate_aligned (m->mini_connections,
+ m->mini_connection_hash_mask,
+ CLIB_CACHE_LINE_BYTES);
+
+ m->established_connection_hash_mask = pow2_mask (m->log2_n_established_connection_hash_elts);
+ vec_validate_aligned (m->established_connections,
+ m->established_connection_hash_mask,
+ CLIB_CACHE_LINE_BYTES);
+}
+
+static void
+ip46_tcp_lookup_init (vlib_main_t * vm, tcp_main_t * tm, int is_ip6)
+{
+ ip46_tcp_main_t * m = is_ip6 ? &tm->ip6 : &tm->ip4;
+
+ m->is_ip6 = is_ip6;
+
+ m->log2_n_mini_connection_hash_elts = 8;
+ m->log2_n_established_connection_hash_elts = 8;
+ ip46_size_hash_tables (m);
+
+ if (is_ip6)
+ {
+ vec_validate_aligned (tm->ip6_mini_connection_address_hash,
+ m->mini_connection_hash_mask / 4,
+ CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (tm->ip6_established_connection_address_hash,
+ m->established_connection_hash_mask / 4,
+ CLIB_CACHE_LINE_BYTES);
+ }
+ else
+ {
+ vec_validate_aligned (tm->ip4_mini_connection_address_hash,
+ m->mini_connection_hash_mask / 4,
+ CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (tm->ip4_established_connection_address_hash,
+ m->established_connection_hash_mask / 4,
+ CLIB_CACHE_LINE_BYTES);
+ }
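+  /* The lookup path derives both bucket indices from one hash, so the
+     masks are stored per family with lane 0 = mini table and lane 1 =
+     established table, in units of 4-entry buckets (hence the / 4). */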
+ tm->connection_hash_masks[is_ip6].as_u32[0] = m->mini_connection_hash_mask / 4;
+ tm->connection_hash_masks[is_ip6].as_u32[1] = m->established_connection_hash_mask / 4;
+}
+
+static void
+tcp_lookup_init (vlib_main_t * vm, tcp_main_t * tm)
+{
+ int is_ip6;
+
+ /* Initialize hash seeds. */
+ for (is_ip6 = 0; is_ip6 < 2; is_ip6++)
+ {
+ u32 * r = clib_random_buffer_get_data (&vm->random_buffer, 3 * 2 * sizeof (r[0]));
+ tm->connection_hash_seeds[is_ip6][0].as_u32[0] = r[0];
+ tm->connection_hash_seeds[is_ip6][0].as_u32[1] = r[1];
+ tm->connection_hash_seeds[is_ip6][1].as_u32[0] = r[2];
+ tm->connection_hash_seeds[is_ip6][1].as_u32[1] = r[3];
+ tm->connection_hash_seeds[is_ip6][2].as_u32[0] = r[4];
+ tm->connection_hash_seeds[is_ip6][2].as_u32[1] = r[5];
+
+ ip46_tcp_lookup_init (vm, tm, is_ip6);
+ }
+
+ {
+ tcp_listener_t * l;
+
+ pool_get_aligned (tm->listener_pool, l, CLIB_CACHE_LINE_BYTES);
+
+ /* Null listener must always have zero index. */
+ ASSERT (l - tm->listener_pool == 0);
+
+ memset (l, 0, sizeof (l[0]));
+
+ /* No adjacencies are valid. */
+ l->valid_local_adjacency_bitmap = 0;
+
+ vec_validate_init_empty (tm->listener_index_by_dst_port,
+ (1 << 16) - 1,
+ l - tm->listener_pool);
+ }
+
+ /* Initialize disposition table. */
+ {
+ int i, j;
+ for (i = 0; i < ARRAY_LEN (tm->disposition_by_state_and_flags); i++)
+ for (j = 0; j < ARRAY_LEN (tm->disposition_by_state_and_flags[i]); j++)
+ {
+ tm->disposition_by_state_and_flags[i][j].next = TCP_LOOKUP_NEXT_DROP;
+ tm->disposition_by_state_and_flags[i][j].error = TCP_ERROR_LOOKUP_DROPS;
+ }
+
+#define _(t,f,n,e) \
+do { \
+ tm->disposition_by_state_and_flags[TCP_CONNECTION_STATE_##t][f].next = (n); \
+ tm->disposition_by_state_and_flags[TCP_CONNECTION_STATE_##t][f].error = (e); \
+} while (0)
+
+ /* SYNs for new connections -> tcp-listen. */
+ _ (unused, TCP_FLAG_SYN,
+ TCP_LOOKUP_NEXT_LISTEN_SYN, TCP_ERROR_NONE);
+ _ (listen_ack_wait, TCP_FLAG_ACK,
+ TCP_LOOKUP_NEXT_LISTEN_ACK, TCP_ERROR_NONE);
+ _ (established, TCP_FLAG_ACK,
+ TCP_LOOKUP_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+ _ (established, TCP_FLAG_FIN | TCP_FLAG_ACK,
+ TCP_LOOKUP_NEXT_ESTABLISHED, TCP_ERROR_NONE);
+
+#undef _
+ }
+
+ /* IP4 packet templates. */
+ {
+ ip4_tcp_syn_packet_t ip4_syn, ip4_syn_ack;
+ ip4_tcp_ack_packet_t ip4_ack, ip4_fin_ack, ip4_rst_ack;
+ ip6_tcp_syn_packet_t ip6_syn, ip6_syn_ack;
+ ip6_tcp_ack_packet_t ip6_ack, ip6_fin_ack, ip6_rst_ack;
+
+ memset (&ip4_syn, 0, sizeof (ip4_syn));
+ memset (&ip4_syn_ack, 0, sizeof (ip4_syn_ack));
+ memset (&ip4_ack, 0, sizeof (ip4_ack));
+ memset (&ip4_fin_ack, 0, sizeof (ip4_fin_ack));
+ memset (&ip4_rst_ack, 0, sizeof (ip4_rst_ack));
+ memset (&ip6_syn, 0, sizeof (ip6_syn));
+ memset (&ip6_syn_ack, 0, sizeof (ip6_syn_ack));
+ memset (&ip6_ack, 0, sizeof (ip6_ack));
+ memset (&ip6_fin_ack, 0, sizeof (ip6_fin_ack));
+ memset (&ip6_rst_ack, 0, sizeof (ip6_rst_ack));
+
+ ip4_tcp_packet_init (&ip4_syn.ip4, sizeof (ip4_syn));
+ ip4_tcp_packet_init (&ip4_syn_ack.ip4, sizeof (ip4_syn_ack));
+ ip4_tcp_packet_init (&ip4_ack.ip4, sizeof (ip4_ack));
+ ip4_tcp_packet_init (&ip4_fin_ack.ip4, sizeof (ip4_fin_ack));
+ ip4_tcp_packet_init (&ip4_rst_ack.ip4, sizeof (ip4_rst_ack));
+
+ ip6_tcp_packet_init (&ip6_syn.ip6, sizeof (ip6_syn));
+ ip6_tcp_packet_init (&ip6_syn_ack.ip6, sizeof (ip6_syn_ack));
+ ip6_tcp_packet_init (&ip6_ack.ip6, sizeof (ip6_ack));
+ ip6_tcp_packet_init (&ip6_fin_ack.ip6, sizeof (ip6_fin_ack));
+ ip6_tcp_packet_init (&ip6_rst_ack.ip6, sizeof (ip6_rst_ack));
+
+ /* TCP header. */
+ {
+ u8 window_scale = 7;
+ tcp_syn_packet_t * s = &ip4_syn.tcp;
+ tcp_syn_packet_t * sa = &ip4_syn_ack.tcp;
+ tcp_ack_packet_t * a = &ip4_ack.tcp;
+ tcp_ack_packet_t * fa = &ip4_fin_ack.tcp;
+ tcp_ack_packet_t * ra = &ip4_rst_ack.tcp;
+
+ s->header.tcp_header_u32s_and_reserved = (sizeof (s[0]) / sizeof (u32)) << 4;
+ a->header.tcp_header_u32s_and_reserved = (sizeof (a[0]) / sizeof (u32)) << 4;
+
+ s->header.flags = TCP_FLAG_SYN;
+ a->header.flags = TCP_FLAG_ACK;
+
+ s->header.window = clib_host_to_net_u16 (32 << (10 - window_scale));
+ a->header.window = s->header.window;
+
+ s->options.mss.type = TCP_OPTION_MSS;
+ s->options.mss.length = 4;
+
+ s->options.window_scale.type = TCP_OPTION_WINDOW_SCALE;
+ s->options.window_scale.length = 3;
+ s->options.window_scale.value = window_scale;
+
+ s->options.time_stamp.type = TCP_OPTION_TIME_STAMP;
+ s->options.time_stamp.length = 10;
+
+ memset (&s->options.nops, TCP_OPTION_NOP, sizeof (s->options.nops));
+
+ /* SYN-ACK is same as SYN but with ACK flag set. */
+ sa[0] = s[0];
+ sa->header.flags |= TCP_FLAG_ACK;
+
+ a->options.time_stamp.type = TCP_OPTION_TIME_STAMP;
+ a->options.time_stamp.length = 10;
+ memset (&a->options.nops, TCP_OPTION_NOP, sizeof (a->options.nops));
+
+ /* {FIN,RST}-ACK are same as ACK but with {FIN,RST} flag set. */
+ fa[0] = a[0];
+ fa->header.flags |= TCP_FLAG_FIN;
+ ra[0] = a[0];
+ ra->header.flags |= TCP_FLAG_RST;
+
+ /* IP6 TCP headers are identical. */
+ ip6_syn.tcp = s[0];
+ ip6_syn_ack.tcp = sa[0];
+ ip6_ack.tcp = a[0];
+ ip6_fin_ack.tcp = fa[0];
+ ip6_rst_ack.tcp = ra[0];
+
+ /* TCP checksums. */
+ {
+ ip_csum_t sum;
+
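+      /* Precompute per template the checksum over everything constant:
+	 the pseudo-header length/protocol contribution plus the static
+	 TCP header and options.  At send time only the per-connection
+	 fields (addresses, ports, sequence/ack numbers, time stamps)
+	 are folded in with ip_csum_add_even. */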
+ sum = clib_host_to_net_u32 (sizeof (ip4_ack.tcp) + (ip4_ack.ip4.protocol << 16));
+ sum = ip_incremental_checksum (sum, &ip4_ack.tcp, sizeof (ip4_ack.tcp));
+ ip4_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip4_fin_ack.tcp) + (ip4_fin_ack.ip4.protocol << 16));
+ sum = ip_incremental_checksum (sum, &ip4_fin_ack.tcp, sizeof (ip4_fin_ack.tcp));
+ ip4_fin_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip4_rst_ack.tcp) + (ip4_rst_ack.ip4.protocol << 16));
+ sum = ip_incremental_checksum (sum, &ip4_rst_ack.tcp, sizeof (ip4_rst_ack.tcp));
+ ip4_rst_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip4_syn.tcp) + (ip4_syn.ip4.protocol << 16));
+ sum = ip_incremental_checksum (sum, &ip4_syn.tcp, sizeof (ip4_syn.tcp));
+ ip4_syn.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip4_syn_ack.tcp) + (ip4_syn_ack.ip4.protocol << 16));
+ sum = ip_incremental_checksum (sum, &ip4_syn_ack.tcp, sizeof (ip4_syn_ack.tcp));
+ ip4_syn_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip6_ack.tcp)) + ip6_ack.ip6.protocol;
+ sum = ip_incremental_checksum (sum, &ip6_ack.tcp, sizeof (ip6_ack.tcp));
+ ip6_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip6_fin_ack.tcp)) + ip6_fin_ack.ip6.protocol;
+ sum = ip_incremental_checksum (sum, &ip6_fin_ack.tcp, sizeof (ip6_fin_ack.tcp));
+ ip6_fin_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip6_rst_ack.tcp)) + ip6_rst_ack.ip6.protocol;
+ sum = ip_incremental_checksum (sum, &ip6_rst_ack.tcp, sizeof (ip6_rst_ack.tcp));
+ ip6_rst_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip6_syn.tcp)) + ip6_syn.ip6.protocol;
+ sum = ip_incremental_checksum (sum, &ip6_syn.tcp, sizeof (ip6_syn.tcp));
+ ip6_syn.tcp.header.checksum = ~ ip_csum_fold (sum);
+
+ sum = clib_host_to_net_u32 (sizeof (ip6_syn_ack.tcp)) + ip6_syn_ack.ip6.protocol;
+ sum = ip_incremental_checksum (sum, &ip6_syn_ack.tcp, sizeof (ip6_syn_ack.tcp));
+ ip6_syn_ack.tcp.header.checksum = ~ ip_csum_fold (sum);
+ }
+ }
+
+#define _(t,x,n) \
+do { \
+ vlib_packet_template_init \
+ (vm, \
+ &tm->ip4.packet_templates[t].vlib, \
+ &x, sizeof (x), \
+ /* alloc chunk size */ VLIB_FRAME_SIZE, \
+ (n)); \
+ tm->ip4.packet_templates[t].tcp_checksum_net_byte_order \
+ = x.tcp.header.checksum; \
+ tm->ip4.packet_templates[t].ip4_checksum_net_byte_order \
+ = x.ip4.checksum; \
+} while (0)
+
+ _ (TCP_PACKET_TEMPLATE_SYN, ip4_syn, "ip4 tcp syn");
+ _ (TCP_PACKET_TEMPLATE_SYN_ACK, ip4_syn_ack, "ip4 tcp syn-ack");
+ _ (TCP_PACKET_TEMPLATE_ACK, ip4_ack, "ip4 tcp ack");
+ _ (TCP_PACKET_TEMPLATE_FIN_ACK, ip4_fin_ack, "ip4 tcp fin-ack");
+ _ (TCP_PACKET_TEMPLATE_RST_ACK, ip4_rst_ack, "ip4 tcp rst-ack");
+
+#undef _
+
+#define _(t,x,n) \
+do { \
+ vlib_packet_template_init \
+ (vm, \
+ &tm->ip6.packet_templates[t].vlib, \
+ &x, sizeof (x), \
+ /* alloc chunk size */ VLIB_FRAME_SIZE, \
+ (n)); \
+ tm->ip6.packet_templates[t].tcp_checksum_net_byte_order \
+ = x.tcp.header.checksum; \
+ tm->ip6.packet_templates[t].ip4_checksum_net_byte_order \
+ = 0xdead; \
+} while (0)
+
+ _ (TCP_PACKET_TEMPLATE_SYN, ip6_syn, "ip6 tcp syn");
+ _ (TCP_PACKET_TEMPLATE_SYN_ACK, ip6_syn_ack, "ip6 tcp syn-ack");
+ _ (TCP_PACKET_TEMPLATE_ACK, ip6_ack, "ip6 tcp ack");
+ _ (TCP_PACKET_TEMPLATE_FIN_ACK, ip6_fin_ack, "ip6 tcp fin-ack");
+ _ (TCP_PACKET_TEMPLATE_RST_ACK, ip6_rst_ack, "ip6 tcp rst-ack");
+
+#undef _
+ }
+}
+
+static char * tcp_error_strings[] = {
+#define _(sym,string) string,
+ foreach_tcp_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (ip4_tcp_lookup_node,static) = {
+ .function = ip4_tcp_lookup,
+ .name = "ip4-tcp-lookup",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [TCP_LOOKUP_NEXT_DROP] = "error-drop",
+ [TCP_LOOKUP_NEXT_PUNT] = "error-punt",
+ [TCP_LOOKUP_NEXT_LISTEN_SYN] = "ip4-tcp-listen",
+ [TCP_LOOKUP_NEXT_LISTEN_ACK] = "ip4-tcp-establish",
+ [TCP_LOOKUP_NEXT_CONNECT_SYN_ACK] = "ip4-tcp-connect",
+ [TCP_LOOKUP_NEXT_ESTABLISHED] = "ip4-tcp-established",
+ },
+
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+};
+
+VLIB_REGISTER_NODE (ip6_tcp_lookup_node,static) = {
+ .function = ip6_tcp_lookup,
+ .name = "ip6-tcp-lookup",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_LOOKUP_N_NEXT,
+ .next_nodes = {
+ [TCP_LOOKUP_NEXT_DROP] = "error-drop",
+ [TCP_LOOKUP_NEXT_PUNT] = "error-punt",
+ [TCP_LOOKUP_NEXT_LISTEN_SYN] = "ip6-tcp-listen",
+    [TCP_LOOKUP_NEXT_LISTEN_ACK] = "ip6-tcp-establish",
+ [TCP_LOOKUP_NEXT_CONNECT_SYN_ACK] = "ip6-tcp-connect",
+ [TCP_LOOKUP_NEXT_ESTABLISHED] = "ip6-tcp-established",
+ },
+
+ .n_errors = TCP_N_ERROR,
+ .error_strings = tcp_error_strings,
+};
+
+static_always_inline void
+tcp_options_decode_for_syn (tcp_main_t * tm, tcp_mini_connection_t * m, tcp_header_t * tcp)
+{
+ u8 * o = (void *) (tcp + 1);
+ u32 n_bytes = (tcp->tcp_header_u32s_and_reserved >> 4) * sizeof (u32);
+ u8 * e = o + n_bytes;
+ tcp_mini_connection_t * tmpl = &tm->option_decode_mini_connection_template;
+ tcp_option_type_t t;
+ u8 i, l, * p;
+ u8 * option_decode[16];
+
+ /* Initialize defaults. */
+ option_decode[TCP_OPTION_MSS] = (u8 *) &tmpl->max_segment_size;
+ option_decode[TCP_OPTION_WINDOW_SCALE] = (u8 *) &tmpl->window_scale;
+ option_decode[TCP_OPTION_TIME_STAMP] = (u8 *) &tmpl->time_stamps.his_net_byte_order;
+
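+  /* Unrolled option walk (no loop): each `_' consumes one option.
+     Unknown kinds >= ARRAY_LEN (option_decode) alias into slot
+     TCP_OPTION_END, which is never read back; options absent from the
+     packet leave their pointer at the defaults initialized above. */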
+ if (n_bytes > 0)
+ {
+#define _ \
+do { \
+ t = o[0]; \
+ i = t >= ARRAY_LEN (option_decode) ? TCP_OPTION_END : t; \
+ option_decode[i] = o + 2; \
+ /* Skip nop; don't skip end; else length from packet. */ \
+ l = t < 2 ? t : o[1]; \
+ p = o + l; \
+ o = p < e ? p : o; \
+} while (0)
+
+ _; _; _;
+ /* Fast path: NOP NOP TIMESTAMP. */
+ if (o >= e) goto done;
+ _; _;
+ if (o >= e) goto done;
+ _; _; _;
+
+#undef _
+
+ done:;
+ }
+
+ m->max_segment_size =
+ clib_net_to_host_u16 (*(u16 *) option_decode[TCP_OPTION_MSS]);
+ m->window_scale = *option_decode[TCP_OPTION_WINDOW_SCALE];
+ m->time_stamps.his_net_byte_order = ((u32 *) option_decode[TCP_OPTION_TIME_STAMP])[0];
+}
+
+static_always_inline u32
+tcp_options_decode_for_ack (tcp_main_t * tm, tcp_header_t * tcp,
+ u32 * his_time_stamp)
+{
+ u8 * o = (void *) (tcp + 1);
+ u32 n_bytes = (tcp->tcp_header_u32s_and_reserved >> 4) * sizeof (u32);
+ u8 * e = o + n_bytes;
+ tcp_option_type_t t;
+ u8 i, l, * p;
+ u8 * option_decode[16];
+ u32 default_time_stamps[2];
+
+ /* Initialize defaults. */
+ default_time_stamps[0] = default_time_stamps[1] = 0;
+ option_decode[TCP_OPTION_TIME_STAMP] = (u8 *) &default_time_stamps;
+
+ if (n_bytes > 0)
+ {
+#define _ \
+do { \
+ t = o[0]; \
+ i = t >= ARRAY_LEN (option_decode) ? TCP_OPTION_END : t; \
+ option_decode[i] = o + 2; \
+ /* Skip nop; don't skip end; else length from packet. */ \
+ l = t < 2 ? t : o[1]; \
+ p = o + l; \
+ o = p < e ? p : o; \
+} while (0)
+
+ _; _; _;
+ /* Fast path: NOP NOP TIMESTAMP. */
+ if (o >= e) goto done;
+ _; _;
+ if (o >= e) goto done;
+ _; _; _;
+#undef _
+
+ done:;
+ }
+
+ if (his_time_stamp)
+ his_time_stamp[0] = ((u32 *) option_decode[TCP_OPTION_TIME_STAMP])[0];
+
+ return clib_net_to_host_u32 (((u32 *) option_decode[TCP_OPTION_TIME_STAMP])[1]);
+}
+
+static void
+tcp_options_decode_init (tcp_main_t * tm)
+{
+ tcp_mini_connection_t * m = &tm->option_decode_mini_connection_template;
+
+ memset (m, 0, sizeof (m[0]));
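+  /* Default MSS: 576 byte default datagram (RFC 1122) minus 40 bytes
+     of IP + TCP header = 536.  Kept in network byte order to match
+     option bytes read straight from packets. */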
+ m->max_segment_size = clib_host_to_net_u16 (576 - 40);
+ m->window_scale = 0;
+ m->time_stamps.his_net_byte_order = 0;
+}
+
+/* Initialize target buffer as "related" to given buffer. */
+always_inline void
+vlib_buffer_copy_shared_fields (vlib_main_t * vm, vlib_buffer_t * b, u32 bi_target)
+{
+ vlib_buffer_t * b_target = vlib_get_buffer (vm, bi_target);
+ vnet_buffer (b_target)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_RX];
+ b_target->trace_index = b->trace_index;
+ b_target->flags |= b->flags & VLIB_BUFFER_IS_TRACED;
+}
+
+typedef enum {
+ TCP_LISTEN_NEXT_DROP,
+ TCP_LISTEN_NEXT_REPLY,
+ TCP_LISTEN_N_NEXT,
+} tcp_listen_next_t;
+
+static_always_inline uword
+ip46_tcp_listen (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ uword is_ip6)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_reply, * to_drop, * random_ack_numbers;
+ u32 n_left_from, n_left_to_reply, n_left_to_drop, mini_now, timestamp_now;
+ u16 * fid, * fragment_ids;
+ vlib_node_runtime_t * error_node;
+
+ error_node = vlib_node_get_runtime
+ (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ mini_now = tcp_time_now (tm, TCP_TIMER_mini_connection);
+ timestamp_now = tcp_time_now (tm, TCP_TIMER_timestamp);
+
+ random_ack_numbers = clib_random_buffer_get_data (&vm->random_buffer,
+ n_packets * sizeof (random_ack_numbers[0]));
+ /* Get random fragment IDs for replies. */
+ fid = fragment_ids = clib_random_buffer_get_data (&vm->random_buffer,
+ n_packets * sizeof (fragment_ids[0]));
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, TCP_LISTEN_NEXT_REPLY,
+ to_reply, n_left_to_reply);
+ vlib_get_next_frame (vm, node, TCP_LISTEN_NEXT_DROP,
+ to_drop, n_left_to_drop);
+
+ while (n_left_from > 0 && n_left_to_reply > 0 && n_left_to_drop > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip60;
+ ip4_header_t * ip40;
+ tcp_header_t * tcp0;
+ tcp_mini_connection_t * min0;
+ tcp_syn_packet_t * tcp_reply0;
+ ip_csum_t tcp_sum0;
+ u32 bi0, bi_reply0, imin0, my_seq_net0, his_seq_host0, his_seq_net0;
+ u8 i0;
+
+ bi0 = to_drop[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_drop += 1;
+ n_left_to_drop -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+
+ p0->error = error_node->errors[TCP_ERROR_LISTEN_RESPONSES];
+
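+	  /* The lookup node encoded bucket * 4 + slot in the mini
+	     connection index; recover the slot within the bucket. */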
+ imin0 = vnet_buffer (p0)->ip.tcp.mini_connection_index;
+ i0 = imin0 % 4;
+
+ if (is_ip6)
+ {
+ ip6_tcp_udp_address_x4_and_timestamps_t * mina0;
+
+ ip60 = vlib_buffer_get_current (p0);
+ tcp0 = ip6_next_header (ip60);
+
+ mina0 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin0 / 4);
+
+ ip6_tcp_udp_address_x4_set_from_headers (&mina0->address_x4,
+ ip60, tcp0, i0);
+ mina0->time_stamps[i0] = mini_now;
+ }
+ else
+ {
+ ip4_tcp_udp_address_x4_and_timestamps_t * mina0;
+
+ ip40 = vlib_buffer_get_current (p0);
+ tcp0 = ip4_next_header (ip40);
+
+ mina0 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin0 / 4);
+
+ ip4_tcp_udp_address_x4_set_from_headers (&mina0->address_x4,
+ ip40, tcp0, i0);
+ mina0->time_stamps[i0] = mini_now;
+ }
+
+ min0 = vec_elt_at_index (tm46->mini_connections, imin0);
+
+ min0->state = TCP_CONNECTION_STATE_listen_ack_wait;
+ min0->time_stamps.ours_host_byte_order = timestamp_now;
+ tcp_options_decode_for_syn (tm, min0, tcp0);
+
+ my_seq_net0 = *random_ack_numbers++;
+ his_seq_host0 = 1 + clib_net_to_host_u32 (tcp0->seq_number);
+
+ min0->sequence_numbers.ours = 1 + clib_net_to_host_u32 (my_seq_net0);
+ min0->sequence_numbers.his = his_seq_host0;
+
+ if (is_ip6)
+ {
+ ip6_tcp_syn_packet_t * r0;
+ uword tmp0, i;
+
+ r0 = vlib_packet_template_get_packet
+ (vm,
+ &tm->ip6.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK].vlib,
+ &bi_reply0);
+ tcp_reply0 = &r0->tcp;
+
+ tcp_sum0 = (tm->ip6.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK]
+ .tcp_checksum_net_byte_order);
+
+ for (i = 0; i < ARRAY_LEN (ip60->dst_address.as_uword); i++)
+ {
+ tmp0 = r0->ip6.src_address.as_uword[i] = ip60->dst_address.as_uword[i];
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0);
+
+ tmp0 = r0->ip6.dst_address.as_uword[i] = ip60->src_address.as_uword[i];
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0);
+ }
+ }
+ else
+ {
+ ip4_tcp_syn_packet_t * r0;
+ ip_csum_t ip_sum0;
+ u32 src0, dst0;
+
+ r0 = vlib_packet_template_get_packet
+ (vm,
+ &tm->ip4.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK].vlib,
+ &bi_reply0);
+ tcp_reply0 = &r0->tcp;
+
+ tcp_sum0 = (tm->ip4.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK]
+ .tcp_checksum_net_byte_order);
+ ip_sum0 = (tm->ip4.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK]
+ .ip4_checksum_net_byte_order);
+
+ src0 = r0->ip4.src_address.as_u32 = ip40->dst_address.as_u32;
+ dst0 = r0->ip4.dst_address.as_u32 = ip40->src_address.as_u32;
+
+ ip_sum0 = ip_csum_add_even (ip_sum0, src0);
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, src0);
+
+ ip_sum0 = ip_csum_add_even (ip_sum0, dst0);
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, dst0);
+
+ r0->ip4.checksum = ip_csum_fold (ip_sum0);
+
+ ASSERT (r0->ip4.checksum == ip4_header_checksum (&r0->ip4));
+ }
+
+ tcp_reply0->header.ports.src = tcp0->ports.dst;
+ tcp_reply0->header.ports.dst = tcp0->ports.src;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->header.ports.src_and_dst);
+
+ tcp_reply0->header.seq_number = my_seq_net0;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, my_seq_net0);
+
+ his_seq_net0 = clib_host_to_net_u32 (his_seq_host0);
+ tcp_reply0->header.ack_number = his_seq_net0;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, his_seq_net0);
+
+ {
+	    ip_adjacency_t * adj0 = ip_get_adjacency (is_ip6 ? &ip6_main.lookup_main : &ip4_main.lookup_main,
+						       vnet_buffer (p0)->ip.adj_index[VLIB_RX]);
+ u16 my_mss =
+ (adj0->rewrite_header.max_l3_packet_bytes
+ - (is_ip6 ? sizeof (ip60[0]) : sizeof (ip40[0]))
+ - sizeof (tcp0[0]));
+
+ my_mss = clib_min (my_mss, min0->max_segment_size);
+ min0->max_segment_size = my_mss;
+
+ tcp_reply0->options.mss.value = clib_host_to_net_u16 (my_mss);
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->options.mss.value);
+ }
+
+ tcp_reply0->options.time_stamp.my_time_stamp = clib_host_to_net_u32 (timestamp_now);
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->options.time_stamp.my_time_stamp);
+
+ tcp_reply0->options.time_stamp.his_time_stamp = min0->time_stamps.his_net_byte_order;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->options.time_stamp.his_time_stamp);
+
+ tcp_reply0->header.checksum = ip_csum_fold (tcp_sum0);
+
+ vlib_buffer_copy_shared_fields (vm, p0, bi_reply0);
+
+ to_reply[0] = bi_reply0;
+ n_left_to_reply -= 1;
+ to_reply += 1;
+ }
+
+ vlib_put_next_frame (vm, node, TCP_LISTEN_NEXT_REPLY, n_left_to_reply);
+ vlib_put_next_frame (vm, node, TCP_LISTEN_NEXT_DROP, n_left_to_drop);
+ }
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ /* FIXME */ ;
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_tcp_listen (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_listen (vm, node, frame, /* is_ip6 */ 0); }
+
+static uword
+ip6_tcp_listen (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_listen (vm, node, frame, /* is_ip6 */ 1); }
+
+VLIB_REGISTER_NODE (ip4_tcp_listen_node,static) = {
+ .function = ip4_tcp_listen,
+ .name = "ip4-tcp-listen",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_LISTEN_N_NEXT,
+ .next_nodes = {
+ [TCP_LISTEN_NEXT_DROP] = "error-drop",
+ [TCP_LISTEN_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip4-input" : "ip4-lookup",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_tcp_listen_node,static) = {
+ .function = ip6_tcp_listen,
+ .name = "ip6-tcp-listen",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_LISTEN_N_NEXT,
+ .next_nodes = {
+ [TCP_LISTEN_NEXT_DROP] = "error-drop",
+ [TCP_LISTEN_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip6-input" : "ip6-lookup",
+ },
+};
+
+typedef enum {
+ TCP_CONNECT_NEXT_DROP,
+ TCP_CONNECT_NEXT_REPLY,
+ TCP_CONNECT_N_NEXT,
+} tcp_connect_next_t;
+
+static_always_inline uword
+ip46_tcp_connect (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ uword is_ip6)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next;
+ vlib_node_runtime_t * error_node;
+
+ /* FIXME */
+ clib_warning ("%p", tm46);
+
+ error_node = vlib_node_get_runtime
+ (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip60;
+ ip4_header_t * ip40;
+ tcp_header_t * tcp0;
+ u32 bi0;
+ u8 error0, next0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+
+ if (is_ip6)
+ {
+ ip60 = vlib_buffer_get_current (p0);
+ tcp0 = ip6_next_header (ip60);
+ }
+ else
+ {
+ ip40 = vlib_buffer_get_current (p0);
+ tcp0 = ip4_next_header (ip40);
+ }
+
+ ASSERT (0);
+
+ error0 = next0 = 0;
+ p0->error = error_node->errors[error0];
+
+ if (PREDICT_FALSE (next0 != next))
+ {
+ to_next -= 1;
+ n_left_to_next += 1;
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+
+ next = next0;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ /* FIXME */ ;
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_tcp_connect (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_connect (vm, node, frame, /* is_ip6 */ 0); }
+
+static uword
+ip6_tcp_connect (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_connect (vm, node, frame, /* is_ip6 */ 1); }
+
+VLIB_REGISTER_NODE (ip4_tcp_connect_node,static) = {
+ .function = ip4_tcp_connect,
+ .name = "ip4-tcp-connect",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_CONNECT_N_NEXT,
+ .next_nodes = {
+ [TCP_CONNECT_NEXT_DROP] = "error-drop",
+ [TCP_CONNECT_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip4-input" : "ip4-lookup",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_tcp_connect_node,static) = {
+ .function = ip6_tcp_connect,
+ .name = "ip6-tcp-connect",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_CONNECT_N_NEXT,
+ .next_nodes = {
+ [TCP_CONNECT_NEXT_DROP] = "error-drop",
+ [TCP_CONNECT_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip6-input" : "ip6-lookup",
+ },
+};
+
+typedef enum {
+ TCP_ESTABLISH_NEXT_DROP,
+ TCP_ESTABLISH_NEXT_ESTABLISHED,
+ TCP_ESTABLISH_N_NEXT,
+} tcp_establish_next_t;
+
+static_always_inline uword
+ip46_tcp_establish (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ uword is_ip6)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next, mini_long_long_ago, timestamp_now;
+ vlib_node_runtime_t * error_node;
+
+ error_node = vlib_node_get_runtime
+ (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next = node->cached_next_index;
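+  /* Time stamp used below to retire a promoted mini slot: offset by
+     half the 32-bit tick range so that, under the wraparound-safe
+     (now - time_stamp) age comparison, the slot looks maximally old
+     and becomes the first eviction candidate. */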
+ mini_long_long_ago =
+ (tcp_time_now (tm, TCP_TIMER_mini_connection)
+ + (1 << (BITS (mini_long_long_ago) - 1)));
+ timestamp_now = tcp_time_now (tm, TCP_TIMER_timestamp);
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip60;
+ ip4_header_t * ip40;
+ tcp_header_t * tcp0;
+ tcp_mini_connection_t * min0;
+ tcp_connection_t * est0;
+ tcp_listener_t * l0;
+ u32 bi0, imin0, iest0;
+ u8 error0, next0, i0, e0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+
+ imin0 = vnet_buffer (p0)->ip.tcp.mini_connection_index;
+ iest0 = vnet_buffer (p0)->ip.tcp.established_connection_index;
+
+ i0 = imin0 % 4;
+ e0 = iest0 % 4;
+
+ min0 = vec_elt_at_index (tm46->mini_connections, imin0);
+ if (PREDICT_FALSE (min0->state == TCP_CONNECTION_STATE_unused))
+ goto already_established0;
+ min0->state = TCP_CONNECTION_STATE_unused;
+
+ if (is_ip6)
+ {
+ ip60 = vlib_buffer_get_current (p0);
+ tcp0 = ip6_next_header (ip60);
+ }
+ else
+ {
+ ip40 = vlib_buffer_get_current (p0);
+ tcp0 = ip4_next_header (ip40);
+ }
+
+ if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->seq_number)
+ != min0->sequence_numbers.his))
+ goto unexpected_seq_number0;
+ if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->ack_number)
+ != min0->sequence_numbers.ours))
+ goto unexpected_ack_number0;
+
+ if (is_ip6)
+ {
+ ip6_tcp_udp_address_x4_and_timestamps_t * mina0;
+ ip6_tcp_udp_address_x4_t * esta0;
+
+ mina0 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin0 / 4);
+ esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest0 / 4);
+
+ ip6_tcp_udp_address_x4_copy_and_invalidate (esta0, &mina0->address_x4, e0, i0);
+
+ mina0->time_stamps[i0] = mini_long_long_ago;
+ }
+ else
+ {
+ ip4_tcp_udp_address_x4_and_timestamps_t * mina0;
+ ip4_tcp_udp_address_x4_t * esta0;
+
+ mina0 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin0 / 4);
+ esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest0 / 4);
+
+ ip4_tcp_udp_address_x4_copy_and_invalidate (esta0, &mina0->address_x4, e0, i0);
+
+ mina0->time_stamps[i0] = mini_long_long_ago;
+ }
+
+ est0 = vec_elt_at_index (tm46->established_connections, iest0);
+
+ est0->sequence_numbers = min0->sequence_numbers;
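+	  /* Segments we send always carry the fixed option block
+	     (nops + time stamps), so the per-segment payload budget is
+	     the negotiated MSS minus those option bytes. */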
+ est0->max_segment_size = (min0->max_segment_size
+ - STRUCT_SIZE_OF (tcp_ack_packet_t, options));
+ est0->his_window_scale = min0->window_scale;
+ est0->his_window = clib_net_to_host_u16 (tcp0->window);
+ est0->time_stamps.ours_host_byte_order = min0->time_stamps.ours_host_byte_order;
+
+ /* Compute first measurement of round trip time. */
+ {
+ u32 t = tcp_options_decode_for_ack (tm, tcp0, &est0->time_stamps.his_net_byte_order);
+ f64 dt = (timestamp_now - t) * tm->secs_per_tick[TCP_TIMER_timestamp];
+ est0->round_trip_time_stats.sum = dt;
+ est0->round_trip_time_stats.sum2 = dt*dt;
+ est0->round_trip_time_stats.count = 1;
+
+ {
+ ELOG_TYPE_DECLARE (e) = {
+ .format = "establish ack rtt: %.4e",
+ .format_args = "f8",
+ };
+ struct { f64 dt; } * ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->dt = dt;
+ }
+ }
+
+ est0->my_window_scale = 7;
+ est0->my_window = 256;
+
+ l0 = pool_elt_at_index (tm->listener_pool, vnet_buffer (p0)->ip.tcp.listener_index);
+ vec_add1 (l0->event_connections[is_ip6], tcp_connection_handle_set (iest0, is_ip6));
+
+ next0 = TCP_ESTABLISH_NEXT_DROP;
+ error0 = TCP_ERROR_LISTENS_ESTABLISHED;
+
+ enqueue0:
+ p0->error = error_node->errors[error0];
+ if (PREDICT_FALSE (next0 != next))
+ {
+ to_next -= 1;
+ n_left_to_next += 1;
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+
+ next = next0;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ continue;
+
+ already_established0:
+ next0 = TCP_ESTABLISH_NEXT_ESTABLISHED;
+ error0 = TCP_ERROR_NONE;
+ goto enqueue0;
+
+ unexpected_seq_number0:
+ next0 = TCP_ESTABLISH_NEXT_DROP;
+ error0 = TCP_ERROR_UNEXPECTED_SEQ_NUMBER;
+ goto enqueue0;
+
+ unexpected_ack_number0:
+ next0 = TCP_ESTABLISH_NEXT_DROP;
+ error0 = TCP_ERROR_UNEXPECTED_ACK_NUMBER;
+ goto enqueue0;
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ /* FIXME */ ;
+
+ /* Inform listeners of new connections. */
+ {
+ tcp_listener_t * l;
+ uword n;
+ pool_foreach (l, tm->listener_pool, ({
+ if ((n = vec_len (l->event_connections[is_ip6])) > 0)
+ {
+ if (l->event_function)
+ l->event_function (l->event_connections[is_ip6],
+ TCP_EVENT_connection_established);
+ if (tm->n_established_connections[is_ip6] == 0)
+ vlib_node_set_state (vm, tm46->output_node_index, VLIB_NODE_STATE_POLLING);
+ tm->n_established_connections[is_ip6] += n;
+ _vec_len (l->event_connections[is_ip6]) = 0;
+ }
+ }));
+ }
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_tcp_establish (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_establish (vm, node, frame, /* is_ip6 */ 0); }
+
+static uword
+ip6_tcp_establish (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_establish (vm, node, frame, /* is_ip6 */ 1); }
+
+VLIB_REGISTER_NODE (ip4_tcp_establish_node,static) = {
+ .function = ip4_tcp_establish,
+ .name = "ip4-tcp-establish",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_ESTABLISH_N_NEXT,
+ .next_nodes = {
+ [TCP_ESTABLISH_NEXT_DROP] = "error-drop",
+ [TCP_ESTABLISH_NEXT_ESTABLISHED] = "ip4-tcp-established",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_tcp_establish_node,static) = {
+ .function = ip6_tcp_establish,
+ .name = "ip6-tcp-establish",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_ESTABLISH_N_NEXT,
+ .next_nodes = {
+ [TCP_ESTABLISH_NEXT_DROP] = "error-drop",
+ [TCP_ESTABLISH_NEXT_ESTABLISHED] = "ip6-tcp-established",
+ },
+};
+
+static_always_inline void
+tcp_free_connection_x1 (vlib_main_t * vm, tcp_main_t * tm,
+ tcp_ip_4_or_6_t is_ip6,
+ u32 iest0)
+{
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ tcp_connection_t * est0;
+ u32 iest_div0, iest_mod0;
+
+ iest_div0 = iest0 / 4;
+ iest_mod0 = iest0 % 4;
+
+ if (is_ip6)
+ {
+ ip6_tcp_udp_address_x4_t * esta0;
+ esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest_div0);
+ ip6_tcp_udp_address_x4_invalidate (esta0, iest_mod0);
+ }
+ else
+ {
+ ip4_tcp_udp_address_x4_t * esta0;
+ esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest_div0);
+ ip4_tcp_udp_address_x4_invalidate (esta0, iest_mod0);
+ }
+
+  est0 = vec_elt_at_index (tm46->established_connections, iest0);
+  (void) est0;	/* connection state itself is left in place; only the
+		   address-hash slot above is reclaimed */
+}
+
+static_always_inline void
+tcp_free_connection_x2 (vlib_main_t * vm, tcp_main_t * tm,
+ tcp_ip_4_or_6_t is_ip6,
+ u32 iest0, u32 iest1)
+{
+ tcp_free_connection_x1 (vm, tm, is_ip6, iest0);
+ tcp_free_connection_x1 (vm, tm, is_ip6, iest1);
+}
+
+static_always_inline uword
+ip46_tcp_output (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ tcp_ip_4_or_6_t is_ip6)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ u32 * cis, * to_next, n_left_to_next, n_connections_left;
+ u32 timestamp_now_host_byte_order, timestamp_now_net_byte_order;
+ vlib_node_runtime_t * error_node;
+ const u32 next = 0;
+ uword n_acks;
+
+ /* Inform listeners of new connections. */
+ {
+ tcp_listener_t * l;
+ pool_foreach (l, tm->listener_pool, ({
+      if (vec_len (l->eof_connections[is_ip6]) > 0)
+ {
+ if (l->event_function)
+ l->event_function (l->eof_connections[is_ip6], TCP_EVENT_fin_received);
+ else
+ {
+ uword i;
+ for (i = 0; i < vec_len (l->eof_connections[is_ip6]); i++)
+ {
+ tcp_connection_t * c = tcp_get_connection (l->eof_connections[is_ip6][i]);
+ c->flags |= TCP_CONNECTION_FLAG_application_requested_close;
+ }
+ }
+ _vec_len (l->eof_connections[is_ip6]) = 0;
+ }
+
+ if (vec_len (l->close_connections[is_ip6]) > 0)
+ {
+ uword n_left;
+ u32 * cis;
+
+ if (l->event_function)
+ l->event_function (l->close_connections[is_ip6], TCP_EVENT_connection_closed);
+
+ cis = l->close_connections[is_ip6];
+ n_left = vec_len (cis);
+ ASSERT (tm->n_established_connections[is_ip6] >= n_left);
+ tm->n_established_connections[is_ip6] -= n_left;
+ if (tm->n_established_connections[is_ip6] == 0)
+ vlib_node_set_state (vm, tm46->output_node_index, VLIB_NODE_STATE_DISABLED);
+ while (n_left >= 2)
+ {
+ tcp_free_connection_x2 (vm, tm, is_ip6, cis[0], cis[1]);
+ n_left -= 2;
+ cis += 2;
+ }
+
+ while (n_left > 0)
+ {
+ tcp_free_connection_x1 (vm, tm, is_ip6, cis[0]);
+ n_left -= 1;
+ cis += 1;
+ }
+
+ _vec_len (l->close_connections[is_ip6]) = 0;
+ }
+ }));
+ }
+
+ n_acks = 0;
+ cis = tm46->connections_pending_acks;
+ n_connections_left = vec_len (cis);
+ if (n_connections_left == 0)
+ return n_acks;
+ _vec_len (tm46->connections_pending_acks) = 0;
+ error_node = vlib_node_get_runtime
+ (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index);
+
+ timestamp_now_host_byte_order = tcp_time_now (tm, TCP_TIMER_timestamp);
+ timestamp_now_net_byte_order = clib_host_to_net_u32 (timestamp_now_host_byte_order);
+
+ while (n_connections_left > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_connections_left > 0 && n_left_to_next > 0)
+ {
+ tcp_connection_t * est0;
+ tcp_ack_packet_t * tcp0;
+ tcp_udp_ports_t * ports0;
+ ip_csum_t tcp_sum0;
+ tcp_packet_template_type_t template_type0;
+ u32 bi0, iest0, iest_div0, iest_mod0, my_seq_net0, his_seq_net0;
+ u8 is_fin0;
+
+ iest0 = cis[0];
+ cis += 1;
+ iest_div0 = iest0 / 4;
+ iest_mod0 = iest0 % 4;
+ est0 = vec_elt_at_index (tm46->established_connections, iest0);
+
+ /* Send a FIN along with our ACK if application closed connection. */
+ {
+ u8 is_closed0, fin_sent0;
+
+ is_closed0 = (est0->flags & TCP_CONNECTION_FLAG_application_requested_close) != 0;
+ fin_sent0 = (est0->flags & TCP_CONNECTION_FLAG_fin_sent) != 0;
+
+ is_fin0 = is_closed0 && ! fin_sent0;
+ template_type0 =
+ (is_fin0
+ ? TCP_PACKET_TEMPLATE_FIN_ACK
+ : TCP_PACKET_TEMPLATE_ACK);
+ est0->flags |= is_closed0 << LOG2_TCP_CONNECTION_FLAG_fin_sent;
+ }
+
+ if (is_ip6)
+ {
+ ip6_tcp_ack_packet_t * r0;
+ ip6_tcp_udp_address_x4_t * esta0;
+ uword tmp0, i;
+
+ esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest_div0);
+ r0 = vlib_packet_template_get_packet
+ (vm, &tm->ip6.packet_templates[template_type0].vlib, &bi0);
+ tcp0 = &r0->tcp;
+
+ tcp_sum0 = (tm->ip6.packet_templates[template_type0]
+ .tcp_checksum_net_byte_order);
+
+ for (i = 0; i < ARRAY_LEN (r0->ip6.src_address.as_u32); i++)
+ {
+ tmp0 = r0->ip6.src_address.as_u32[i] = esta0->dst.as_u32[i][iest_mod0];
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0);
+
+ tmp0 = r0->ip6.dst_address.as_u32[i] = esta0->src.as_u32[i][iest_mod0];
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0);
+ }
+
+ ports0 = &esta0->ports.as_ports[iest_mod0];
+ }
+ else
+ {
+ ip4_tcp_ack_packet_t * r0;
+ ip4_tcp_udp_address_x4_t * esta0;
+ ip_csum_t ip_sum0;
+ u32 src0, dst0;
+
+ esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest_div0);
+ r0 = vlib_packet_template_get_packet
+ (vm, &tm->ip4.packet_templates[template_type0].vlib, &bi0);
+ tcp0 = &r0->tcp;
+
+ ip_sum0 = (tm->ip4.packet_templates[template_type0]
+ .ip4_checksum_net_byte_order);
+ tcp_sum0 = (tm->ip4.packet_templates[template_type0]
+ .tcp_checksum_net_byte_order);
+
+ src0 = r0->ip4.src_address.as_u32 = esta0->dst.as_ip4_address[iest_mod0].as_u32;
+ dst0 = r0->ip4.dst_address.as_u32 = esta0->src.as_ip4_address[iest_mod0].as_u32;
+
+ ip_sum0 = ip_csum_add_even (ip_sum0, src0);
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, src0);
+
+ ip_sum0 = ip_csum_add_even (ip_sum0, dst0);
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, dst0);
+
+ r0->ip4.checksum = ip_csum_fold (ip_sum0);
+
+ ASSERT (r0->ip4.checksum == ip4_header_checksum (&r0->ip4));
+ ports0 = &esta0->ports.as_ports[iest_mod0];
+ }
+
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, ports0->as_u32);
+ tcp0->header.ports.src = ports0->dst;
+ tcp0->header.ports.dst = ports0->src;
+
+ my_seq_net0 = clib_host_to_net_u32 (est0->sequence_numbers.ours);
+ his_seq_net0 = clib_host_to_net_u32 (est0->sequence_numbers.his);
+
+ /* FIN accounts for 1 sequence number. */
+ est0->sequence_numbers.ours += is_fin0;
+
+ tcp0->header.seq_number = my_seq_net0;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, my_seq_net0);
+
+ tcp0->header.ack_number = his_seq_net0;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, his_seq_net0);
+
+ est0->time_stamps.ours_host_byte_order = timestamp_now_host_byte_order;
+ tcp0->options.time_stamp.my_time_stamp = timestamp_now_net_byte_order;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, timestamp_now_net_byte_order);
+
+ tcp0->options.time_stamp.his_time_stamp = est0->time_stamps.his_net_byte_order;
+ tcp_sum0 = ip_csum_add_even (tcp_sum0, est0->time_stamps.his_net_byte_order);
+
+ tcp0->header.checksum = ip_csum_fold (tcp_sum0);
+
+ est0->flags &= ~TCP_CONNECTION_FLAG_ack_pending;
+
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_connections_left -= 1;
+ n_acks += 1;
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ vlib_error_count (vm, error_node->node_index, TCP_ERROR_ACKS_SENT, n_acks);
+
+ return n_acks;
+}
+
+static uword
+ip4_tcp_output (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_output (vm, node, frame, /* is_ip6 */ 0); }
+
+static uword
+ip6_tcp_output (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_output (vm, node, frame, /* is_ip6 */ 1); }
+
+VLIB_REGISTER_NODE (ip4_tcp_output_node,static) = {
+ .function = ip4_tcp_output,
+ .name = "ip4-tcp-output",
+ .state = VLIB_NODE_STATE_DISABLED,
+ .type = VLIB_NODE_TYPE_INPUT,
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = CLIB_DEBUG > 0 ? "ip4-input" : "ip4-lookup",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_tcp_output_node,static) = {
+ .function = ip6_tcp_output,
+ .name = "ip6-tcp-output",
+ .state = VLIB_NODE_STATE_DISABLED,
+ .type = VLIB_NODE_TYPE_INPUT,
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = CLIB_DEBUG > 0 ? "ip6-input" : "ip6-lookup",
+ },
+};
+
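+/* Placeholder: there is no transmit path yet, so a peer can never ACK
+   outstanding data (n_bytes must be 0). */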
+static_always_inline void
+tcp_ack (tcp_main_t * tm, tcp_connection_t * c, u32 n_bytes)
+{
+ ASSERT (n_bytes == 0);
+}
+
+typedef enum {
+ TCP_ESTABLISHED_NEXT_DROP,
+ TCP_ESTABLISHED_N_NEXT,
+} tcp_established_next_t;
+
+static_always_inline uword
+ip46_tcp_established (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ tcp_ip_4_or_6_t is_ip6)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ uword n_packets = frame->n_vectors;
+ u32 * from, * to_next;
+ u32 n_left_from, n_left_to_next, next, timestamp_now;
+ vlib_node_runtime_t * error_node;
+
+ error_node = vlib_node_get_runtime
+ (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index);
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = n_packets;
+ next = node->cached_next_index;
+ timestamp_now = tcp_time_now (tm, TCP_TIMER_timestamp);
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * p0;
+ ip6_header_t * ip60;
+ ip4_header_t * ip40;
+ tcp_header_t * tcp0;
+ tcp_connection_t * est0;
+ tcp_listener_t * l0;
+ u32 bi0, iest0, n_data_bytes0, his_ack_host0, n_ack0;
+ u8 error0, next0, n_advance_bytes0, is_fin0, send_ack0;
+
+ bi0 = to_next[0] = from[0];
+
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer (vm, bi0);
+
+ if (is_ip6)
+ {
+ ip60 = vlib_buffer_get_current (p0);
+ tcp0 = ip6_next_header (ip60);
+ ASSERT (ip60->protocol == IP_PROTOCOL_TCP);
+ n_advance_bytes0 = tcp_header_bytes (tcp0);
+ n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - n_advance_bytes0;
+ n_advance_bytes0 += sizeof (ip60[0]);
+ }
+ else
+ {
+ ip40 = vlib_buffer_get_current (p0);
+ tcp0 = ip4_next_header (ip40);
+ n_advance_bytes0 = (ip4_header_bytes (ip40)
+ + tcp_header_bytes (tcp0));
+ n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - n_advance_bytes0;
+ }
+
+ iest0 = vnet_buffer (p0)->ip.tcp.established_connection_index;
+ est0 = vec_elt_at_index (tm46->established_connections, iest0);
+
+ error0 = TCP_ERROR_NO_DATA;
+ next0 = TCP_ESTABLISHED_NEXT_DROP;
+
+ if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->seq_number)
+ != est0->sequence_numbers.his))
+ goto unexpected_seq_number0;
+ if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->ack_number) - est0->sequence_numbers.ours
+ > est0->n_tx_unacked_bytes))
+ goto unexpected_ack_number0;
+
+ is_fin0 = (tcp0->flags & TCP_FLAG_FIN) != 0;
+
+ if (PREDICT_FALSE ((est0->flags & TCP_CONNECTION_FLAG_fin_received)
+ && (is_fin0 || n_data_bytes0 > 0)))
+ goto already_received_fin0;
+
+ /* Update window. */
+ est0->his_window = clib_net_to_host_u16 (tcp0->window);
+
+ /* Update his sequence number to account for data he's just sent. */
+ est0->sequence_numbers.his += n_data_bytes0 + is_fin0;
+
+ his_ack_host0 = clib_net_to_host_u32 (tcp0->ack_number);
+ n_ack0 = his_ack_host0 - est0->sequence_numbers.ours;
+ tcp_ack (tm, est0, n_ack0);
+ est0->sequence_numbers.ours = his_ack_host0;
+
+ {
+ u32 t = tcp_options_decode_for_ack (tm, tcp0, &est0->time_stamps.his_net_byte_order);
+ if (t != est0->time_stamps.ours_host_byte_order)
+ {
+ f64 dt = (timestamp_now - t) * tm->secs_per_tick[TCP_TIMER_timestamp];
+ est0->round_trip_time_stats.sum += dt;
+ est0->round_trip_time_stats.sum2 += dt*dt;
+ est0->round_trip_time_stats.count += 1;
+ est0->time_stamps.ours_host_byte_order = t;
+
+ {
+ ELOG_TYPE_DECLARE (e) = {
+ .format = "ack rtt: %.4e",
+ .format_args = "f8",
+ };
+ struct { f64 dt; } * ed;
+ ed = ELOG_DATA (&vm->elog_main, e);
+ ed->dt = dt;
+ }
+ }
+ }
+
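+	  /* Branchless enqueue: unconditionally append, then shrink the
+	     vector length again when no ACK is actually pending.  The same
+	     trick is used for the listener event vectors below. */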
+ send_ack0 = ((est0->flags & TCP_CONNECTION_FLAG_ack_pending) == 0
+ && (n_data_bytes0 > 0 || is_fin0));
+ vec_add1 (tm46->connections_pending_acks, vnet_buffer (p0)->ip.tcp.established_connection_index);
+ _vec_len (tm46->connections_pending_acks) -= ! send_ack0;
+ est0->flags |= send_ack0 << LOG2_TCP_CONNECTION_FLAG_ack_pending;
+
+ est0->flags |= is_fin0 << LOG2_TCP_CONNECTION_FLAG_fin_received;
+
+ l0 = pool_elt_at_index (tm->listener_pool, vnet_buffer (p0)->ip.tcp.listener_index);
+
+ {
+ u32 ch0 = tcp_connection_handle_set (iest0, is_ip6);
+
+ vec_add1 (l0->eof_connections[is_ip6], ch0);
+ _vec_len (l0->eof_connections[is_ip6]) -= ! is_fin0;
+
+ vec_add1 (l0->close_connections[is_ip6], ch0);
+ _vec_len (l0->close_connections[is_ip6]) -= !(est0->flags & TCP_CONNECTION_FLAG_fin_sent);
+ }
+
+ next0 = n_data_bytes0 > 0 ? l0->next_index : next0;
+
+ vlib_buffer_advance (p0, n_advance_bytes0);
+
+ enqueue0:
+ p0->error = error_node->errors[error0];
+ if (PREDICT_FALSE (next0 != next))
+ {
+ to_next -= 1;
+ n_left_to_next += 1;
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+
+ next = next0;
+ vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ continue;
+
+ unexpected_seq_number0:
+ next0 = TCP_ESTABLISHED_NEXT_DROP;
+ error0 = TCP_ERROR_UNEXPECTED_SEQ_NUMBER;
+ goto enqueue0;
+
+ unexpected_ack_number0:
+ next0 = TCP_ESTABLISHED_NEXT_DROP;
+ error0 = TCP_ERROR_UNEXPECTED_ACK_NUMBER;
+ goto enqueue0;
+
+ already_received_fin0:
+ next0 = TCP_ESTABLISHED_NEXT_DROP;
+ error0 = TCP_ERROR_SEGMENT_AFTER_FIN;
+ goto enqueue0;
+ }
+
+ vlib_put_next_frame (vm, node, next, n_left_to_next);
+ }
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ /* FIXME */ ;
+
+ return frame->n_vectors;
+}
+
+static uword
+ip4_tcp_established (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_established (vm, node, frame, /* is_ip6 */ 0); }
+
+static uword
+ip6_tcp_established (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{ return ip46_tcp_established (vm, node, frame, /* is_ip6 */ 1); }
+
+VLIB_REGISTER_NODE (ip4_tcp_established_node,static) = {
+ .function = ip4_tcp_established,
+ .name = "ip4-tcp-established",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
+ .next_nodes = {
+ [TCP_ESTABLISHED_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE (ip6_tcp_established_node,static) = {
+ .function = ip6_tcp_established,
+ .name = "ip6-tcp-established",
+
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
+ .next_nodes = {
+ [TCP_ESTABLISHED_NEXT_DROP] = "error-drop",
+ },
+};
+
+uword
+tcp_register_listener (vlib_main_t * vm,
+ tcp_listener_registration_t * r)
+{
+ tcp_main_t * tm = &tcp_main;
+ tcp_listener_t * l;
+
+ {
+ clib_error_t * error;
+
+ if ((error = vlib_call_init_function (vm, tcp_udp_lookup_init)))
+ clib_error_report (error);
+ }
+
+ pool_get_aligned (tm->listener_pool, l, CLIB_CACHE_LINE_BYTES);
+
+ memset (l, 0, sizeof (l[0]));
+
+ l->dst_port = r->port;
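+  /* Note: the data next arc is added on the ip4 established node only; the
+     ip6 established node is assumed to hand out matching next indices. */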
+ l->next_index = vlib_node_add_next (vm, ip4_tcp_established_node.index, r->data_node_index);
+ l->valid_local_adjacency_bitmap = 0;
+ l->flags = r->flags & (TCP_LISTENER_IP4 | TCP_LISTENER_IP6);
+
+ tm->listener_index_by_dst_port[clib_host_to_net_u16 (l->dst_port)] = l - tm->listener_pool;
+
+ return l - tm->listener_pool;
+}
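+
+/* Usage sketch (hypothetical; the "my_app" names are illustrative, not part
+   of this API):
+
+     static void
+     my_app_tcp_event (u32 * connections, tcp_event_type_t event_type)
+     { ... }
+
+     tcp_listener_registration_t r = {
+       .port = 80,
+       .flags = TCP_LISTENER_IP4 | TCP_LISTENER_IP6,
+       .data_node_index = my_app_rx_node.index,
+       .event_function = my_app_tcp_event,
+     };
+     tcp_register_listener (vm, &r);
+*/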
+
+static void
+tcp_udp_lookup_ip4_add_del_interface_address (ip4_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip4_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_delete)
+{
+ tcp_main_t * tm = &tcp_main;
+
+ tm->ip4.default_valid_local_adjacency_bitmap
+ = clib_bitmap_set (tm->ip4.default_valid_local_adjacency_bitmap,
+ if_address_index,
+ is_delete ? 0 : 1);
+}
+
+static void
+tcp_udp_lookup_ip6_add_del_interface_address (ip6_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip6_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_delete)
+{
+ tcp_main_t * tm = &tcp_main;
+
+ tm->ip6.default_valid_local_adjacency_bitmap
+ = clib_bitmap_set (tm->ip6.default_valid_local_adjacency_bitmap,
+ if_address_index,
+ is_delete ? 0 : 1);
+}
+
+static clib_error_t *
+tcp_udp_lookup_init (vlib_main_t * vm)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip4_main_t * im4 = &ip4_main;
+ ip6_main_t * im6 = &ip6_main;
+ clib_error_t * error;
+
+ if ((error = vlib_call_init_function (vm, ip4_lookup_init)))
+ return error;
+ if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
+ return error;
+
+ tcp_time_init (vm, tm);
+
+ {
+ ip4_add_del_interface_address_callback_t cb;
+
+ cb.function = tcp_udp_lookup_ip4_add_del_interface_address;
+ cb.function_opaque = 0;
+ vec_add1 (im4->add_del_interface_address_callbacks, cb);
+ }
+
+ {
+ ip6_add_del_interface_address_callback_t cb;
+
+ cb.function = tcp_udp_lookup_ip6_add_del_interface_address;
+ cb.function_opaque = 0;
+ vec_add1 (im6->add_del_interface_address_callbacks, cb);
+ }
+
+ tm->ip4.output_node_index = ip4_tcp_output_node.index;
+ tm->ip6.output_node_index = ip6_tcp_output_node.index;
+
+ tcp_lookup_init (vm, tm);
+ tcp_options_decode_init (tm);
+
+ tm->tx_buffer_free_list = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+ tm->tx_buffer_free_list_n_buffer_bytes = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (tcp_udp_lookup_init);
+
+static u8 * format_tcp_time_stamp (u8 * s, va_list * va)
+{
+ tcp_timer_type_t type = va_arg (*va, tcp_timer_type_t);
+ u32 value = va_arg (*va, u32);
+ vlib_main_t * vm = vlib_get_main();
+ tcp_main_t * tm = &tcp_main;
+ u64 now;
+ f64 dt;
+
+ now = clib_cpu_time_now ();
+  dt = vm->clib_time.seconds_per_clock * (now - ((u64) value << tm->log2_clocks_per_tick[type]));
+ return format (s, "%.4e sec", dt);
+}
+
+static u8 * format_tcp_connection_state (u8 * s, va_list * va)
+{
+ tcp_connection_state_t st = va_arg (*va, tcp_connection_state_t);
+ char * t = 0;
+ switch (st)
+ {
+#define _(f) case TCP_CONNECTION_STATE_##f: t = #f; break;
+ foreach_tcp_connection_state
+#undef _
+ default: break;
+ }
+ if (t)
+ s = format (s, "%s", t);
+ else
+ s = format (s, "unknown 0x%x", st);
+
+ return s;
+}
+
+static u8 * format_tcp_ip_4_or_6 (u8 * s, va_list * va)
+{
+ tcp_ip_4_or_6_t is_ip6 = va_arg (*va, tcp_ip_4_or_6_t);
+ return format (s, "%s", is_ip6 ? "ip6" : "ip4");
+}
+
+static u8 * format_tcp_mini_connection (u8 * s, va_list * va)
+{
+ tcp_mini_connection_t * c = va_arg (*va, tcp_mini_connection_t *);
+
+ s = format (s, "state %U, window scale %d, mss %d",
+ format_tcp_connection_state, c->state,
+ c->window_scale, c->max_segment_size);
+
+ return s;
+}
+
+static u8 * format_ip4_tcp_mini_connection (u8 * s, va_list * va)
+{
+ u32 imin = va_arg (*va, u32);
+ u32 imin_div, imin_mod;
+ tcp_main_t * tm = &tcp_main;
+ tcp_mini_connection_t * min;
+ ip4_tcp_udp_address_x4_and_timestamps_t * mina;
+
+ imin_div = imin / 4;
+ imin_mod = imin % 4;
+
+ mina = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin_div);
+
+ s = format (s, "%U, age %U",
+ format_ip4_tcp_udp_address_x4, &mina->address_x4, imin_div,
+ format_tcp_time_stamp, TCP_TIMER_mini_connection, mina->time_stamps[imin_div]);
+
+ min = vec_elt_at_index (tm->ip4.mini_connections, imin);
+
+ s = format (s, "%U", format_tcp_mini_connection, min);
+
+ return s;
+}
+
+static u8 * format_ip6_tcp_mini_connection (u8 * s, va_list * va)
+{
+ u32 imin = va_arg (*va, u32);
+ u32 imin_div, imin_mod;
+ tcp_main_t * tm = &tcp_main;
+ tcp_mini_connection_t * min;
+ ip6_tcp_udp_address_x4_and_timestamps_t * mina;
+
+ imin_div = imin / 4;
+ imin_mod = imin % 4;
+
+ mina = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin_div);
+
+ s = format (s, "%U, age %U",
+ format_ip6_tcp_udp_address_x4, &mina->address_x4, imin_div,
+ format_tcp_time_stamp, TCP_TIMER_mini_connection, mina->time_stamps[imin_div]);
+
+ min = vec_elt_at_index (tm->ip6.mini_connections, imin);
+
+ s = format (s, "%U", format_tcp_mini_connection, min);
+
+ return s;
+}
+
+static u8 * format_tcp_established_connection (u8 * s, va_list * va)
+{
+ tcp_connection_t * c = va_arg (*va, tcp_connection_t *);
+
+ if (c->flags != 0)
+ {
+ s = format (s, ", flags: ");
+#define _(f) if (c->flags & TCP_CONNECTION_FLAG_##f) s = format (s, "%s, ", #f);
+ foreach_tcp_connection_flag;
+#undef _
+ }
+
+ if (tcp_round_trip_time_stats_is_valid (&c->round_trip_time_stats))
+ {
+ f64 r[2];
+ tcp_round_trip_time_stats_compute (&c->round_trip_time_stats, r);
+ s = format (s, ", rtt %.4e +- %.4e",
+ r[0], r[1]);
+ }
+
+ return s;
+}
+
+static u8 * format_ip4_tcp_established_connection (u8 * s, va_list * va)
+{
+ u32 iest = va_arg (*va, u32);
+ u32 iest_div, iest_mod;
+ tcp_main_t * tm = &tcp_main;
+ tcp_connection_t * est;
+ ip4_tcp_udp_address_x4_t * esta;
+
+ iest_div = iest / 4;
+ iest_mod = iest % 4;
+
+ esta = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest_div);
+ est = vec_elt_at_index (tm->ip4.established_connections, iest);
+
+ s = format (s, "%U%U",
+ format_ip4_tcp_udp_address_x4, esta, iest_mod,
+ format_tcp_established_connection, est);
+
+ return s;
+}
+
+static u8 * format_ip6_tcp_established_connection (u8 * s, va_list * va)
+{
+ u32 iest = va_arg (*va, u32);
+ u32 iest_div, iest_mod;
+ tcp_main_t * tm = &tcp_main;
+ tcp_connection_t * est;
+ ip6_tcp_udp_address_x4_t * esta;
+
+ iest_div = iest / 4;
+ iest_mod = iest % 4;
+
+ esta = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest_div);
+ est = vec_elt_at_index (tm->ip6.established_connections, iest);
+
+ s = format (s, "%U%U",
+ format_ip6_tcp_udp_address_x4, esta, iest_mod,
+ format_tcp_established_connection, est);
+
+ return s;
+}
+
+VLIB_CLI_COMMAND (vlib_cli_show_tcp_command, static) = {
+ .path = "show tcp",
+ .short_help = "Transmission control protocol (TCP) show commands",
+};
+
+static clib_error_t *
+show_mini_connections (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46;
+ tcp_ip_4_or_6_t is_ip6 = TCP_IP4;
+ tcp_mini_connection_t * min;
+ ip6_tcp_udp_address_x4_and_timestamps_t * mina6;
+ ip4_tcp_udp_address_x4_and_timestamps_t * mina4;
+ clib_error_t * error = 0;
+ uword i, i0, i1, n_valid;
+
+ if (unformat (input, "4"))
+ is_ip6 = TCP_IP4;
+ if (unformat (input, "6"))
+ is_ip6 = TCP_IP6;
+
+ n_valid = 0;
+ tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ for (i = 0; i <= tm46->mini_connection_hash_mask; i++)
+ {
+ i0 = i / 4;
+ i1 = i % 4;
+
+ min = vec_elt_at_index (tm46->mini_connections, i);
+ if (is_ip6)
+ {
+ mina6 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, i0);
+ if (ip6_tcp_udp_address_x4_is_valid (&mina6->address_x4, i1))
+ {
+ vlib_cli_output (vm, "%U", format_ip4_tcp_mini_connection, i);
+ n_valid += 1;
+ }
+ }
+ else
+ {
+ mina4 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, i0);
+ if (ip4_tcp_udp_address_x4_is_valid (&mina4->address_x4, i1))
+ {
+ vlib_cli_output (vm, "%U", format_ip6_tcp_mini_connection, i);
+ n_valid += 1;
+ }
+ }
+ }
+
+ if (n_valid == 0)
+ vlib_cli_output (vm, "no %U mini tcp connections", format_tcp_ip_4_or_6, is_ip6);
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (vlib_cli_show_tcp_mini_connections_command, static) = {
+ .path = "show tcp mini-connections",
+ .short_help = "Show not-yet established TCP connections",
+ .function = show_mini_connections,
+};
+
+static clib_error_t *
+show_established_connections (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46;
+ tcp_ip_4_or_6_t is_ip6 = TCP_IP4;
+ tcp_connection_t * est;
+ ip6_tcp_udp_address_x4_t * esta6;
+ ip4_tcp_udp_address_x4_t * esta4;
+ clib_error_t * error = 0;
+ uword i, i0, i1, n_valid;
+
+ if (unformat (input, "4"))
+ is_ip6 = TCP_IP4;
+ if (unformat (input, "6"))
+ is_ip6 = TCP_IP6;
+
+ n_valid = 0;
+ tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ for (i = 0; i < vec_len (tm46->established_connections); i++)
+ {
+ i0 = i / 4;
+ i1 = i % 4;
+
+ est = vec_elt_at_index (tm46->established_connections, i);
+ if (is_ip6)
+ {
+ esta6 = vec_elt_at_index (tm->ip6_established_connection_address_hash, i0);
+ if (ip6_tcp_udp_address_x4_is_valid (esta6, i1))
+ {
+ vlib_cli_output (vm, "%U", format_ip6_tcp_established_connection, i);
+ n_valid += 1;
+ }
+ }
+ else
+ {
+ esta4 = vec_elt_at_index (tm->ip4_established_connection_address_hash, i0);
+ if (ip4_tcp_udp_address_x4_is_valid (esta4, i1))
+ {
+ vlib_cli_output (vm, "%U", format_ip4_tcp_established_connection, i);
+ n_valid += 1;
+ }
+ }
+ }
+
+ if (n_valid == 0)
+ vlib_cli_output (vm, "no %U established tcp connections", format_tcp_ip_4_or_6, is_ip6);
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (vlib_cli_show_tcp_established_connections_command, static) = {
+ .path = "show tcp connections",
+ .short_help = "Show established TCP connections",
+ .function = show_established_connections,
+};
+
+#if 0
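+/* Unfinished transmit-path sketch (disabled): chains buffers and folds
+   application data into an incremental checksum as it is copied; some
+   struct members referenced here do not exist yet. */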
+uword
+tcp_write (vlib_main_t * vm, u32 connection_handle, void * data, uword n_data_bytes)
+{
+ tcp_main_t * tm = &tcp_main;
+ tcp_ip_4_or_6_t is_ip6 = tcp_connection_is_ip6 (connection_handle);
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ tcp_connection_t * c = vec_elt_at_index (tm46->established_connections, connection_handle / 2);
+ vlib_buffer_t * b;
+  u32 bi, bi_next, bi_start_of_packet;
+  uword n_bytes_left_tail, n_bytes_left_packet, n_bytes_this_packet, n_data_left;
+  ip_csum_t sum;
+
+ b = 0;
+ bi = c->write_tail_buffer_index;
+ n_bytes_left_tail = 0;
+ if (bi != 0)
+ {
+ b = vlib_get_buffer (vm, bi);
+ n_bytes_left_tail = tm->tx_buffer_free_list_n_buffer_bytes - b->current_length;
+ }
+
+ n_bytes_this_packet = c->write_tail_packet.n_data_bytes;
+ n_bytes_left_packet = c->max_segment_size - n_bytes_this_packet;
+
+ n_data_left = n_data_bytes;
+ sum = c->write_tail_packet.data_ip_checksum;
+
+ while (n_data_left > 0)
+ {
+ u32 n_copy;
+
+ if (n_bytes_left_tail == 0)
+ {
+ if (! vlib_buffer_alloc_from_free_list (vm, &bi_next, 1,
+ tm->tx_buffer_free_list))
+ return n_data_bytes - n_data_left;
+
+ bi_start_of_packet = bi_next;
+ if (b)
+ {
+ b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b->next_buffer = bi_next;
+ bi_start_of_packet = b->opaque[0];
+ }
+ bi = bi_next;
+ b = vlib_get_buffer (vm, bi);
+
+ /* Save away start of packet buffer in opaque. */
+ b->opaque[0] = bi_start_of_packet;
+
+ c->tail_buffer.buffer_index = bi;
+ n_bytes_left_tail = tm->tx_buffer_free_list_n_buffer_bytes;
+ }
+
+ n_copy = n_data_left;
+ n_copy = clib_min (n_copy, n_bytes_left_tail);
+ n_copy = clib_min (n_copy, n_bytes_left_packet);
+
+ sum = ip_csum_and_memcpy (sum, b->data + b->current_length,
+ data, n_copy);
+
+ b->current_length += n_copy;
+ n_bytes_left_tail -= n_copy;
+ n_bytes_left_packet -= n_copy;
+      n_data_left -= n_copy;
+ n_bytes_this_packet += n_copy;
+
+ if (n_bytes_left_packet == 0)
+ {
+ bi_start_of_packet = b->opaque[0];
+
+ if (c->tail_packet.buffer_index != 0)
+ {
+ vlib_buffer_t * p = vlib_get_buffer (vm, c->tail_packet.buffer_index);
+ tcp_buffer_t * next = vlib_get_buffer_opaque (p);
+ next[0] = c->;
+ }
+ c->tail_packet.buffer_index = bi_start_of_packet;
+ }
+ }
+
+ c->tail_buffer.buffer_index = bi;
+ c->tail_buffer.n_data_bytes = n_bytes_this_packet;
+ c->tail_buffer.data_ip_checksum = ip_csum_fold (sum);
+
+ return 0;
+}
+#endif
diff --git a/vnet/vnet/ip/tcp.h b/vnet/vnet/ip/tcp.h
new file mode 100644
index 00000000000..98d8e34f0d5
--- /dev/null
+++ b/vnet/vnet/ip/tcp.h
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp.h: tcp protocol
+ *
+ * Copyright (c) 2011 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_tcp_protocol_h
+#define included_tcp_protocol_h
+
+#include <vppinfra/vector.h>
+
+/* No support for e.g. Altivec. */
+#if defined (__SSE2__)
+#define TCP_HAVE_VEC128
+#endif
+
+typedef union {
+ struct {
+ u16 src, dst;
+ };
+ u32 as_u32;
+} tcp_udp_ports_t;
+
+typedef union {
+#ifdef TCP_HAVE_VEC128
+ u32x4 as_u32x4;
+#endif
+ tcp_udp_ports_t as_ports[4];
+} tcp_udp_ports_x4_t;
+
+typedef struct {
+ union {
+#ifdef TCP_HAVE_VEC128
+ u32x4 as_u32x4;
+#endif
+ ip4_address_t as_ip4_address[4];
+ } src, dst;
+ tcp_udp_ports_x4_t ports;
+} ip4_tcp_udp_address_x4_t;
+
+typedef struct {
+ union {
+#ifdef TCP_HAVE_VEC128
+ u32x4 as_u32x4[4];
+#endif
+ u32 as_u32[4][4];
+ } src, dst;
+ tcp_udp_ports_x4_t ports;
+} ip6_tcp_udp_address_x4_t;
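+
+/* Addresses are stored four connections per hash bucket in
+   structure-of-arrays form so one 128-bit vector compare can test all four
+   lanes at once.  Connection index i maps to bucket i/4, lane i%4 (the
+   iest_div/iest_mod arithmetic in tcp.c). */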
+
+typedef struct {
+ u32 his, ours;
+} tcp_sequence_pair_t;
+
+/* Time stamps saved from options. */
+typedef struct {
+ u32 ours_host_byte_order, his_net_byte_order;
+} tcp_time_stamp_pair_t;
+
+typedef struct {
+ ip4_tcp_udp_address_x4_t address_x4;
+ u32 time_stamps[4];
+} ip4_tcp_udp_address_x4_and_timestamps_t;
+
+typedef struct {
+ ip6_tcp_udp_address_x4_t address_x4;
+ u32 time_stamps[4];
+} ip6_tcp_udp_address_x4_and_timestamps_t;
+
+#define foreach_tcp_connection_state \
+ /* unused */ \
+ _ (unused) \
+ /* Sent SYN-ACK waiting for ACK if he ever feels like sending one. */ \
+ _ (listen_ack_wait) \
+ /* Sent SYN waiting for ACK or RST. */ \
+ _ (connecting) \
+ /* Pseudo-type for established connections. */ \
+ _ (established)
+
+typedef enum {
+#define _(f) TCP_CONNECTION_STATE_##f,
+ foreach_tcp_connection_state
+#undef _
+ TCP_N_CONNECTION_STATE,
+} tcp_connection_state_t;
+
+/* Kept small to fight off syn flood attacks. */
+typedef struct {
+ tcp_sequence_pair_t sequence_numbers;
+
+ tcp_time_stamp_pair_t time_stamps;
+
+ /* segment size and window scale (saved from options
+ or set to defaults). */
+ u16 max_segment_size;
+
+ u8 window_scale;
+
+ tcp_connection_state_t state : 8;
+} tcp_mini_connection_t;
+
+typedef struct {
+ /* Sum and sum^2 of measurements.
+ Used to compute average and RMS. */
+ f64 sum, sum2;
+
+ /* Number of measurements. */
+ f64 count;
+} tcp_round_trip_time_stats_t;
+
+typedef struct {
+ u32 first_buffer_index_this_packet;
+
+ u16 data_ip_checksum;
+
+ u16 n_data_bytes;
+} tcp_tx_packet_t;
+
+typedef struct {
+ tcp_sequence_pair_t sequence_numbers;
+
+ tcp_time_stamp_pair_t time_stamps;
+
+ tcp_tx_packet_t head_packet, tx_tail_packet, write_tail_packet;
+
+ u32 write_tail_buffer_index;
+
+ tcp_round_trip_time_stats_t round_trip_time_stats;
+
+ /* Number of un-acknowledged bytes we've sent. */
+ u32 n_tx_unacked_bytes;
+
+ /* segment size and window scale (saved from options
+ or set to defaults). */
+ u16 max_segment_size;
+
+ /* Window from latest received packet. */
+ u16 his_window;
+
+ u16 my_window;
+
+ u8 his_window_scale;
+
+ u8 my_window_scale;
+
+ /* ip4/ip6 tos/ttl to use for packets we send. */
+ u8 tos, ttl;
+
+ u16 flags;
+#define foreach_tcp_connection_flag \
+ _ (ack_pending) \
+ _ (fin_received) \
+ _ (fin_sent) \
+ _ (application_requested_close)
+
+ u8 listener_opaque[128
+ - 1 * sizeof (tcp_sequence_pair_t)
+ - 1 * sizeof (tcp_time_stamp_pair_t)
+ - 3 * sizeof (tcp_tx_packet_t)
+ - 1 * sizeof (tcp_round_trip_time_stats_t)
+ - 2 * sizeof (u32)
+ - 4 * sizeof (u16)
+ - 4 * sizeof (u8)];
+} tcp_connection_t;
+
+typedef enum {
+ TCP_IP4,
+ TCP_IP6,
+ TCP_N_IP46,
+} tcp_ip_4_or_6_t;
+
+typedef enum {
+#define _(f) LOG2_TCP_CONNECTION_FLAG_##f,
+ foreach_tcp_connection_flag
+#undef _
+ N_TCP_CONNECTION_FLAG,
+#define _(f) TCP_CONNECTION_FLAG_##f = 1 << LOG2_TCP_CONNECTION_FLAG_##f,
+ foreach_tcp_connection_flag
+#undef _
+} tcp_connection_flag_t;
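+
+/* Each flag exists both as a bit number (LOG2_TCP_CONNECTION_FLAG_f) and as
+   a mask (TCP_CONNECTION_FLAG_f), so flags can be set branchlessly, e.g.
+     c->flags |= is_fin << LOG2_TCP_CONNECTION_FLAG_fin_received;
+   and tested with
+     (c->flags & TCP_CONNECTION_FLAG_fin_received) != 0  */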
+
+typedef enum {
+ TCP_PACKET_TEMPLATE_SYN,
+ TCP_PACKET_TEMPLATE_SYN_ACK,
+ TCP_PACKET_TEMPLATE_ACK,
+ TCP_PACKET_TEMPLATE_FIN_ACK,
+ TCP_PACKET_TEMPLATE_RST_ACK,
+ TCP_N_PACKET_TEMPLATE,
+} tcp_packet_template_type_t;
+
+typedef struct {
+ vlib_packet_template_t vlib;
+
+ /* TCP checksum of template with zeros for all
+ variable fields. Network byte order. */
+ u16 tcp_checksum_net_byte_order;
+
+ /* IP4 checksum. */
+ u16 ip4_checksum_net_byte_order;
+} tcp_packet_template_t;
+
+typedef struct {
+ u8 log2_n_mini_connection_hash_elts;
+ u8 log2_n_established_connection_hash_elts;
+ u8 is_ip6;
+
+ u32 mini_connection_hash_mask;
+ u32 established_connection_hash_mask;
+
+ uword * established_connection_overflow_hash;
+
+ tcp_mini_connection_t * mini_connections;
+
+ tcp_connection_t * established_connections;
+
+ /* Vector of established connection indices which need ACKs sent. */
+ u32 * connections_pending_acks;
+
+  /* Default valid_local_adjacency_bitmap for listeners that want to listen
+     for a given port on all interfaces. */
+ uword * default_valid_local_adjacency_bitmap;
+
+ u32 output_node_index;
+
+ tcp_packet_template_t packet_templates[TCP_N_PACKET_TEMPLATE];
+} ip46_tcp_main_t;
+
+#define foreach_tcp_event \
+ /* Received a SYN-ACK after sending a SYN to connect. */ \
+ _ (connection_established) \
+ /* Received a reset (RST) after sending a SYN to connect. */ \
+ _ (connect_failed) \
+ /* Received a FIN from an established connection. */ \
+ _ (fin_received) \
+ _ (connection_closed) \
+ /* Received a reset RST from an established connection. */ \
+ _ (reset_received)
+
+typedef enum {
+#define _(f) TCP_EVENT_##f,
+ foreach_tcp_event
+#undef _
+} tcp_event_type_t;
+
+typedef void (tcp_event_function_t)
+ (u32 * connections,
+ tcp_event_type_t event_type);
+
+typedef struct {
+ /* Bitmap indicating which of local (interface) addresses
+ we should listen on for this destination port. */
+ uword * valid_local_adjacency_bitmap;
+
+ /* Destination tcp/udp port to listen for connections. */
+ u16 dst_port;
+
+ u16 next_index;
+
+ u32 flags;
+
+  /* Connection indices to which the event passed to event_function applies. */
+ u32 * event_connections[TCP_N_IP46];
+ u32 * eof_connections[TCP_N_IP46];
+ u32 * close_connections[TCP_N_IP46];
+
+ tcp_event_function_t * event_function;
+} tcp_listener_t;
+
+typedef struct {
+ u8 next, error;
+} tcp_lookup_disposition_t;
+
+#define foreach_tcp_timer \
+ /* Used to rank mini connections. */ \
+ _ (mini_connection, 10e-3) \
+ /* Used for timestamps. */ \
+ _ (timestamp, 1e-6)
+
+typedef enum {
+#define _(f,s) TCP_TIMER_##f,
+ foreach_tcp_timer
+#undef _
+ TCP_N_TIMER,
+} tcp_timer_type_t;
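+
+/* Ticks are powers of two of CPU clocks: log2_clocks_per_tick[t] is chosen
+   so that 2^log2_clocks_per_tick[t] clocks approximates the period above,
+   and secs_per_tick[t] records the exact resulting tick length. */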
+
+typedef struct {
+ ip46_tcp_main_t ip4, ip6;
+
+  /* Address hashes for mini connections: not yet established, but
+     soon-to-be-established connections. */
+ ip4_tcp_udp_address_x4_and_timestamps_t * ip4_mini_connection_address_hash;
+ ip6_tcp_udp_address_x4_and_timestamps_t * ip6_mini_connection_address_hash;
+
+ /* Vector of size log2_n_established_connection_hash_elts plus overflow. */
+ ip4_tcp_udp_address_x4_t * ip4_established_connection_address_hash;
+ ip6_tcp_udp_address_x4_t * ip6_established_connection_address_hash;
+
+ /* Jenkins hash seeds for established and mini hash tables. */
+ u32x4_union_t connection_hash_seeds[2][3];
+ u32x4_union_t connection_hash_masks[2];
+
+ /* Pool of listeners. */
+ tcp_listener_t * listener_pool;
+
+ /* Table mapping destination port to listener index. */
+ u16 * listener_index_by_dst_port;
+
+ tcp_lookup_disposition_t disposition_by_state_and_flags[TCP_N_CONNECTION_STATE][64];
+
+ u8 log2_clocks_per_tick[TCP_N_TIMER];
+
+ f64 secs_per_tick[TCP_N_TIMER];
+
+ /* Holds pointers to default and per-packet TCP options while
+ parsing a TCP packet's options. */
+ tcp_mini_connection_t option_decode_mini_connection_template;
+
+ /* Count of currently established connections. */
+ u32 n_established_connections[TCP_N_IP46];
+
+ u32 tx_buffer_free_list;
+ u32 tx_buffer_free_list_n_buffer_bytes;
+} tcp_main_t;
+
+/* Global TCP main structure. */
+tcp_main_t tcp_main;
+
+typedef struct {
+ /* Listen on this port. */
+ u16 port;
+
+#define TCP_LISTENER_IP4 (1 << 0)
+#define TCP_LISTENER_IP6 (1 << 1)
+ u16 flags;
+
+ /* Next node index for data packets. */
+ u32 data_node_index;
+
+ /* Event function: called on new connections, etc. */
+ tcp_event_function_t * event_function;
+} tcp_listener_registration_t;
+
+uword
+tcp_register_listener (vlib_main_t * vm, tcp_listener_registration_t * r);
+
+always_inline tcp_ip_4_or_6_t
+tcp_connection_is_ip6 (u32 h)
+{ return h & 1; }
+
+always_inline u32
+tcp_connection_handle_set (u32 iest, tcp_ip_4_or_6_t is_ip6)
+{ return is_ip6 + 2*iest; }
+
+always_inline tcp_connection_t *
+tcp_get_connection (u32 connection_handle)
+{
+ u32 iest = connection_handle / 2;
+ tcp_ip_4_or_6_t is_ip6 = tcp_connection_is_ip6 (connection_handle);
+ tcp_main_t * tm = &tcp_main;
+ ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4;
+ return vec_elt_at_index (tm46->established_connections, iest);
+}
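+
+/* Worked example: handle = 2*iest + is_ip6, so ip6 connection index 7
+   yields handle 15, and tcp_get_connection (15) recovers iest = 7,
+   is_ip6 = 1. */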
+
+#endif /* included_tcp_protocol_h */
diff --git a/vnet/vnet/ip/tcp_format.c b/vnet/vnet/ip/tcp_format.c
new file mode 100644
index 00000000000..afc3dd20c49
--- /dev/null
+++ b/vnet/vnet/ip/tcp_format.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp_format.c: tcp formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+static u8 * format_tcp_flags (u8 * s, va_list * args)
+{
+ int flags = va_arg (*args, int);
+
+#define _(f) if (flags & TCP_FLAG_##f) s = format (s, "%s, ", #f);
+ foreach_tcp_flag
+#undef _
+
+ return s;
+}
+
+/* Format TCP header. */
+u8 * format_tcp_header (u8 * s, va_list * args)
+{
+ tcp_header_t * tcp = va_arg (*args, tcp_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 header_bytes;
+ uword indent;
+
+ /* Nothing to do. */
+ if (max_header_bytes < sizeof (tcp[0]))
+ return format (s, "TCP header truncated");
+
+ indent = format_get_indent (s);
+ indent += 2;
+
+ s = format (s, "TCP: %d -> %d",
+ clib_net_to_host_u16 (tcp->ports.src),
+ clib_net_to_host_u16 (tcp->ports.dst));
+
+ s = format (s, "\n%Useq. tx 0x%08x rx 0x%08x",
+ format_white_space, indent,
+ clib_net_to_host_u32 (tcp->seq_number),
+ clib_net_to_host_u32 (tcp->ack_number));
+
+ s = format (s, "\n%Uflags %U, tcp header: %d bytes",
+ format_white_space, indent,
+ format_tcp_flags, tcp->flags,
+ (tcp->tcp_header_u32s_and_reserved >> 4) * sizeof (u32));
+
+ s = format (s, "\n%Uwindow %d, checksum 0x%04x",
+ format_white_space, indent,
+ clib_net_to_host_u16 (tcp->window),
+ clib_net_to_host_u16 (tcp->checksum));
+
+ header_bytes = tcp_header_bytes (tcp);
+
+ /* Format TCP options. */
+#if 0
+ {
+ u8 * o;
+ u8 * option_start = (void *) (tcp + 1);
+ u8 * option_end = (void *) tcp + header_bytes;
+
+ for (o = option_start; o < option_end; )
+ {
+ u32 length = o[1];
+ switch (o[0])
+ {
+ case TCP_OPTION_END:
+ length = 1;
+ o = option_end;
+ break;
+
+ case TCP_OPTION_NOP:
+ length = 1;
+ break;
+
+ }
+ }
+ }
+#endif
+
+ /* Recurse into next protocol layer. */
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ ip_main_t * im = &ip_main;
+ tcp_udp_port_info_t * pi;
+
+ pi = ip_get_tcp_udp_port_info (im, tcp->ports.dst);
+
+ if (pi && pi->format_header)
+ s = format (s, "\n%U%U",
+ format_white_space, indent - 2,
+ pi->format_header,
+ /* next protocol header */ (void*) tcp + header_bytes,
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
diff --git a/vnet/vnet/ip/tcp_init.c b/vnet/vnet/ip/tcp_init.c
new file mode 100644
index 00000000000..3e88d87e11e
--- /dev/null
+++ b/vnet/vnet/ip/tcp_init.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp_init.c: tcp initialization
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/tcp_packet.h>
+
+static clib_error_t *
+tcp_init (vlib_main_t * vm)
+{
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+ clib_error_t * error;
+
+ error = vlib_call_init_function (vm, ip_main_init);
+
+ if (! error)
+ {
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP);
+ pi->format_header = format_tcp_header;
+
+ pi->unformat_pg_edit = unformat_pg_tcp_header;
+ }
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (tcp_init);
diff --git a/vnet/vnet/ip/tcp_packet.h b/vnet/vnet/ip/tcp_packet.h
new file mode 100644
index 00000000000..ebb111572a0
--- /dev/null
+++ b/vnet/vnet/ip/tcp_packet.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp_packet.h: TCP packet format (see RFC 793)
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_tcp_packet_h
+#define included_tcp_packet_h
+
+/* TCP flags bit 0 first. */
+#define foreach_tcp_flag \
+ _ (FIN) \
+ _ (SYN) \
+ _ (RST) \
+ _ (PSH) \
+ _ (ACK) \
+ _ (URG) \
+ _ (ECE) \
+ _ (CWR)
+
+enum {
+#define _(f) TCP_FLAG_BIT_##f,
+ foreach_tcp_flag
+#undef _
+ TCP_N_FLAG_BITS,
+
+#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f,
+ foreach_tcp_flag
+#undef _
+};
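+
+/* For example, a SYN-ACK segment carries TCP_FLAG_SYN | TCP_FLAG_ACK,
+   i.e. (1 << 1) | (1 << 4) = 0x12. */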
+
+typedef struct {
+ /* Source and destination port. */
+ union {
+ struct {
+ u16 src, dst;
+ };
+ u32 src_and_dst;
+ } ports;
+
+ /* Sequence and acknowledgment number. */
+ u32 seq_number, ack_number;
+
+ /* Size of TCP header in 32-bit units plus 4 reserved bits. */
+ u8 tcp_header_u32s_and_reserved;
+
+  /* See foreach_tcp_flag for enumeration of tcp flags. */
+ u8 flags;
+
+ /* Current window advertised by sender.
+ This is the number of bytes sender is willing to receive
+ right now. */
+ u16 window;
+
+ /* Checksum of TCP pseudo header and data. */
+ u16 checksum;
+
+ u16 urgent_pointer;
+} tcp_header_t;
+
+always_inline int
+tcp_header_bytes (tcp_header_t * t)
+{ return (t->tcp_header_u32s_and_reserved >> 4) * sizeof (u32); }
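+
+/* The data-offset field counts 32-bit words: an option-less header has
+   data offset 5, so tcp_header_bytes returns 5 * 4 = 20. */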
+
+/* TCP options. */
+typedef enum tcp_option_type {
+ TCP_OPTION_END = 0,
+ TCP_OPTION_NOP = 1,
+ TCP_OPTION_MSS = 2,
+ TCP_OPTION_WINDOW_SCALE = 3,
+ TCP_OPTION_SACK_PERMITTED = 4,
+ TCP_OPTION_SACK_BLOCK = 5,
+ TCP_OPTION_TIME_STAMP = 8,
+} tcp_option_type_t;
+
+/* All except NOP and END have 1 byte length field. */
+typedef struct {
+ tcp_option_type_t type : 8;
+
+ /* Length of this option in bytes. */
+ u8 length;
+} tcp_option_with_length_t;
+
+#endif /* included_tcp_packet_h */
+
diff --git a/vnet/vnet/ip/tcp_pg.c b/vnet/vnet/ip/tcp_pg.c
new file mode 100644
index 00000000000..122592d1594
--- /dev/null
+++ b/vnet/vnet/ip/tcp_pg.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp_pg: TCP packet-generator interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/pg/pg.h>
+
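+/* Fix up the TCP checksum of generated packets: seed the sum with the ip4
+   pseudo header (addresses, protocol, tcp length), then add the TCP header
+   and payload. */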
+static void
+tcp_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 ip_offset, tcp_offset;
+
+ tcp_offset = g->start_byte_offset;
+ ip_offset = (g-1)->start_byte_offset;
+
+ while (n_packets >= 1)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ tcp_header_t * tcp0;
+ ip_csum_t sum0;
+ u32 tcp_len0;
+
+ p0 = vlib_get_buffer (vm, packets[0]);
+ n_packets -= 1;
+ packets += 1;
+
+ ASSERT (p0->current_data == 0);
+ ip0 = (void *) (p0->data + ip_offset);
+ tcp0 = (void *) (p0->data + tcp_offset);
+ tcp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]);
+
+ /* Initialize checksum with header. */
+ if (BITS (sum0) == 32)
+ {
+ sum0 = clib_mem_unaligned (&ip0->src_address, u32);
+ sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32));
+ }
+ else
+ sum0 = clib_mem_unaligned (&ip0->src_address, u64);
+
+ sum0 = ip_csum_with_carry
+ (sum0, clib_host_to_net_u32 (tcp_len0 + (ip0->protocol << 16)));
+
+ /* Invalidate possibly old checksum. */
+ tcp0->checksum = 0;
+
+ sum0 = ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0);
+
+ tcp0->checksum = ~ ip_csum_fold (sum0);
+ }
+}
+
+typedef struct {
+ struct { pg_edit_t src, dst; } ports;
+ pg_edit_t seq_number, ack_number;
+ pg_edit_t tcp_header_u32s;
+#define _(f) pg_edit_t f##_flag;
+ foreach_tcp_flag
+#undef _
+ pg_edit_t window;
+ pg_edit_t checksum;
+ pg_edit_t urgent_pointer;
+} pg_tcp_header_t;
+
+static inline void
+pg_tcp_header_init (pg_tcp_header_t * p)
+{
+  /* Initialize fields that are not bit fields in the TCP header. */
+#define _(f) pg_edit_init (&p->f, tcp_header_t, f);
+ _ (ports.src);
+ _ (ports.dst);
+ _ (seq_number);
+ _ (ack_number);
+ _ (window);
+ _ (checksum);
+ _ (urgent_pointer);
+#undef _
+
+ /* Initialize bit fields. */
+#define _(f) \
+ pg_edit_init_bitfield (&p->f##_flag, tcp_header_t, \
+ flags, \
+ TCP_FLAG_BIT_##f, 1);
+
+ foreach_tcp_flag
+#undef _
+
+ pg_edit_init_bitfield (&p->tcp_header_u32s, tcp_header_t,
+ tcp_header_u32s_and_reserved,
+ 4, 4);
+}
+
+uword
+unformat_pg_tcp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_tcp_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t),
+ &group_index);
+ pg_tcp_header_init (p);
+
+ /* Defaults. */
+ pg_edit_set_fixed (&p->seq_number, 0);
+ pg_edit_set_fixed (&p->ack_number, 0);
+
+ pg_edit_set_fixed (&p->tcp_header_u32s, sizeof (tcp_header_t) / sizeof (u32));
+
+ pg_edit_set_fixed (&p->window, 4096);
+ pg_edit_set_fixed (&p->urgent_pointer, 0);
+
+#define _(f) pg_edit_set_fixed (&p->f##_flag, 0);
+ foreach_tcp_flag
+#undef _
+
+ p->checksum.type = PG_EDIT_UNSPECIFIED;
+
+ if (! unformat (input, "TCP: %U -> %U",
+ unformat_pg_edit,
+ unformat_tcp_udp_port, &p->ports.src,
+ unformat_pg_edit,
+ unformat_tcp_udp_port, &p->ports.dst))
+ goto error;
+
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "window %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->window))
+ ;
+
+ else if (unformat (input, "checksum %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->checksum))
+ ;
+
+ /* Flags. */
+#define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1);
+ foreach_tcp_flag
+#undef _
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ {
+ ip_main_t * im = &ip_main;
+ u16 dst_port;
+ tcp_udp_port_info_t * pi;
+
+ pi = 0;
+ if (p->ports.dst.type == PG_EDIT_FIXED)
+ {
+ dst_port = pg_edit_get_value (&p->ports.dst, PG_EDIT_LO);
+ pi = ip_get_tcp_udp_port_info (im, dst_port);
+ }
+
+ if (pi && pi->unformat_pg_edit
+ && unformat_user (input, pi->unformat_pg_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ g->edit_function = tcp_pg_edit_function;
+ g->edit_function_opaque = 0;
+ }
+
+ return 1;
+ }
+
+ error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
+
diff --git a/vnet/vnet/ip/udp.h b/vnet/vnet/ip/udp.h
new file mode 100644
index 00000000000..65eef29cb10
--- /dev/null
+++ b/vnet/vnet/ip/udp.h
@@ -0,0 +1,113 @@
+/*
+ * ip/udp.h: udp protocol
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_udp_h
+#define included_udp_h
+
+#include <vnet/vnet.h>
+#include <vnet/ip/udp_packet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ip/format.h>
+
+typedef enum {
+#define udp_error(n,s) UDP_ERROR_##n,
+#include <vnet/ip/udp_error.def>
+#undef udp_error
+ UDP_N_ERROR,
+} udp_error_t;
+
+#define foreach_udp4_dst_port \
+_ (67, dhcp_to_server) \
+_ (68, dhcp_to_client) \
+_ (500, ikev2) \
+_ (4341, lisp_gpe) \
+_ (4739, ipfix) \
+_ (4789, vxlan) \
+_ (4790, vxlan_gpe) \
+_ (6633, vpath_3)
+
+
+#define foreach_udp6_dst_port \
+_ (547, dhcpv6_to_server) \
+_ (546, dhcpv6_to_client) \
+_ (6633, vpath6_3)
+
+typedef enum {
+#define _(n,f) UDP_DST_PORT_##f = n,
+ foreach_udp4_dst_port
+ foreach_udp6_dst_port
+#undef _
+} udp_dst_port_t;
+
+typedef enum {
+#define _(n,f) UDP6_DST_PORT_##f = n,
+ foreach_udp6_dst_port
+#undef _
+} udp6_dst_port_t;
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+  /* UDP destination port in host byte order. */
+ udp_dst_port_t dst_port;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} udp_dst_port_info_t;
+
+typedef enum {
+ UDP_IP6 = 0,
+ UDP_IP4, /* the code is full of is_ip4... */
+ N_UDP_AF,
+} udp_af_t;
+
+typedef struct {
+ udp_dst_port_info_t * dst_port_infos [N_UDP_AF];
+
+ /* Hash tables mapping name/protocol to protocol info index. */
+ uword * dst_port_info_by_name[N_UDP_AF];
+ uword * dst_port_info_by_dst_port[N_UDP_AF];
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+} udp_main_t;
+
+always_inline udp_dst_port_info_t *
+udp_get_dst_port_info (udp_main_t * um, udp_dst_port_t dst_port, u8 is_ip4)
+{
+ uword * p = hash_get (um->dst_port_info_by_dst_port[is_ip4], dst_port);
+ return p ? vec_elt_at_index (um->dst_port_infos[is_ip4], p[0]) : 0;
+}
+
+format_function_t format_udp_header;
+format_function_t format_udp_rx_trace;
+
+unformat_function_t unformat_udp_header;
+
+void udp_register_dst_port (vlib_main_t * vm,
+ udp_dst_port_t dst_port,
+ u32 node_index, u8 is_ip4);
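+
+/* Usage sketch (illustrative): a VXLAN decap feature could claim its
+   well-known port with
+
+     udp_register_dst_port (vm, UDP_DST_PORT_vxlan,
+                            vxlan_input_node.index, 1);
+
+   where vxlan_input_node is assumed to be that feature's graph node and the
+   final argument is is_ip4. */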
+
+#endif /* included_udp_h */
+
diff --git a/vnet/vnet/ip/udp_error.def b/vnet/vnet/ip/udp_error.def
new file mode 100644
index 00000000000..46e3bd9ef47
--- /dev/null
+++ b/vnet/vnet/ip/udp_error.def
@@ -0,0 +1,20 @@
+/*
+ * udp_error.def: udp errors
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+udp_error (NONE, "no error")
+udp_error (NO_LISTENER, "no listener for dst port")
+udp_error (LENGTH_ERROR, "UDP packets with length errors")
diff --git a/vnet/vnet/ip/udp_format.c b/vnet/vnet/ip/udp_format.c
new file mode 100644
index 00000000000..dd54095908c
--- /dev/null
+++ b/vnet/vnet/ip/udp_format.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/udp_format.c: udp formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+/* Format UDP header. */
+u8 * format_udp_header (u8 * s, va_list * args)
+{
+ udp_header_t * udp = va_arg (*args, udp_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ uword indent;
+ u32 header_bytes = sizeof (udp[0]);
+
+ /* Nothing to do. */
+ if (max_header_bytes < sizeof (udp[0]))
+ return format (s, "UDP header truncated");
+
+ indent = format_get_indent (s);
+ indent += 2;
+
+ s = format (s, "UDP: %d -> %d",
+ clib_net_to_host_u16 (udp->src_port),
+ clib_net_to_host_u16 (udp->dst_port));
+
+ s = format (s, "\n%Ulength %d, checksum 0x%04x",
+ format_white_space, indent,
+ clib_net_to_host_u16 (udp->length),
+ clib_net_to_host_u16 (udp->checksum));
+
+ /* Recurse into next protocol layer. */
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ ip_main_t * im = &ip_main;
+ tcp_udp_port_info_t * pi;
+
+ pi = ip_get_tcp_udp_port_info (im, udp->dst_port);
+
+ if (pi && pi->format_header)
+ s = format (s, "\n%U%U",
+ format_white_space, indent - 2,
+ pi->format_header,
+ /* next protocol header */ (udp + 1),
+ max_header_bytes - sizeof (udp[0]));
+ }
+
+ return s;
+}
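+
+/* Example output (hypothetical port, length and checksum values):
+ *
+ *   UDP: 1024 -> 53
+ *     length 36, checksum 0x1d2e
+ */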
diff --git a/vnet/vnet/ip/udp_init.c b/vnet/vnet/ip/udp_init.c
new file mode 100644
index 00000000000..40ca032923c
--- /dev/null
+++ b/vnet/vnet/ip/udp_init.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/udp_init.c: udp initialization
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+clib_error_t *
+udp_init (vlib_main_t * vm)
+{
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+ clib_error_t * error;
+
+ error = vlib_call_init_function (vm, ip_main_init);
+
+ if (! error)
+ {
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_UDP);
+ if (pi == 0)
+ return clib_error_return (0, "UDP protocol info AWOL");
+ pi->format_header = format_udp_header;
+ pi->unformat_pg_edit = unformat_pg_udp_header;
+ }
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (udp_init);
diff --git a/vnet/vnet/ip/udp_local.c b/vnet/vnet/ip/udp_local.c
new file mode 100644
index 00000000000..c9355d2a322
--- /dev/null
+++ b/vnet/vnet/ip/udp_local.c
@@ -0,0 +1,508 @@
+/*
+ * node.c: udp packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ip/udp.h>
+#include <vnet/ip/udp_packet.h>
+#include <vppinfra/sparse_vec.h>
+
+udp_main_t udp_main;
+
+#define foreach_udp_input_next \
+ _ (PUNT, "error-punt") \
+ _ (DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) UDP_INPUT_NEXT_##s,
+ foreach_udp_input_next
+#undef _
+ UDP_INPUT_N_NEXT,
+} udp_input_next_t;
+
+typedef struct {
+ u16 src_port;
+ u16 dst_port;
+} udp_rx_trace_t;
+
+u8 * format_udp_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ udp_rx_trace_t * t = va_arg (*args, udp_rx_trace_t *);
+
+ s = format (s, "UDP: src-port %d dst-port %d",
+ clib_net_to_host_u16(t->src_port),
+ clib_net_to_host_u16(t->dst_port));
+ return s;
+}
+
+typedef struct {
+ /* Sparse vector mapping udp dst_port in network byte order
+ to next index. */
+ u16 * next_by_dst_port;
+
+ u32 * sparse_index_by_next_index;
+} udp_input_runtime_t;
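+
+/* Lookup sketch: the destination port (network byte order) indexes the
+ * sparse vector; an unregistered port maps to the invalid element, so
+ * the returned next index equals SPARSE_VEC_INVALID_INDEX (no listener):
+ *
+ *   i = sparse_vec_index (rt->next_by_dst_port, h->dst_port);
+ *   next = vec_elt (rt->next_by_dst_port, i);
+ */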
+
+vlib_node_registration_t udp4_input_node;
+vlib_node_registration_t udp6_input_node;
+
+always_inline uword
+udp46_input_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame,
+ int is_ip4)
+{
+ udp_input_runtime_t * rt = is_ip4 ?
+ (void *) vlib_node_get_runtime_data (vm, udp4_input_node.index)
+ : (void *) vlib_node_get_runtime_data (vm, udp6_input_node.index);
+ __attribute__((unused)) u32 n_left_from, next_index, i_next, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+ i_next = vec_elt (rt->sparse_index_by_next_index, next_index);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ udp_header_t * h0 = 0, * h1 = 0;
+ u32 i0, i1, dst_port0, dst_port1;
+ u32 advance0, advance1;
+ u32 error0, next0, error1, next1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* ip4/6_local hands us the ip header, not the udp header */
+ if (is_ip4)
+ {
+ advance0 = sizeof(ip4_header_t);
+ advance1 = sizeof(ip4_header_t);
+ }
+ else
+ {
+ advance0 = sizeof(ip6_header_t);
+ advance1 = sizeof(ip6_header_t);
+ }
+
+ if (PREDICT_FALSE(b0->current_length < advance0 + sizeof (h0[0])))
+ {
+ error0 = UDP_ERROR_LENGTH_ERROR;
+ next0 = UDP_INPUT_NEXT_DROP;
+ }
+ else
+ {
+ vlib_buffer_advance (b0, advance0);
+ h0 = vlib_buffer_get_current (b0);
+ error0 = next0 = 0;
+ }
+
+ if (PREDICT_FALSE(b1->current_length < advance1 + sizeof (h1[0])))
+ {
+ error1 = UDP_ERROR_LENGTH_ERROR;
+ next1 = UDP_INPUT_NEXT_DROP;
+ }
+ else
+ {
+ vlib_buffer_advance (b1, advance1);
+ h1 = vlib_buffer_get_current (b1);
+ error1 = next1 = 0;
+ }
+
+
+ /* Index sparse array with network byte order. */
+ dst_port0 = (error0 == 0) ? h0->dst_port : 0;
+ dst_port1 = (error1 == 0) ? h1->dst_port : 0;
+ sparse_vec_index2 (rt->next_by_dst_port, dst_port0, dst_port1,
+ &i0, &i1);
+ next0 = (error0 == 0) ? vec_elt(rt->next_by_dst_port, i0) : next0;
+ next1 = (error1 == 0) ? vec_elt(rt->next_by_dst_port, i1) : next1;
+
+ if (PREDICT_TRUE (error0 == 0))
+ b0->error = node->errors[next0 == SPARSE_VEC_INVALID_INDEX ? UDP_ERROR_NO_LISTENER : UDP_ERROR_NONE];
+ if (PREDICT_TRUE (error1 == 0))
+ b1->error = node->errors[next1 == SPARSE_VEC_INVALID_INDEX ? UDP_ERROR_NO_LISTENER : UDP_ERROR_NONE];
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ udp_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR])
+ {
+ tr->src_port = h0->src_port;
+ tr->dst_port = h0->dst_port;
+ }
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ udp_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ if (b1->error != node->errors[UDP_ERROR_LENGTH_ERROR])
+ {
+ tr->src_port = h1->src_port;
+ tr->dst_port = h1->dst_port;
+ }
+ }
+
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_buffer_advance (b1, sizeof (*h1));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ udp_header_t * h0 = 0;
+ u32 i0, next0;
+ u32 advance0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* ip4/6_local hands us the ip header, not the udp header */
+ if (is_ip4)
+ advance0 = sizeof(ip4_header_t);
+ else
+ advance0 = sizeof(ip6_header_t);
+
+ if (PREDICT_FALSE(b0->current_length < advance0 + sizeof (h0[0])))
+ {
+ b0->error = node->errors[UDP_ERROR_LENGTH_ERROR];
+ next0 = UDP_INPUT_NEXT_DROP;
+ goto trace_x1;
+ }
+
+ vlib_buffer_advance (b0, advance0);
+
+ h0 = vlib_buffer_get_current (b0);
+
+ if (PREDICT_TRUE
+ (clib_net_to_host_u16(h0->length) <= b0->current_length))
+ {
+ i0 = sparse_vec_index (rt->next_by_dst_port, h0->dst_port);
+ next0 = vec_elt(rt->next_by_dst_port, i0);
+
+ b0->error = node->errors [next0 == SPARSE_VEC_INVALID_INDEX ? UDP_ERROR_NO_LISTENER : UDP_ERROR_NONE];
+ }
+ else
+ {
+ b0->error = node->errors[UDP_ERROR_LENGTH_ERROR];
+ next0 = UDP_INPUT_NEXT_DROP;
+ }
+
+ trace_x1:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ udp_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR])
+ {
+ tr->src_port = h0->src_port;
+ tr->dst_port = h0->dst_port;
+ }
+ }
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return from_frame->n_vectors;
+}
+
+static char * udp_error_strings[] = {
+#define udp_error(n,s) s,
+#include "udp_error.def"
+#undef udp_error
+};
+
+static uword
+udp4_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return udp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */);
+}
+
+static uword
+udp6_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return udp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */);
+}
+
+
+VLIB_REGISTER_NODE (udp4_input_node) = {
+ .function = udp4_input,
+ .name = "ip4-udp-lookup",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof (udp_input_runtime_t),
+
+ .n_errors = UDP_N_ERROR,
+ .error_strings = udp_error_strings,
+
+ .n_next_nodes = UDP_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [UDP_INPUT_NEXT_##s] = n,
+ foreach_udp_input_next
+#undef _
+ },
+
+ .format_buffer = format_udp_header,
+ .format_trace = format_udp_rx_trace,
+ .unformat_buffer = unformat_udp_header,
+};
+
+VLIB_REGISTER_NODE (udp6_input_node) = {
+ .function = udp6_input,
+ .name = "ip6-udp-lookup",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof (udp_input_runtime_t),
+
+ .n_errors = UDP_N_ERROR,
+ .error_strings = udp_error_strings,
+
+ .n_next_nodes = UDP_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [UDP_INPUT_NEXT_##s] = n,
+ foreach_udp_input_next
+#undef _
+ },
+
+ .format_buffer = format_udp_header,
+ .format_trace = format_udp_rx_trace,
+ .unformat_buffer = unformat_udp_header,
+};
+
+static void add_dst_port (udp_main_t * um,
+ udp_dst_port_t dst_port,
+ char * dst_port_name, u8 is_ip4)
+{
+ udp_dst_port_info_t * pi;
+ u32 i;
+
+ vec_add2 (um->dst_port_infos[is_ip4], pi, 1);
+ i = pi - um->dst_port_infos[is_ip4];
+
+ pi->name = dst_port_name;
+ pi->dst_port = dst_port;
+ pi->next_index = pi->node_index = ~0;
+
+ hash_set (um->dst_port_info_by_dst_port[is_ip4], dst_port, i);
+
+ if (pi->name)
+ hash_set_mem (um->dst_port_info_by_name[is_ip4], pi->name, i);
+}
+
+void
+udp_register_dst_port (vlib_main_t * vm,
+ udp_dst_port_t dst_port,
+ u32 node_index, u8 is_ip4)
+{
+ udp_main_t * um = &udp_main;
+ udp_dst_port_info_t * pi;
+ udp_input_runtime_t * rt;
+ u16 * n;
+ u32 i;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, udp_local_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ pi = udp_get_dst_port_info (um, dst_port, is_ip4);
+ if (! pi)
+ {
+ add_dst_port (um, dst_port, 0, is_ip4);
+ pi = udp_get_dst_port_info (um, dst_port, is_ip4);
+ ASSERT (pi);
+ }
+
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ is_ip4 ? udp4_input_node.index
+ : udp6_input_node.index,
+ node_index);
+
+ /* Setup udp protocol -> next index sparse vector mapping. */
+ rt = vlib_node_get_runtime_data
+ (vm, is_ip4 ? udp4_input_node.index: udp6_input_node.index);
+ n = sparse_vec_validate (rt->next_by_dst_port,
+ clib_host_to_net_u16 (dst_port));
+ n[0] = pi->next_index;
+
+ /* Rebuild next index -> sparse index inverse mapping when sparse vector
+ is updated. */
+ vec_validate (rt->sparse_index_by_next_index, pi->next_index);
+ for (i = 1; i < vec_len (rt->next_by_dst_port); i++)
+ rt->sparse_index_by_next_index[rt->next_by_dst_port[i]] = i;
+}
+
+/* Parse a UDP header. */
+uword unformat_udp_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ udp_header_t * udp;
+ __attribute__((unused)) int old_length;
+ u16 src_port, dst_port;
+
+ /* Allocate space for UDP header. */
+ {
+ void * p;
+
+ old_length = vec_len (*result);
+ vec_add2 (*result, p, sizeof (udp_header_t));
+ udp = p;
+ }
+
+ memset (udp, 0, sizeof (udp[0]));
+ if (unformat (input, "src-port %d dst-port %d",
+ &src_port, &dst_port))
+ {
+ udp->src_port = clib_host_to_net_u16 (src_port);
+ udp->dst_port = clib_host_to_net_u16 (dst_port);
+ return 1;
+ }
+ return 0;
+}
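+
+/* Accepts input of the form "src-port 53 dst-port 1024" (hypothetical
+   port numbers); the length and checksum fields are left zero. */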
+
+static void
+udp_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_udp_header;
+ n->unformat_buffer = unformat_udp_header;
+ pn->unformat_edit = unformat_pg_udp_header;
+}
+
+clib_error_t * udp_local_init (vlib_main_t * vm)
+{
+ udp_input_runtime_t * rt;
+ udp_main_t * um = &udp_main;
+ int i;
+
+ {
+ clib_error_t * error;
+ error = vlib_call_init_function (vm, udp_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+
+ for (i = 0; i < 2; i++)
+ {
+ um->dst_port_info_by_name[i] = hash_create_string (0, sizeof(uword));
+ um->dst_port_info_by_dst_port[i] = hash_create (0, sizeof(uword));
+ }
+
+ udp_setup_node (vm, udp4_input_node.index);
+ udp_setup_node (vm, udp6_input_node.index);
+
+ rt = vlib_node_get_runtime_data (vm, udp4_input_node.index);
+
+ rt->next_by_dst_port = sparse_vec_new
+ (/* elt bytes */ sizeof (rt->next_by_dst_port[0]),
+ /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
+
+ vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_DROP);
+ vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_PUNT);
+ rt->sparse_index_by_next_index[UDP_INPUT_NEXT_DROP]
+ = SPARSE_VEC_INVALID_INDEX;
+ rt->sparse_index_by_next_index[UDP_INPUT_NEXT_PUNT]
+ = SPARSE_VEC_INVALID_INDEX;
+
+#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */);
+ foreach_udp4_dst_port
+#undef _
+
+ rt = vlib_node_get_runtime_data (vm, udp6_input_node.index);
+
+ rt->next_by_dst_port = sparse_vec_new
+ (/* elt bytes */ sizeof (rt->next_by_dst_port[0]),
+ /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
+
+ vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_DROP);
+ vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_PUNT);
+ rt->sparse_index_by_next_index[UDP_INPUT_NEXT_DROP]
+ = SPARSE_VEC_INVALID_INDEX;
+ rt->sparse_index_by_next_index[UDP_INPUT_NEXT_PUNT]
+ = SPARSE_VEC_INVALID_INDEX;
+
+#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */);
+ foreach_udp6_dst_port
+#undef _
+
+ ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index);
+ /* Note: unlike ip4, ip6 hotwires UDP straight to ip6-udp-lookup,
+ so no ip6_register_protocol call is needed here */
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (udp_local_init);
diff --git a/vnet/vnet/ip/udp_packet.h b/vnet/vnet/ip/udp_packet.h
new file mode 100644
index 00000000000..21c30c6eb71
--- /dev/null
+++ b/vnet/vnet/ip/udp_packet.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip4/udp_packet.h: UDP packet format
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_udp_packet_h
+#define included_udp_packet_h
+
+typedef struct {
+ /* Source and destination port. */
+ u16 src_port, dst_port;
+
+ /* Length of UDP header plus payload. */
+ u16 length;
+
+ /* Checksum of UDP pseudo-header and data or
+ zero if checksum is disabled. */
+ u16 checksum;
+} udp_header_t;
+
+#endif /* included_udp_packet_h */
+
diff --git a/vnet/vnet/ip/udp_pg.c b/vnet/vnet/ip/udp_pg.c
new file mode 100644
index 00000000000..a33a56294fb
--- /dev/null
+++ b/vnet/vnet/ip/udp_pg.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/udp_pg: UDP packet-generator interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/pg/pg.h>
+#include <vnet/ip/ip.h> /* for unformat_tcp_udp_port */
+
+#define UDP_PG_EDIT_LENGTH (1 << 0)
+#define UDP_PG_EDIT_CHECKSUM (1 << 1)
+
+always_inline void
+udp_pg_edit_function_inline (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets,
+ u32 flags)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 ip_offset, udp_offset;
+
+ udp_offset = g->start_byte_offset;
+ ip_offset = (g-1)->start_byte_offset;
+
+ while (n_packets >= 1)
+ {
+ vlib_buffer_t * p0;
+ ip4_header_t * ip0;
+ udp_header_t * udp0;
+ u32 udp_len0;
+
+ p0 = vlib_get_buffer (vm, packets[0]);
+ n_packets -= 1;
+ packets += 1;
+
+ ip0 = (void *) (p0->data + ip_offset);
+ udp0 = (void *) (p0->data + udp_offset);
+ udp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]);
+
+ if (flags & UDP_PG_EDIT_LENGTH)
+ udp0->length =
+ clib_net_to_host_u16 (vlib_buffer_length_in_chain (vm, p0)
+ - ip_offset);
+
+ /* Initialize checksum with the IPv4 pseudo-header:
+ src/dst addresses (read as a single u64), protocol and UDP length. */
+ if (flags & UDP_PG_EDIT_CHECKSUM)
+ {
+ ip_csum_t sum0;
+
+ sum0 = clib_mem_unaligned (&ip0->src_address, u64);
+
+ sum0 = ip_csum_with_carry
+ (sum0, clib_host_to_net_u32 (udp_len0 + (ip0->protocol << 16)));
+
+ /* Invalidate possibly old checksum. */
+ udp0->checksum = 0;
+
+ sum0 = ip_incremental_checksum_buffer (vm, p0, udp_offset, udp_len0, sum0);
+
+ sum0 = ~ ip_csum_fold (sum0);
+
+ /* Zero checksum means checksumming disabled. */
+ sum0 = sum0 != 0 ? sum0 : 0xffff;
+
+ udp0->checksum = sum0;
+ }
+ }
+}
+
+static void
+udp_pg_edit_function (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_edit_group_t * g,
+ u32 * packets,
+ u32 n_packets)
+{
+ switch (g->edit_function_opaque)
+ {
+ case UDP_PG_EDIT_LENGTH:
+ udp_pg_edit_function_inline (pg, s, g, packets, n_packets,
+ UDP_PG_EDIT_LENGTH);
+ break;
+
+ case UDP_PG_EDIT_CHECKSUM:
+ udp_pg_edit_function_inline (pg, s, g, packets, n_packets,
+ UDP_PG_EDIT_CHECKSUM);
+ break;
+
+ case UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH:
+ udp_pg_edit_function_inline (pg, s, g, packets, n_packets,
+ UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH);
+ break;
+
+ default:
+ ASSERT (0);
+ break;
+ }
+}
+
+typedef struct {
+ pg_edit_t src_port, dst_port;
+ pg_edit_t length;
+ pg_edit_t checksum;
+} pg_udp_header_t;
+
+static inline void
+pg_udp_header_init (pg_udp_header_t * p)
+{
+ /* Initialize fields that are not bit fields in the IP header. */
+#define _(f) pg_edit_init (&p->f, udp_header_t, f);
+ _ (src_port);
+ _ (dst_port);
+ _ (length);
+ _ (checksum);
+#undef _
+}
+
+uword
+unformat_pg_udp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_udp_header_t * p;
+ u32 group_index;
+
+ p = pg_create_edit_group (s, sizeof (p[0]), sizeof (udp_header_t),
+ &group_index);
+ pg_udp_header_init (p);
+
+ /* Defaults. */
+ p->checksum.type = PG_EDIT_UNSPECIFIED;
+ p->length.type = PG_EDIT_UNSPECIFIED;
+
+ if (! unformat (input, "UDP: %U -> %U",
+ unformat_pg_edit,
+ unformat_tcp_udp_port, &p->src_port,
+ unformat_pg_edit,
+ unformat_tcp_udp_port, &p->dst_port))
+ goto error;
+
+ /* Parse options. */
+ while (1)
+ {
+ if (unformat (input, "length %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->length))
+ ;
+
+ else if (unformat (input, "checksum %U",
+ unformat_pg_edit,
+ unformat_pg_number, &p->checksum))
+ ;
+
+ /* Can't parse input: try next protocol level. */
+ else
+ break;
+ }
+
+ {
+ ip_main_t * im = &ip_main;
+ u16 dst_port;
+ tcp_udp_port_info_t * pi;
+
+ pi = 0;
+ if (p->dst_port.type == PG_EDIT_FIXED)
+ {
+ dst_port = pg_edit_get_value (&p->dst_port, PG_EDIT_LO);
+ pi = ip_get_tcp_udp_port_info (im, dst_port);
+ }
+
+ if (pi && pi->unformat_pg_edit
+ && unformat_user (input, pi->unformat_pg_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto error;
+
+ p = pg_get_edit_group (s, group_index);
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED
+ || p->length.type == PG_EDIT_UNSPECIFIED)
+ {
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ g->edit_function = udp_pg_edit_function;
+ g->edit_function_opaque = 0;
+ if (p->checksum.type == PG_EDIT_UNSPECIFIED)
+ g->edit_function_opaque |= UDP_PG_EDIT_CHECKSUM;
+ if (p->length.type == PG_EDIT_UNSPECIFIED)
+ g->edit_function_opaque |= UDP_PG_EDIT_LENGTH;
+ }
+
+ return 1;
+ }
+
+ error:
+ /* Free up any edits we may have added. */
+ pg_free_edit_group (s);
+ return 0;
+}
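+
+/* Stream syntax sketch (hypothetical values): "UDP: 1234 -> 5678",
+   optionally followed by "length <n>" and/or "checksum <n>"; any field
+   left unspecified is computed per packet by udp_pg_edit_function. */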
+
diff --git a/vnet/vnet/ipsec/esp.h b/vnet/vnet/ipsec/esp.h
new file mode 100644
index 00000000000..3d46a013b5d
--- /dev/null
+++ b/vnet/vnet/ipsec/esp.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/devices/dpdk/dpdk.h>
+
+#include <openssl/hmac.h>
+#include <openssl/rand.h>
+#include <openssl/evp.h>
+
+typedef struct {
+ u32 spi;
+ u32 seq;
+ u8 data[0];
+} esp_header_t;
+
+typedef struct {
+ u8 pad_length;
+ u8 next_header;
+} esp_footer_t;
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4;
+ esp_header_t esp;
+}) ip4_and_esp_header_t;
+
+typedef CLIB_PACKED (struct {
+ ip6_header_t ip6;
+ esp_header_t esp;
+}) ip6_and_esp_header_t;
+
+typedef struct {
+ const EVP_CIPHER * type;
+} esp_crypto_alg_t;
+
+typedef struct {
+ const EVP_MD * md;
+ u8 trunc_size;
+} esp_integ_alg_t;
+
+
+typedef struct {
+ esp_crypto_alg_t * esp_crypto_algs;
+ esp_integ_alg_t * esp_integ_algs;
+ EVP_CIPHER_CTX encrypt_ctx;
+ EVP_CIPHER_CTX decrypt_ctx;
+ HMAC_CTX hmac_ctx;
+ ipsec_crypto_alg_t last_encrypt_alg;
+ ipsec_crypto_alg_t last_decrypt_alg;
+ ipsec_integ_alg_t last_integ_alg;
+} esp_main_t;
+
+esp_main_t esp_main;
+
+always_inline void
+esp_init()
+{
+ esp_main_t * em = &esp_main;
+
+ memset (em, 0, sizeof (em[0]));
+
+ vec_validate(em->esp_crypto_algs, IPSEC_CRYPTO_N_ALG - 1);
+ em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_128].type = EVP_aes_128_cbc();
+ em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_192].type = EVP_aes_192_cbc();
+ em->esp_crypto_algs[IPSEC_CRYPTO_ALG_AES_CBC_256].type = EVP_aes_256_cbc();
+
+ vec_validate(em->esp_integ_algs, IPSEC_INTEG_N_ALG - 1);
+ esp_integ_alg_t * i;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA1_96];
+ i->md = EVP_sha1();
+ i->trunc_size = 12;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_256_96];
+ i->md = EVP_sha256();
+ i->trunc_size = 12;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_256_128];
+ i->md = EVP_sha256();
+ i->trunc_size = 16;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_384_192];
+ i->md = EVP_sha384();
+ i->trunc_size = 24;
+
+ i = &em->esp_integ_algs[IPSEC_INTEG_ALG_SHA_512_256];
+ i->md = EVP_sha512();
+ i->trunc_size = 32;
+
+ EVP_CIPHER_CTX_init(&(em->encrypt_ctx));
+ EVP_CIPHER_CTX_init(&(em->decrypt_ctx));
+ HMAC_CTX_init(&(em->hmac_ctx));
+}
+
+always_inline unsigned int
+hmac_calc(ipsec_integ_alg_t alg,
+ u8 * key,
+ int key_len,
+ u8 * data,
+ int data_len,
+ u8 * signature,
+ u8 use_esn,
+ u32 seq_hi)
+{
+ esp_main_t * em = &esp_main;
+ HMAC_CTX * ctx = &(em->hmac_ctx);
+ const EVP_MD * md = NULL;
+ unsigned int len;
+
+ ASSERT(alg < IPSEC_INTEG_N_ALG);
+
+ if (PREDICT_FALSE(em->esp_integ_algs[alg].md == 0))
+ return 0;
+
+ if (PREDICT_FALSE(alg != em->last_integ_alg)) {
+ md = em->esp_integ_algs[alg].md;
+ em->last_integ_alg = alg;
+ }
+
+ HMAC_Init(ctx, key, key_len, md);
+
+ HMAC_Update(ctx, data, data_len);
+
+ if (PREDICT_TRUE(use_esn))
+ HMAC_Update(ctx, (u8 *) &seq_hi, sizeof(seq_hi));
+ HMAC_Final(ctx, signature, &len);
+
+ return em->esp_integ_algs[alg].trunc_size;
+}
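+
+/* Usage note: the full digest is written to `signature', so callers must
+   supply a buffer sized for the untruncated digest (e.g. 64 bytes for
+   SHA-512); only the returned trunc_size bytes are used as the ICV. */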
+
diff --git a/vnet/vnet/ipsec/esp_decrypt.c b/vnet/vnet/ipsec/esp_decrypt.c
new file mode 100644
index 00000000000..ad511b0fba3
--- /dev/null
+++ b/vnet/vnet/ipsec/esp_decrypt.c
@@ -0,0 +1,424 @@
+/*
+ * esp_decrypt.c : IPSec ESP decrypt node
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+#define ESP_WINDOW_SIZE 64
+
+#define foreach_esp_decrypt_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input")
+
+#define _(v, s) ESP_DECRYPT_NEXT_##v,
+typedef enum {
+ foreach_esp_decrypt_next
+#undef _
+ ESP_DECRYPT_N_NEXT,
+} esp_decrypt_next_t;
+
+
+#define foreach_esp_decrypt_error \
+ _(RX_PKTS, "ESP pkts received") \
+ _(NO_BUFFER, "No buffer (packet dropped)") \
+ _(DECRYPTION_FAILED, "ESP decryption failed") \
+ _(INTEG_ERROR, "Integrity check failed") \
+ _(REPLAY, "SA replayed packet")
+
+
+typedef enum {
+#define _(sym,str) ESP_DECRYPT_ERROR_##sym,
+ foreach_esp_decrypt_error
+#undef _
+ ESP_DECRYPT_N_ERROR,
+} esp_decrypt_error_t;
+
+static char * esp_decrypt_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_decrypt_error
+#undef _
+};
+
+vlib_node_registration_t esp_decrypt_node;
+
+typedef struct {
+ ipsec_crypto_alg_t crypto_alg;
+ ipsec_integ_alg_t integ_alg;
+} esp_decrypt_trace_t;
+
+/* packet trace format function */
+static u8 * format_esp_decrypt_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ esp_decrypt_trace_t * t = va_arg (*args, esp_decrypt_trace_t *);
+
+ s = format (s, "esp: crypto %U integrity %U",
+ format_ipsec_crypto_alg, t->crypto_alg,
+ format_ipsec_integ_alg, t->integ_alg);
+ return s;
+}
+
+always_inline void
+esp_decrypt_aes_cbc(ipsec_crypto_alg_t alg,
+ u8 * in,
+ u8 * out,
+ size_t in_len,
+ u8 * key,
+ u8 * iv)
+{
+ esp_main_t * em = &esp_main;
+ EVP_CIPHER_CTX * ctx = &(em->decrypt_ctx);
+ const EVP_CIPHER * cipher = NULL;
+ int out_len;
+
+ ASSERT(alg < IPSEC_CRYPTO_N_ALG);
+
+ if (PREDICT_FALSE(em->esp_crypto_algs[alg].type == 0))
+ return;
+
+ if (PREDICT_FALSE(alg != em->last_decrypt_alg)) {
+ cipher = em->esp_crypto_algs[alg].type;
+ em->last_decrypt_alg = alg;
+ }
+
+ EVP_DecryptInit_ex(ctx, cipher, NULL, key, iv);
+
+ EVP_DecryptUpdate(ctx, out, &out_len, in, in_len);
+ EVP_DecryptFinal_ex(ctx, out + out_len, &out_len);
+}
+
+always_inline int
+esp_replay_check (ipsec_sa_t * sa, u32 seq)
+{
+ u32 diff;
+
+ if (PREDICT_TRUE(seq > sa->last_seq))
+ return 0;
+
+ diff = sa->last_seq - seq;
+
+ if (ESP_WINDOW_SIZE > diff)
+ return (sa->replay_window & (1ULL << diff)) ? 1 : 0;
+ else
+ return 1;
+
+ return 0;
+}
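+
+/* Worked example: with last_seq = 100 and ESP_WINDOW_SIZE = 64, seq = 70
+   gives diff = 30, a replay only if bit 30 of replay_window is set;
+   seq = 20 gives diff = 80, outside the window, always flagged as a
+   replay. */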
+
+always_inline int
+esp_replay_check_esn (ipsec_sa_t * sa, u32 seq)
+{
+ u32 tl = sa->last_seq;
+ u32 th = sa->last_seq_hi;
+ u32 diff = tl - seq;
+
+ if (PREDICT_TRUE(tl >= (ESP_WINDOW_SIZE - 1)))
+ {
+ if (seq >= (tl - ESP_WINDOW_SIZE + 1))
+ {
+ sa->seq_hi = th;
+ if (seq <= tl)
+ return (sa->replay_window & (1ULL << diff)) ? 1 : 0;
+ else
+ return 0;
+ }
+ else
+ {
+ sa->seq_hi = th + 1;
+ return 0;
+ }
+ }
+ else
+ {
+ if (seq >= (tl - ESP_WINDOW_SIZE + 1))
+ {
+ sa->seq_hi = th - 1;
+ return (sa->replay_window & (1ULL << diff)) ? 1 : 0;
+ }
+ else
+ {
+ sa->seq_hi = th;
+ if (seq <= tl)
+ return (sa->replay_window & (1ULL << diff)) ? 1 : 0;
+ else
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+always_inline void
+esp_replay_advance (ipsec_sa_t * sa, u32 seq)
+{
+ u32 pos;
+
+ if (seq > sa->last_seq)
+ {
+ pos = seq - sa->last_seq;
+ if (pos < ESP_WINDOW_SIZE)
+ sa->replay_window = ((sa->replay_window) << pos) | 1;
+ else
+ sa->replay_window = 1;
+ sa->last_seq = seq;
+ }
+ else
+ {
+ pos = sa->last_seq - seq;
+ sa->replay_window |= (1ULL << pos);
+ }
+}
+
+always_inline void
+esp_replay_advance_esn (ipsec_sa_t * sa, u32 seq)
+{
+ int wrap = sa->seq_hi - sa->last_seq_hi;
+ u32 pos;
+
+ if (wrap == 0 && seq > sa->last_seq)
+ {
+ pos = seq - sa->last_seq;
+ if (pos < ESP_WINDOW_SIZE)
+ sa->replay_window = ((sa->replay_window) << pos) | 1;
+ else
+ sa->replay_window = 1;
+ sa->last_seq = seq;
+ }
+ else if (wrap > 0)
+ {
+ pos = ~seq + sa->last_seq + 1;
+ if (pos < ESP_WINDOW_SIZE)
+ sa->replay_window = ((sa->replay_window) << pos) | 1;
+ else
+ sa->replay_window = 1;
+ sa->last_seq = seq;
+ sa->last_seq_hi = sa->seq_hi;
+ }
+ else if (wrap < 0)
+ {
+ pos = ~seq + sa->last_seq + 1;
+ sa->replay_window |= (1ULL << pos);
+ }
+ else
+ {
+ pos = sa->last_seq - seq;
+ sa->replay_window |= (1ULL << pos);
+ }
+}
+
+static uword
+esp_decrypt_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, next_index, *to_next;
+ ipsec_main_t *im = &ipsec_main;
+ esp_main_t *em = &esp_main;
+ u32 * recycle = 0;
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ ipsec_alloc_empty_buffers(vm, im);
+
+ if (PREDICT_FALSE(vec_len (im->empty_buffers) < n_left_from)){
+ vlib_node_increment_counter (vm, esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_NO_BUFFER, n_left_from);
+ goto free_buffers_and_exit;
+ }
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 i_bi0, o_bi0 = (u32) ~0, next0;
+ vlib_buffer_t * i_b0;
+ vlib_buffer_t * o_b0 = 0;
+ esp_header_t * esp0;
+ ipsec_sa_t * sa0;
+ u32 sa_index0 = ~0;
+ u32 seq;
+
+ i_bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ next0 = ESP_DECRYPT_NEXT_DROP;
+
+ i_b0 = vlib_get_buffer (vm, i_bi0);
+ esp0 = vlib_buffer_get_current (i_b0);
+
+ sa_index0 = vnet_buffer(i_b0)->output_features.ipsec_sad_index;
+ sa0 = pool_elt_at_index (im->sad, sa_index0);
+
+ seq = clib_host_to_net_u32(esp0->seq);
+
+ /* anti-replay check */
+ if (sa0->use_anti_replay)
+ {
+ int rv = 0;
+
+ if (PREDICT_TRUE(sa0->use_esn))
+ rv = esp_replay_check_esn(sa0, seq);
+ else
+ rv = esp_replay_check(sa0, seq);
+
+ if (PREDICT_FALSE(rv))
+ {
+ clib_warning("anti-replay SPI %u seq %u", sa0->spi, seq);
+ vlib_node_increment_counter (vm, esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_REPLAY, 1);
+ o_bi0 = i_bi0;
+ goto trace;
+ }
+ }
+
+ if (PREDICT_TRUE(sa0->integ_alg != IPSEC_INTEG_ALG_NONE))
+ {
+ u8 sig[64];
+ int icv_size = em->esp_integ_algs[sa0->integ_alg].trunc_size;
+ memset(sig, 0, sizeof(sig));
+ u8 * icv = vlib_buffer_get_current (i_b0) + i_b0->current_length - icv_size;
+ i_b0->current_length -= icv_size;
+
+ hmac_calc(sa0->integ_alg, sa0->integ_key, sa0->integ_key_len,
+ (u8 *) esp0, i_b0->current_length, sig, sa0->use_esn,
+ sa0->seq_hi);
+
+ if (PREDICT_FALSE(memcmp(icv, sig, icv_size)))
+ {
+ vlib_node_increment_counter (vm, esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_INTEG_ERROR, 1);
+ o_bi0 = i_bi0;
+ goto trace;
+ }
+ }
+
+ if (PREDICT_TRUE(sa0->use_anti_replay))
+ {
+ if (PREDICT_TRUE(sa0->use_esn))
+ esp_replay_advance_esn(sa0, seq);
+ else
+ esp_replay_advance(sa0, seq);
+ }
+
+ /* grab free buffer */
+ uword last_empty_buffer = vec_len (im->empty_buffers) - 1;
+ o_bi0 = im->empty_buffers[last_empty_buffer];
+ o_b0 = vlib_get_buffer (vm, o_bi0);
+ vlib_prefetch_buffer_with_index (vm, im->empty_buffers[last_empty_buffer-1], STORE);
+ _vec_len (im->empty_buffers) = last_empty_buffer;
+
+ /* add old buffer to the recycle list */
+ vec_add1(recycle, i_bi0);
+
+ if (sa0->crypto_alg >= IPSEC_CRYPTO_ALG_AES_CBC_128 &&
+ sa0->crypto_alg <= IPSEC_CRYPTO_ALG_AES_CBC_256) {
+ const int BLOCK_SIZE = 16;
+ const int IV_SIZE = 16;
+ esp_footer_t * f0;
+
+ int blocks = (i_b0->current_length - sizeof (esp_header_t) - IV_SIZE) / BLOCK_SIZE;
+
+ o_b0->current_data = sizeof(ethernet_header_t);
+
+ esp_decrypt_aes_cbc(sa0->crypto_alg,
+ esp0->data + IV_SIZE,
+ (u8 *) vlib_buffer_get_current (o_b0),
+ BLOCK_SIZE * blocks,
+ sa0->crypto_key,
+ esp0->data);
+
+ o_b0->current_length = (blocks * BLOCK_SIZE) - 2;
+ o_b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+ f0 = (esp_footer_t *) ((u8 *) vlib_buffer_get_current (o_b0) + o_b0->current_length);
+ o_b0->current_length -= f0->pad_length;
+ if (PREDICT_TRUE(f0->next_header == IP_PROTOCOL_IP_IN_IP))
+ next0 = ESP_DECRYPT_NEXT_IP4_INPUT;
+ else if (f0->next_header == IP_PROTOCOL_IPV6)
+ next0 = ESP_DECRYPT_NEXT_IP6_INPUT;
+ else
+ {
+ clib_warning("next header: 0x%x", f0->next_header);
+ vlib_node_increment_counter (vm, esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_DECRYPTION_FAILED,
+ 1);
+ o_b0 = 0;
+ goto trace;
+ }
+
+ to_next[0] = o_bi0;
+ to_next += 1;
+
+ vnet_buffer (o_b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ }
+
+trace:
+ if (PREDICT_FALSE(i_b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ if (o_b0) {
+ o_b0->flags |= VLIB_BUFFER_IS_TRACED;
+ o_b0->trace_index = i_b0->trace_index;
+ }
+ esp_decrypt_trace_t *tr = vlib_add_trace (vm, node, o_b0, sizeof (*tr));
+ tr->crypto_alg = sa0->crypto_alg;
+ tr->integ_alg = sa0->integ_alg;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, o_bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, esp_decrypt_node.index,
+ ESP_DECRYPT_ERROR_RX_PKTS,
+ from_frame->n_vectors);
+
+free_buffers_and_exit:
+ vlib_buffer_free (vm, recycle, vec_len(recycle));
+ vec_free(recycle);
+ return from_frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (esp_decrypt_node) = {
+ .function = esp_decrypt_node_fn,
+ .name = "esp-decrypt",
+ .vector_size = sizeof (u32),
+ .format_trace = format_esp_decrypt_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(esp_decrypt_error_strings),
+ .error_strings = esp_decrypt_error_strings,
+
+ .n_next_nodes = ESP_DECRYPT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ESP_DECRYPT_NEXT_##s] = n,
+ foreach_esp_decrypt_next
+#undef _
+ },
+};
+
diff --git a/vnet/vnet/ipsec/esp_encrypt.c b/vnet/vnet/ipsec/esp_encrypt.c
new file mode 100644
index 00000000000..68add4c3d57
--- /dev/null
+++ b/vnet/vnet/ipsec/esp_encrypt.c
@@ -0,0 +1,386 @@
+/*
+ * esp_encrypt.c : IPSec ESP encrypt node
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+#define ESP_SEQ_MAX (4294967295UL)
+
+#define foreach_esp_encrypt_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input") \
+_(INTERFACE_OUTPUT, "interface-output")
+
+#define _(v, s) ESP_ENCRYPT_NEXT_##v,
+typedef enum {
+ foreach_esp_encrypt_next
+#undef _
+ ESP_ENCRYPT_N_NEXT,
+} esp_encrypt_next_t;
+
+#define foreach_esp_encrypt_error \
+ _(RX_PKTS, "ESP pkts received") \
+ _(NO_BUFFER, "No buffer (packet dropped)") \
+ _(ENCRYPTION_FAILED, "ESP encryption failed") \
+ _(SEQ_CYCLED, "sequence number cycled")
+
+
+typedef enum {
+#define _(sym,str) ESP_ENCRYPT_ERROR_##sym,
+ foreach_esp_encrypt_error
+#undef _
+ ESP_ENCRYPT_N_ERROR,
+} esp_encrypt_error_t;
+
+static char * esp_encrypt_error_strings[] = {
+#define _(sym,string) string,
+ foreach_esp_encrypt_error
+#undef _
+};
+
+vlib_node_registration_t esp_encrypt_node;
+
+typedef struct {
+ u32 spi;
+ u32 seq;
+ ipsec_crypto_alg_t crypto_alg;
+ ipsec_integ_alg_t integ_alg;
+} esp_encrypt_trace_t;
+
+/* packet trace format function */
+static u8 * format_esp_encrypt_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ esp_encrypt_trace_t * t = va_arg (*args, esp_encrypt_trace_t *);
+
+ s = format (s, "esp: spi %u seq %u crypto %U integrity %U",
+ t->spi, t->seq,
+ format_ipsec_crypto_alg, t->crypto_alg,
+ format_ipsec_integ_alg, t->integ_alg);
+ return s;
+}
+
+always_inline void
+esp_encrypt_aes_cbc(ipsec_crypto_alg_t alg,
+ u8 * in,
+ u8 * out,
+ size_t in_len,
+ u8 * key,
+ u8 * iv)
+{
+ esp_main_t * em = &esp_main;
+ EVP_CIPHER_CTX * ctx = &(em->encrypt_ctx);
+ const EVP_CIPHER * cipher = NULL;
+ int out_len;
+
+ ASSERT(alg < IPSEC_CRYPTO_N_ALG);
+
+ if (PREDICT_FALSE(em->esp_crypto_algs[alg].type == 0))
+ return;
+
+ if (PREDICT_FALSE(alg != em->last_encrypt_alg)) {
+ cipher = em->esp_crypto_algs[alg].type;
+ em->last_encrypt_alg = alg;
+ }
+
+ EVP_EncryptInit_ex(ctx, cipher, NULL, key, iv);
+
+ EVP_EncryptUpdate(ctx, out, &out_len, in, in_len);
+ EVP_EncryptFinal_ex(ctx, out + out_len, &out_len);
+}
+
+always_inline int
+esp_seq_advance (ipsec_sa_t * sa)
+{
+ if (PREDICT_TRUE(sa->use_esn))
+ {
+ if (PREDICT_FALSE(sa->seq == ESP_SEQ_MAX))
+ {
+ if (PREDICT_FALSE(sa->use_anti_replay && sa->seq_hi == ESP_SEQ_MAX))
+ return 1;
+ sa->seq_hi++;
+ }
+ sa->seq++;
+ }
+ else
+ {
+ if (PREDICT_FALSE(sa->use_anti_replay && sa->seq == ESP_SEQ_MAX))
+ return 1;
+ sa->seq++;
+ }
+
+ return 0;
+}
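+
+/* With extended sequence numbers (RFC 4303 ESN) the effective counter is
+   the 64-bit value (seq_hi << 32) | seq; only the low 32 bits are sent
+   in the ESP header, while seq_hi is folded into the ICV by hmac_calc
+   but never transmitted. */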
+
+static uword
+esp_encrypt_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, *from, * to_next = 0, next_index;
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ ipsec_main_t *im = &ipsec_main;
+ u32 * recycle = 0;
+
+ ipsec_alloc_empty_buffers(vm, im);
+
+ if (PREDICT_FALSE(vec_len (im->empty_buffers) < n_left_from)){
+ vlib_node_increment_counter (vm, esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_NO_BUFFER, n_left_from);
+ clib_warning("no enough empty buffers. discarding frame");
+ goto free_buffers_and_exit;
+ }
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 i_bi0, o_bi0, next0;
+ vlib_buffer_t * i_b0, *o_b0 = 0;
+ u32 sa_index0;
+ ipsec_sa_t * sa0;
+ ip4_and_esp_header_t * ih0, * oh0 = 0;
+ ip6_and_esp_header_t * ih6_0, * oh6_0 = 0;
+ uword last_empty_buffer;
+ esp_header_t * o_esp0;
+ esp_footer_t *f0;
+ u8 is_ipv6;
+ u8 ip_hdr_size;
+ u8 next_hdr_type;
+
+ i_bi0 = from[0];
+ from += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ next0 = ESP_ENCRYPT_NEXT_DROP;
+
+ i_b0 = vlib_get_buffer (vm, i_bi0);
+ sa_index0 = vnet_buffer(i_b0)->output_features.ipsec_sad_index;
+ sa0 = pool_elt_at_index(im->sad, sa_index0);
+
+ if (PREDICT_FALSE(esp_seq_advance(sa0)))
+ {
+ clib_warning("sequence number counter has cycled SPI %u", sa0->spi);
+ vlib_node_increment_counter (vm, esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_SEQ_CYCLED, 1);
+ //TODO: rekey SA
+ o_bi0 = i_bi0;
+ goto trace;
+ }
+
+ /* grab free buffer */
+ last_empty_buffer = vec_len (im->empty_buffers) - 1;
+ o_bi0 = im->empty_buffers[last_empty_buffer];
+ o_b0 = vlib_get_buffer (vm, o_bi0);
+ o_b0->current_data = sizeof(ethernet_header_t);
+ ih0 = vlib_buffer_get_current (i_b0);
+ vlib_prefetch_buffer_with_index (vm, im->empty_buffers[last_empty_buffer-1], STORE);
+ _vec_len (im->empty_buffers) = last_empty_buffer;
+ to_next[0] = o_bi0;
+ to_next += 1;
+
+ /* add old buffer to the recycle list */
+ vec_add1(recycle, i_bi0);
+
+ /* is ipv6 */
+ if (PREDICT_FALSE((ih0->ip4.ip_version_and_header_length & 0xF0 ) == 0x60))
+ {
+ is_ipv6 = 1;
+ ih6_0 = vlib_buffer_get_current (i_b0);
+ ip_hdr_size = sizeof(ip6_header_t);
+ next_hdr_type = IP_PROTOCOL_IPV6;
+ oh6_0 = vlib_buffer_get_current (o_b0);
+ o_esp0 = vlib_buffer_get_current (o_b0) + sizeof(ip6_header_t);
+
+ oh6_0->ip6.ip_version_traffic_class_and_flow_label =
+ ih6_0->ip6.ip_version_traffic_class_and_flow_label;
+ oh6_0->ip6.protocol = IP_PROTOCOL_IPSEC_ESP;
+ oh6_0->ip6.hop_limit = 254;
+ oh6_0->esp.spi = clib_net_to_host_u32(sa0->spi);
+ oh6_0->esp.seq = clib_net_to_host_u32(sa0->seq);
+ }
+ else
+ {
+ is_ipv6 = 0;
+ ip_hdr_size = sizeof(ip4_header_t);
+ next_hdr_type = IP_PROTOCOL_IP_IN_IP;
+ oh0 = vlib_buffer_get_current (o_b0);
+ o_esp0 = vlib_buffer_get_current (o_b0) + sizeof(ip4_header_t);
+
+ oh0->ip4.ip_version_and_header_length = 0x45;
+ oh0->ip4.tos = ih0->ip4.tos;
+ oh0->ip4.fragment_id = 0;
+ oh0->ip4.flags_and_fragment_offset = 0;
+ oh0->ip4.ttl = 254;
+ oh0->ip4.protocol = IP_PROTOCOL_IPSEC_ESP;
+ oh0->esp.spi = clib_net_to_host_u32(sa0->spi);
+ oh0->esp.seq = clib_net_to_host_u32(sa0->seq);
+ }
+
+ if (PREDICT_TRUE(sa0->is_tunnel && !sa0->is_tunnel_ip6))
+ {
+ oh0->ip4.src_address.as_u32 = sa0->tunnel_src_addr.ip4.as_u32;
+ oh0->ip4.dst_address.as_u32 = sa0->tunnel_dst_addr.ip4.as_u32;
+
+ /* in tunnel mode send it back to FIB */
+ next0 = ESP_ENCRYPT_NEXT_IP4_INPUT;
+ vnet_buffer (o_b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ }
+ else if(sa0->is_tunnel && sa0->is_tunnel_ip6)
+ {
+ oh6_0->ip6.src_address.as_u64[0] = sa0->tunnel_src_addr.ip6.as_u64[0];
+ oh6_0->ip6.src_address.as_u64[1] = sa0->tunnel_src_addr.ip6.as_u64[1];
+ oh6_0->ip6.dst_address.as_u64[0] = sa0->tunnel_dst_addr.ip6.as_u64[0];
+ oh6_0->ip6.dst_address.as_u64[1] = sa0->tunnel_dst_addr.ip6.as_u64[1];
+
+ /* in tunnel mode send it back to FIB */
+ next0 = ESP_ENCRYPT_NEXT_IP6_INPUT;
+ vnet_buffer (o_b0)->sw_if_index[VLIB_TX] = (u32)~0;
+ }
+ else
+ {
+ next0 = ESP_ENCRYPT_NEXT_INTERFACE_OUTPUT;
+ vnet_buffer (o_b0)->sw_if_index[VLIB_TX] =
+ vnet_buffer (i_b0)->sw_if_index[VLIB_TX];
+ }
+
+ ASSERT(sa0->crypto_alg < IPSEC_CRYPTO_N_ALG);
+
+ if (PREDICT_TRUE(sa0->crypto_alg != IPSEC_CRYPTO_ALG_NONE)) {
+
+ const int BLOCK_SIZE = 16;
+ const int IV_SIZE = 16;
+ int blocks = 1 + (i_b0->current_length + 1) / BLOCK_SIZE;
+
+ /* pad packet in input buffer */
+ u8 pad_bytes = BLOCK_SIZE * blocks - 2 - i_b0->current_length;
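+ /* Worked example: a 52-byte payload gives blocks = 1 + 53/16 = 4,
+ so the padded length is 64 and pad_bytes = 64 - 2 - 52 = 10; the
+ final 2 bytes hold the esp_footer_t written below. */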
+ u8 i;
+ u8 * padding = vlib_buffer_get_current (i_b0) + i_b0->current_length;
+ i_b0->current_length = BLOCK_SIZE * blocks;
+ for (i = 0; i < pad_bytes; ++i)
+ {
+ padding[i] = i + 1;
+ }
+ f0 = vlib_buffer_get_current (i_b0) + i_b0->current_length - 2;
+ f0->pad_length = pad_bytes;
+ f0->next_header = next_hdr_type;
+
+ o_b0->current_length = ip_hdr_size + sizeof(esp_header_t) +
+ BLOCK_SIZE * blocks + IV_SIZE;
+
+ vnet_buffer (o_b0)->sw_if_index[VLIB_RX] =
+ vnet_buffer (i_b0)->sw_if_index[VLIB_RX];
+ o_b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+ u8 iv[16];
+ RAND_bytes(iv, sizeof(iv));
+
+ memcpy((u8 *) vlib_buffer_get_current (o_b0) + ip_hdr_size +
+ sizeof(esp_header_t), iv, IV_SIZE);
+
+ esp_encrypt_aes_cbc(sa0->crypto_alg,
+ (u8 *) vlib_buffer_get_current (i_b0),
+ (u8 *) vlib_buffer_get_current (o_b0) +
+ ip_hdr_size + sizeof(esp_header_t) + IV_SIZE,
+ BLOCK_SIZE * blocks,
+ sa0->crypto_key,
+ iv);
+ }
+
+ o_b0->current_length += hmac_calc(sa0->integ_alg, sa0->integ_key,
+ sa0->integ_key_len,
+ (u8 *) o_esp0,
+ o_b0->current_length - ip_hdr_size,
+ vlib_buffer_get_current (o_b0) +
+ o_b0->current_length,
+ sa0->use_esn,
+ sa0->seq_hi);
+
+
+ if (PREDICT_FALSE(is_ipv6))
+ {
+ oh6_0->ip6.payload_length = clib_host_to_net_u16 (
+ vlib_buffer_length_in_chain (vm, o_b0) - sizeof(ip6_header_t));
+ }
+ else
+ {
+ oh0->ip4.length = clib_host_to_net_u16 (
+ vlib_buffer_length_in_chain (vm, o_b0));
+ oh0->ip4.checksum = ip4_header_checksum (&oh0->ip4);
+ }
+
+trace:
+ if (PREDICT_FALSE(i_b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ if (o_b0) {
+ o_b0->flags |= VLIB_BUFFER_IS_TRACED;
+ o_b0->trace_index = i_b0->trace_index;
+ }
+ esp_encrypt_trace_t *tr = vlib_add_trace (vm, node, o_b0, sizeof (*tr));
+ tr->spi = sa0->spi;
+ tr->seq = sa0->seq - 1;
+ tr->crypto_alg = sa0->crypto_alg;
+ tr->integ_alg = sa0->integ_alg;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, o_bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, esp_encrypt_node.index,
+ ESP_ENCRYPT_ERROR_RX_PKTS,
+ from_frame->n_vectors);
+
+free_buffers_and_exit:
+ vlib_buffer_free (vm, recycle, vec_len(recycle));
+ vec_free(recycle);
+ return from_frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (esp_encrypt_node) = {
+ .function = esp_encrypt_node_fn,
+ .name = "esp-encrypt",
+ .vector_size = sizeof (u32),
+ .format_trace = format_esp_encrypt_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(esp_encrypt_error_strings),
+ .error_strings = esp_encrypt_error_strings,
+
+ .n_next_nodes = ESP_ENCRYPT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [ESP_ENCRYPT_NEXT_##s] = n,
+ foreach_esp_encrypt_next
+#undef _
+ },
+};
+
diff --git a/vnet/vnet/ipsec/ikev2.c b/vnet/vnet/ipsec/ikev2.c
new file mode 100644
index 00000000000..ab2277f5f90
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2.c
@@ -0,0 +1,2142 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/udp.h>
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/ikev2.h>
+#include <vnet/ipsec/ikev2_priv.h>
+
+static int ikev2_delete_tunnel_interface(vnet_main_t * vnm,
+ ikev2_sa_t *sa,
+ ikev2_child_sa_t * child);
+
+static void hexdump(u8 buffer[], int len)
+{
+#define HEXDUMP_LINE_LEN 16
+ int i;
+ char s[HEXDUMP_LINE_LEN+1];
+ bzero(s, HEXDUMP_LINE_LEN+1);
+
+ for(i=0; i < len; i++) {
+ if (!(i%HEXDUMP_LINE_LEN)) {
+ if (s[0])
+ printf("[%s]",s);
+ printf("\n%05x: ", i);
+ bzero(s, HEXDUMP_LINE_LEN);
+ }
+ s[i%HEXDUMP_LINE_LEN]=isprint(buffer[i])?buffer[i]:'.';
+ printf("%02x ", buffer[i]);
+ }
+ while(i++%HEXDUMP_LINE_LEN)
+ printf(" ");
+
+ printf("[%s]\n", s);
+}
+
+#define ikev2_set_state(sa, v) do { \
+ (sa)->state = v; \
+ clib_warning("sa state changed to " #v); \
+ } while(0)
+
+typedef struct {
+ u32 next_index;
+ u32 sw_if_index;
+} ikev2_trace_t;
+
+static u8 * format_ikev2_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ikev2_trace_t * t = va_arg (*args, ikev2_trace_t *);
+
+ s = format (s, "ikev2: sw_if_index %d, next index %d",
+ t->sw_if_index, t->next_index);
+ return s;
+}
+
+vlib_node_registration_t ikev2_node;
+
+#define foreach_ikev2_error \
+_(PROCESSED, "IKEv2 packets processed") \
+_(IKE_SA_INIT_RETRANSMIT, "IKE_SA_INIT retransmit ") \
+_(IKE_SA_INIT_IGNORE, "IKE_SA_INIT ignore (IKE SA already auth)") \
+_(IKE_REQ_RETRANSMIT, "IKE request retransmit") \
+_(IKE_REQ_IGNORE, "IKE request ignore (old msgid)") \
+_(NOT_IKEV2, "Non-IKEv2 packets received")
+
+typedef enum {
+#define _(sym,str) IKEV2_ERROR_##sym,
+ foreach_ikev2_error
+#undef _
+ IKEV2_N_ERROR,
+} ikev2_error_t;
+
+static char * ikev2_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ikev2_error
+#undef _
+};
+
+typedef enum {
+ IKEV2_NEXT_IP4_LOOKUP,
+ IKEV2_NEXT_ERROR_DROP,
+ IKEV2_N_NEXT,
+} ikev2_next_t;
+
+static ikev2_sa_transform_t *
+ikev2_find_transform_data(ikev2_sa_transform_t * t)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_sa_transform_t * td;
+
+ vec_foreach(td, km->supported_transforms)
+ {
+ if (td->type != t->type)
+ continue;
+
+ if (td->transform_id != t->transform_id)
+ continue;
+
+ if (td->type == IKEV2_TRANSFORM_TYPE_ENCR)
+ {
+ if (vec_len(t->attrs) != 4 || t->attrs[0] != 0x80 || t->attrs[1] != 14)
+ continue;
+
+ if (((t->attrs[2] << 8 | t->attrs[3]) / 8) != td->key_len)
+ continue;
+ }
+ return td;
+ }
+ return 0;
+}
+
+static ikev2_sa_proposal_t *
+ikev2_select_proposal(ikev2_sa_proposal_t *proposals, ikev2_protocol_id_t prot_id)
+{
+ ikev2_sa_proposal_t * rv = 0;
+ ikev2_sa_proposal_t * proposal;
+ ikev2_sa_transform_t * transform, * new_t;
+ u8 mandatory_bitmap, optional_bitmap;
+
+ if (prot_id == IKEV2_PROTOCOL_IKE)
+ {
+ mandatory_bitmap = (1 << IKEV2_TRANSFORM_TYPE_ENCR) |
+ (1 << IKEV2_TRANSFORM_TYPE_PRF) |
+ (1 << IKEV2_TRANSFORM_TYPE_INTEG) |
+ (1 << IKEV2_TRANSFORM_TYPE_DH);
+ optional_bitmap = mandatory_bitmap;
+ }
+ else if (prot_id == IKEV2_PROTOCOL_ESP)
+ {
+ mandatory_bitmap = (1 << IKEV2_TRANSFORM_TYPE_ENCR) |
+ (1 << IKEV2_TRANSFORM_TYPE_ESN);
+ optional_bitmap = mandatory_bitmap |
+ (1 << IKEV2_TRANSFORM_TYPE_INTEG) |
+ (1 << IKEV2_TRANSFORM_TYPE_DH);
+ }
+ else if (prot_id == IKEV2_PROTOCOL_AH)
+ {
+ mandatory_bitmap = (1 << IKEV2_TRANSFORM_TYPE_INTEG) |
+ (1 << IKEV2_TRANSFORM_TYPE_ESN);
+ optional_bitmap = mandatory_bitmap |
+ (1 << IKEV2_TRANSFORM_TYPE_DH);
+ }
+ else
+ return 0;
+
+ vec_add2(rv, proposal, 1);
+
+ vec_foreach(proposal, proposals)
+ {
+ u8 bitmap = 0;
+ if (proposal->protocol_id != prot_id)
+ continue;
+
+ vec_foreach(transform, proposal->transforms)
+ {
+ if ((1 << transform->type) & bitmap)
+ continue;
+
+ if (ikev2_find_transform_data(transform))
+ {
+ bitmap |= 1 << transform->type;
+ vec_add2(rv->transforms, new_t, 1);
+ memcpy(new_t, transform, sizeof(*new_t));
+ new_t->attrs = vec_dup(transform->attrs);
+ }
+ }
+
+ clib_warning("bitmap is %x mandatory is %x optional is %x",
+ bitmap, mandatory_bitmap, optional_bitmap);
+
+ if ((bitmap & mandatory_bitmap) == mandatory_bitmap &&
+ (bitmap & ~optional_bitmap) == 0)
+ {
+ rv->proposal_num = proposal->proposal_num;
+ rv->protocol_id = proposal->protocol_id;
+ RAND_bytes((u8 *) &rv->spi, sizeof(rv->spi));
+ goto done;
+ }
+ else
+ {
+ vec_free(rv->transforms);
+ }
+ }
+
+ vec_free(rv);
+done:
+ return rv;
+}
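+
+/* Selection sketch: for ESP, a peer proposal offering ENCR, INTEG and
+   ESN sets all three bits in `bitmap'; the mandatory ENCR|ESN bits are
+   present and INTEG is inside the optional mask, so the proposal is
+   accepted and a fresh responder SPI is generated with RAND_bytes. */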
+
+ikev2_sa_transform_t *
+ikev2_sa_get_td_for_type(ikev2_sa_proposal_t * p, ikev2_transform_type_t type)
+{
+ ikev2_sa_transform_t * t;
+
+ if (!p)
+ return 0;
+
+ vec_foreach(t, p->transforms)
+ {
+ if (t->type == type)
+ return ikev2_find_transform_data(t);
+ }
+ return 0;
+}
+
+ikev2_child_sa_t *
+ikev2_sa_get_child(ikev2_sa_t * sa, u32 spi, ikev2_protocol_id_t prot_id)
+{
+ ikev2_child_sa_t * c;
+ vec_foreach(c, sa->childs)
+ {
+ if (c->i_proposals[0].spi == spi && c->i_proposals[0].protocol_id == prot_id)
+ return c;
+ }
+
+ return 0;
+}
+
+void
+ikev2_sa_free_proposal_vector(ikev2_sa_proposal_t ** v)
+{
+ ikev2_sa_proposal_t * p;
+ ikev2_sa_transform_t * t;
+
+ if (!*v)
+ return;
+
+ vec_foreach(p, *v) {
+ vec_foreach(t, p->transforms) {
+ vec_free(t->attrs);
+ }
+ vec_free(p->transforms);
+ }
+ vec_free(*v);
+}
+
+static void
+ikev2_sa_free_all_child_sa(ikev2_child_sa_t ** childs)
+{
+ ikev2_child_sa_t * c;
+ vec_foreach(c, *childs)
+ {
+ ikev2_sa_free_proposal_vector(&c->r_proposals);
+ ikev2_sa_free_proposal_vector(&c->i_proposals);
+ vec_free(c->sk_ai);
+ vec_free(c->sk_ar);
+ vec_free(c->sk_ei);
+ vec_free(c->sk_er);
+ }
+
+ vec_free(*childs);
+}
+
+static void
+ikev2_sa_del_child_sa(ikev2_sa_t * sa, ikev2_child_sa_t * child)
+{
+ ikev2_sa_free_proposal_vector(&child->r_proposals);
+ ikev2_sa_free_proposal_vector(&child->i_proposals);
+ vec_free(child->sk_ai);
+ vec_free(child->sk_ar);
+ vec_free(child->sk_ei);
+ vec_free(child->sk_er);
+
+ vec_del1(sa->childs, child - sa->childs);
+}
+
+static void
+ikev2_sa_free_all_vec(ikev2_sa_t *sa)
+{
+ vec_free(sa->i_nonce);
+ vec_free(sa->i_dh_data);
+ vec_free(sa->dh_shared_key);
+
+ ikev2_sa_free_proposal_vector(&sa->r_proposals);
+ ikev2_sa_free_proposal_vector(&sa->i_proposals);
+
+ vec_free(sa->sk_d);
+ vec_free(sa->sk_ai);
+ vec_free(sa->sk_ar);
+ vec_free(sa->sk_ei);
+ vec_free(sa->sk_er);
+ vec_free(sa->sk_pi);
+ vec_free(sa->sk_pr);
+
+ vec_free(sa->i_id.data);
+ vec_free(sa->i_auth.data);
+ vec_free(sa->r_id.data);
+ vec_free(sa->r_auth.data);
+ if (sa->r_auth.key)
+ EVP_PKEY_free(sa->r_auth.key);
+
+ vec_free(sa->del);
+
+ ikev2_sa_free_all_child_sa(&sa->childs);
+}
+
+static void
+ikev2_delete_sa(ikev2_sa_t *sa)
+{
+ ikev2_main_t * km = &ikev2_main;
+ uword * p;
+
+ ikev2_sa_free_all_vec(sa);
+
+ p = hash_get(km->sa_by_rspi, sa->rspi);
+ if (p)
+ {
+ hash_unset(km->sa_by_rspi, sa->rspi);
+ pool_put(km->sas, sa);
+ }
+}
+
+static void
+ikev2_generate_sa_init_data(ikev2_sa_t *sa)
+{
+ ikev2_sa_transform_t * t = 0, * t2;
+ ikev2_main_t * km = &ikev2_main;
+
+ if (sa->dh_group == IKEV2_TRANSFORM_DH_TYPE_NONE)
+ {
+ return;
+ }
+
+ /* check if received DH group is on our list of supported groups */
+ vec_foreach(t2, km->supported_transforms)
+ {
+ if (t2->type == IKEV2_TRANSFORM_TYPE_DH &&
+ sa->dh_group == t2->dh_type)
+ {
+ t = t2;
+ break;
+ }
+ }
+
+ if (!t)
+ {
+ clib_warning("unknown dh data group %u (data len %u)", sa->dh_group,
+ vec_len(sa->i_dh_data));
+ sa->dh_group = IKEV2_TRANSFORM_DH_TYPE_NONE;
+ return;
+ }
+
+ /* generate rspi */
+ RAND_bytes((u8 *) &sa->rspi, 8);
+
+ /* generate nonce */
+ sa->r_nonce = vec_new(u8, IKEV2_NONCE_SIZE);
+ RAND_bytes((u8 *) sa->r_nonce, IKEV2_NONCE_SIZE);
+
+ /* generate dh keys */
+ ikev2_generate_dh(sa, t);
+}
+
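+/* IKE SA key derivation (RFC 7296, section 2.14):
+   SKEYSEED = prf(Ni | Nr, g^ir) and
+   {SK_d | SK_ai | SK_ar | SK_ei | SK_er | SK_pi | SK_pr}
+     = prf+(SKEYSEED, Ni | Nr | SPIi | SPIr),
+   with the individual keys carved off the prf+ output in that order. */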
+static void
+ikev2_calc_keys(ikev2_sa_t *sa)
+{
+ u8 * tmp;
+ /* calculate SKEYSEED = prf(Ni | Nr, g^ir) */
+ u8 * skeyseed = 0;
+ u8 * s = 0;
+ ikev2_sa_transform_t * tr_encr, * tr_prf, * tr_integ;
+ tr_encr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ tr_prf = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_PRF);
+ tr_integ = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+
+ vec_append(s, sa->i_nonce);
+ vec_append(s, sa->r_nonce);
+ skeyseed = ikev2_calc_prf(tr_prf, s, sa->dh_shared_key);
+
+ /* Calculate S = Ni | Nr | SPIi | SPIr*/
+ u64 * spi;
+ vec_add2(s, tmp, 2 * sizeof(*spi));
+ spi = (u64 *) tmp;
+ spi[0] = clib_host_to_net_u64(sa->ispi);
+ spi[1] = clib_host_to_net_u64(sa->rspi);
+
+ /* calculate PRFplus */
+ u8 * keymat;
+ int len = tr_prf->key_trunc + /* SK_d */
+ tr_integ->key_len * 2 + /* SK_ai, SK_ar */
+ tr_encr->key_len * 2 + /* SK_ei, SK_er */
+ tr_prf->key_len * 2 ; /* SK_pi, SK_pr */
+
+ keymat = ikev2_calc_prfplus(tr_prf, skeyseed, s, len);
+ vec_free(skeyseed);
+ vec_free(s);
+
+ int pos = 0;
+
+ /* SK_d */
+ sa->sk_d = vec_new(u8, tr_prf->key_trunc);
+ memcpy(sa->sk_d, keymat + pos, tr_prf->key_trunc);
+ pos += tr_prf->key_trunc;
+
+ /* SK_ai */
+ sa->sk_ai = vec_new(u8, tr_integ->key_len);
+ memcpy(sa->sk_ai, keymat + pos, tr_integ->key_len);
+ pos += tr_integ->key_len;
+
+ /* SK_ar */
+ sa->sk_ar = vec_new(u8, tr_integ->key_len);
+ memcpy(sa->sk_ar, keymat + pos, tr_integ->key_len);
+ pos += tr_integ->key_len;
+
+ /* SK_ei */
+ sa->sk_ei = vec_new(u8, tr_encr->key_len);
+ memcpy(sa->sk_ei, keymat + pos, tr_encr->key_len);
+ pos += tr_encr->key_len;
+
+ /* SK_er */
+ sa->sk_er = vec_new(u8, tr_encr->key_len);
+ memcpy(sa->sk_er, keymat + pos, tr_encr->key_len);
+ pos += tr_encr->key_len;
+
+ /* SK_pi */
+ sa->sk_pi = vec_new(u8, tr_prf->key_len);
+ memcpy(sa->sk_pi, keymat + pos, tr_prf->key_len);
+ pos += tr_prf->key_len;
+
+ /* SK_pr */
+ sa->sk_pr = vec_new(u8, tr_prf->key_len);
+ memcpy(sa->sk_pr, keymat + pos, tr_prf->key_len);
+ pos += tr_prf->key_len;
+
+ vec_free(keymat);
+}
+
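+/* Child SA key derivation (RFC 7296, section 2.17):
+   KEYMAT = prf+(SK_d, Ni | Nr); initiator-to-responder keys are taken
+   first, encryption before integrity: SK_ei, SK_ai, SK_er, SK_ar. */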
+static void
+ikev2_calc_child_keys(ikev2_sa_t *sa, ikev2_child_sa_t * child)
+{
+ u8 * s = 0;
+ ikev2_sa_transform_t * tr_prf, * ctr_encr, * ctr_integ;
+ tr_prf = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_PRF);
+ ctr_encr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ ctr_integ = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+
+ vec_append(s, sa->i_nonce);
+ vec_append(s, sa->r_nonce);
+ /* calculate PRFplus */
+ u8 * keymat;
+ int len = ctr_encr->key_len * 2 + ctr_integ->key_len * 2;
+
+ keymat = ikev2_calc_prfplus(tr_prf, sa->sk_d, s, len);
+ hexdump(keymat, vec_len(keymat));
+
+ int pos = 0;
+
+ /* SK_ei */
+ child->sk_ei = vec_new(u8, ctr_encr->key_len);
+ memcpy(child->sk_ei, keymat + pos, ctr_encr->key_len);
+ pos += ctr_encr->key_len;
+
+ /* SK_ai */
+ child->sk_ai = vec_new(u8, ctr_integ->key_len);
+ memcpy(child->sk_ai, keymat + pos, ctr_integ->key_len);
+ pos += ctr_integ->key_len;
+
+ /* SK_er */
+ child->sk_er = vec_new(u8, ctr_encr->key_len);
+ memcpy(child->sk_er, keymat + pos, ctr_encr->key_len);
+ pos += ctr_encr->key_len;
+
+ /* SK_ar */
+ child->sk_ar = vec_new(u8, ctr_integ->key_len);
+ memcpy(child->sk_ar, keymat + pos, ctr_integ->key_len);
+ pos += ctr_integ->key_len;
+
+ ASSERT(pos == len);
+
+ vec_free(keymat);
+}
+
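+/* Parse a cleartext IKE_SA_INIT request: record the initiator SPI,
+   proposals, KE data and nonce, and keep a copy of the raw packet,
+   which is needed later to compute the AUTH payload. */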
+static void
+ikev2_process_sa_init_req(vlib_main_t * vm, ikev2_sa_t *sa, ike_header_t * ike)
+{
+ int p = 0;
+ u32 len = clib_net_to_host_u32(ike->length);
+ u8 payload = ike->nextpayload;
+
+ clib_warning("ispi %lx rspi %lx nextpayload %x version %x "
+ "exchange %x flags %x msgid %x length %u",
+ clib_net_to_host_u64(ike->ispi),
+ clib_net_to_host_u64(ike->rspi),
+ payload, ike->version,
+ ike->exchange, ike->flags,
+ clib_net_to_host_u32(ike->msgid),
+ len);
+
+ sa->ispi = clib_net_to_host_u64(ike->ispi);
+
+ /* store whole IKE payload - needed for PSK auth */
+ vec_free(sa->last_sa_init_req_packet_data);
+ vec_add(sa->last_sa_init_req_packet_data, ike, len);
+
+ while (p < len && payload != IKEV2_PAYLOAD_NONE) {
+ ike_payload_header_t * ikep = (ike_payload_header_t *) &ike->payload[p];
+ u32 plen = clib_net_to_host_u16(ikep->length);
+
+ if (plen < sizeof(ike_payload_header_t))
+ return;
+
+ if (payload == IKEV2_PAYLOAD_SA)
+ {
+ ikev2_sa_free_proposal_vector(&sa->i_proposals);
+ sa->i_proposals = ikev2_parse_sa_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_KE)
+ {
+ ike_ke_payload_header_t * ke = (ike_ke_payload_header_t *) ikep;
+ sa->dh_group = clib_net_to_host_u16(ke->dh_group);
+ vec_free(sa->i_dh_data);
+ vec_add(sa->i_dh_data, ke->payload, plen - sizeof(*ke));
+ }
+ else if (payload == IKEV2_PAYLOAD_NONCE)
+ {
+ vec_free(sa->i_nonce);
+ vec_add(sa->i_nonce, ikep->payload, plen - sizeof(*ikep));
+ }
+ else if (payload == IKEV2_PAYLOAD_NOTIFY)
+ {
+ ikev2_notify_t * n = ikev2_parse_notify_payload(ikep);
+ vec_free(n);
+ }
+ else if (payload == IKEV2_PAYLOAD_VENDOR)
+ {
+ ikev2_parse_vendor_payload(ikep);
+ }
+ else
+ {
+ clib_warning("unknown payload %u flags %x length %u", payload, ikep->flags, plen);
+ if (ikep->flags & IKEV2_PAYLOAD_FLAG_CRITICAL) {
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ sa->unsupported_cp = payload;
+ return;
+ }
+ }
+
+ payload = ikep->nextpayload;
+ p += plen;
+ }
+
+ ikev2_set_state(sa, IKEV2_STATE_SA_INIT);
+}
+
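+/* Walk the unencrypted payloads up to the SK (Encrypted) payload,
+   verify the integrity checksum computed over the whole message except
+   the checksum itself, then return the decrypted SK payload contents. */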
+static u8 *
+ikev2_decrypt_sk_payload(ikev2_sa_t * sa, ike_header_t * ike, u8 * payload)
+{
+ int p = 0;
+ u8 last_payload = 0;
+ u8 * hmac = 0;
+ u32 len = clib_net_to_host_u32(ike->length);
+ ike_payload_header_t * ikep;
+ u32 plen;
+ ikev2_sa_transform_t * tr_integ;
+ tr_integ = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+
+ while (p < len &&
+ *payload != IKEV2_PAYLOAD_NONE && last_payload != IKEV2_PAYLOAD_SK)
+ {
+ ikep = (ike_payload_header_t *) &ike->payload[p];
+ plen = clib_net_to_host_u16(ikep->length);
+
+ if (plen < sizeof(*ikep))
+ return 0;
+
+ if (*payload == IKEV2_PAYLOAD_SK)
+ {
+ clib_warning("received IKEv2 payload SK, len %u", plen - 4);
+ last_payload = *payload;
+ }
+ else
+ {
+ clib_warning("unknown payload %u flags %x length %u", *payload, ikep->flags, plen);
+ if (ikep->flags & IKEV2_PAYLOAD_FLAG_CRITICAL)
+ {
+ sa->unsupported_cp = *payload;
+ return 0;
+ }
+ }
+
+ *payload = ikep->nextpayload;
+ p += plen;
+ }
+
+ if (last_payload != IKEV2_PAYLOAD_SK) {
+ clib_warning("Last payload must be SK");
+ return 0;
+ }
+
+ hmac = ikev2_calc_integr(tr_integ, sa->sk_ai, (u8 *) ike,
+ len - tr_integ->key_trunc);
+
+ plen = plen - sizeof(*ikep) - tr_integ->key_trunc;
+
+ if (memcmp(hmac, &ikep->payload[plen], tr_integ->key_trunc))
+ {
+ clib_warning("message integrity check failed");
+ vec_free(hmac);
+ return 0;
+ }
+ vec_free(hmac);
+
+ return ikev2_decrypt_data(sa, ikep->payload, plen);
+}
+
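+/* On an INITIAL_CONTACT notification, tear down any other IKE SA (and
+   its tunnels) that authenticated with the same initiator identity. */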
+static void
+ikev2_initial_contact_cleanup (ikev2_sa_t * sa)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_sa_t * tmp;
+ u32 i, * delete = 0;
+ ikev2_child_sa_t * c;
+
+ if (!sa->initial_contact)
+ return;
+
+ /* find old IKE SAs with the same authenticated identity */
+ pool_foreach (tmp, km->sas, ({
+ if (tmp->i_id.type != sa->i_id.type ||
+ vec_len(tmp->i_id.data) != vec_len(sa->i_id.data) ||
+ memcmp(sa->i_id.data, tmp->i_id.data, vec_len(sa->i_id.data)))
+ continue;
+
+ if (sa->rspi != tmp->rspi)
+ vec_add1(delete, tmp - km->sas);
+ }));
+
+ for (i = 0; i < vec_len(delete); i++)
+ {
+ tmp = pool_elt_at_index(km->sas, delete[i]);
+ vec_foreach(c, tmp->childs)
+ ikev2_delete_tunnel_interface(km->vnet_main, tmp, c);
+ ikev2_delete_sa(tmp);
+ }
+
+ vec_free(delete);
+ sa->initial_contact = 0;
+}
+
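+/* Decrypt the IKE_AUTH request and collect IDi, AUTH, and the proposal
+   and traffic selectors for the first child SA. */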
+static void
+ikev2_process_auth_req(vlib_main_t * vm, ikev2_sa_t *sa, ike_header_t * ike)
+{
+ ikev2_child_sa_t * first_child_sa;
+ int p = 0;
+ u32 len = clib_net_to_host_u32(ike->length);
+ u8 payload = ike->nextpayload;
+ u8 * plaintext = 0;
+
+ ike_payload_header_t * ikep;
+ u32 plen;
+
+ clib_warning("ispi %lx rspi %lx nextpayload %x version %x "
+ "exchange %x flags %x msgid %x length %u",
+ clib_net_to_host_u64(ike->ispi),
+ clib_net_to_host_u64(ike->rspi),
+ payload, ike->version,
+ ike->exchange, ike->flags,
+ clib_net_to_host_u32(ike->msgid),
+ len);
+
+ ikev2_calc_keys(sa);
+
+ plaintext = ikev2_decrypt_sk_payload(sa, ike, &payload);
+
+ if (!plaintext)
+ {
+ if (sa->unsupported_cp)
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ goto cleanup_and_exit;
+ }
+
+ /* create 1st child SA */
+ ikev2_sa_free_all_child_sa(&sa->childs);
+ vec_add2(sa->childs, first_child_sa, 1);
+
+
+ /* process encrypted payload */
+ p = 0;
+ while (p < vec_len(plaintext) && payload != IKEV2_PAYLOAD_NONE)
+ {
+ ikep = (ike_payload_header_t *) &plaintext[p];
+ plen = clib_net_to_host_u16(ikep->length);
+
+ if (plen < sizeof(ike_payload_header_t))
+ goto cleanup_and_exit;
+
+ if (payload == IKEV2_PAYLOAD_SA) /* 33 */
+ {
+ clib_warning("received payload SA, len %u", plen - sizeof(*ikep));
+ ikev2_sa_free_proposal_vector(&first_child_sa->i_proposals);
+ first_child_sa->i_proposals = ikev2_parse_sa_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_IDI) /* 35 */
+ {
+ ike_id_payload_header_t * id = (ike_id_payload_header_t *) ikep;
+
+ sa->i_id.type = id->id_type;
+ vec_free(sa->i_id.data);
+ vec_add(sa->i_id.data, id->payload, plen - sizeof(*id));
+
+ clib_warning("received payload IDi, len %u id_type %u",
+ plen - sizeof(*id), id->id_type);
+ }
+ else if (payload == IKEV2_PAYLOAD_AUTH) /* 39 */
+ {
+ ike_auth_payload_header_t * a = (ike_auth_payload_header_t *) ikep;
+
+ sa->i_auth.method = a->auth_method;
+ vec_free(sa->i_auth.data);
+ vec_add(sa->i_auth.data, a->payload, plen - sizeof(*a));
+
+ clib_warning("received payload AUTH, len %u auth_type %u",
+ plen - sizeof(*a), a->auth_method);
+ }
+ else if (payload == IKEV2_PAYLOAD_NOTIFY) /* 41 */
+ {
+ ikev2_notify_t * n = ikev2_parse_notify_payload(ikep);
+ if (n->msg_type == IKEV2_NOTIFY_MSG_INITIAL_CONTACT)
+ {
+ sa->initial_contact = 1;
+ }
+ vec_free(n);
+ }
+ else if (payload == IKEV2_PAYLOAD_VENDOR) /* 43 */
+ {
+ ikev2_parse_vendor_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_TSI) /* 44 */
+ {
+ clib_warning("received payload TSi, len %u", plen - sizeof(*ikep));
+
+ vec_free(first_child_sa->tsi);
+ first_child_sa->tsi = ikev2_parse_ts_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_TSR) /* 45 */
+ {
+ clib_warning("received payload TSr, len %u", plen - sizeof(*ikep));
+
+ vec_free(first_child_sa->tsr);
+ first_child_sa->tsr = ikev2_parse_ts_payload(ikep);
+ }
+ else
+ {
+ clib_warning("unknown payload %u flags %x length %u data %U",
+ payload, ikep->flags, plen - 4,
+ format_hex_bytes, ikep->payload, plen - 4);
+
+ if (ikep->flags & IKEV2_PAYLOAD_FLAG_CRITICAL) {
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ sa->unsupported_cp = payload;
+ return;
+ }
+ }
+
+ payload = ikep->nextpayload;
+ p += plen;
+ }
+
+cleanup_and_exit:
+ vec_free(plaintext);
+}
+
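+/* Decrypt an INFORMATIONAL request and record any delete payloads or
+   an AUTHENTICATION_FAILED notification. */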
+static void
+ikev2_process_informational_req(vlib_main_t * vm, ikev2_sa_t *sa, ike_header_t * ike)
+{
+ int p = 0;
+ u32 len = clib_net_to_host_u32(ike->length);
+ u8 payload = ike->nextpayload;
+ u8 * plaintext = 0;
+
+ ike_payload_header_t * ikep;
+ u32 plen;
+
+ clib_warning("ispi %lx rspi %lx nextpayload %x version %x "
+ "exchange %x flags %x msgid %x length %u",
+ clib_net_to_host_u64(ike->ispi),
+ clib_net_to_host_u64(ike->rspi),
+ payload, ike->version,
+ ike->exchange, ike->flags,
+ clib_net_to_host_u32(ike->msgid),
+ len);
+
+ plaintext = ikev2_decrypt_sk_payload(sa, ike, &payload);
+
+ if (!plaintext)
+ goto cleanup_and_exit;
+
+ /* process encrypted payload */
+ p = 0;
+ while (p < vec_len(plaintext) && payload != IKEV2_PAYLOAD_NONE)
+ {
+ ikep = (ike_payload_header_t *) &plaintext[p];
+ plen = clib_net_to_host_u16(ikep->length);
+
+ if (plen < sizeof(ike_payload_header_t))
+ goto cleanup_and_exit;
+
+ if (payload == IKEV2_PAYLOAD_NOTIFY) /* 41 */
+ {
+ ikev2_notify_t * n = ikev2_parse_notify_payload(ikep);
+ if (n->msg_type == IKEV2_NOTIFY_MSG_AUTHENTICATION_FAILED)
+ ikev2_set_state(sa, IKEV2_STATE_AUTH_FAILED);
+ vec_free(n);
+ }
+ else if (payload == IKEV2_PAYLOAD_DELETE) /* 42 */
+ {
+ sa->del = ikev2_parse_delete_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_VENDOR) /* 43 */
+ {
+ ikev2_parse_vendor_payload(ikep);
+ }
+ else
+ {
+ clib_warning("unknown payload %u flags %x length %u data %U",
+ payload, ikep->flags, plen - 4,
+ format_hex_bytes, ikep->payload, plen - 4);
+
+ if (ikep->flags & IKEV2_PAYLOAD_FLAG_CRITICAL) {
+ sa->unsupported_cp = payload;
+ return;
+ }
+ }
+
+ payload = ikep->nextpayload;
+ p += plen;
+ }
+
+cleanup_and_exit:
+ vec_free(plaintext);
+}
+
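+/* Decrypt a CREATE_CHILD_SA request.  Only rekeying of an existing
+   child SA (REKEY_SA notification) is handled: the request's proposal,
+   traffic selectors and nonce are queued on sa->rekey for the reply. */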
+static void
+ikev2_process_create_child_sa_req(vlib_main_t * vm, ikev2_sa_t *sa, ike_header_t * ike)
+{
+ int p = 0;
+ u32 len = clib_net_to_host_u32(ike->length);
+ u8 payload = ike->nextpayload;
+ u8 * plaintext = 0;
+ u8 rekeying = 0;
+ u8 i_nonce[IKEV2_NONCE_SIZE];
+
+ ike_payload_header_t * ikep;
+ u32 plen;
+ ikev2_notify_t * n = 0;
+ ikev2_ts_t * tsi = 0;
+ ikev2_ts_t * tsr = 0;
+ ikev2_sa_proposal_t * proposal = 0;
+ ikev2_child_sa_t * child_sa;
+
+ clib_warning("ispi %lx rspi %lx nextpayload %x version %x "
+ "exchange %x flags %x msgid %x length %u",
+ clib_net_to_host_u64(ike->ispi),
+ clib_net_to_host_u64(ike->rspi),
+ payload, ike->version,
+ ike->exchange, ike->flags,
+ clib_net_to_host_u32(ike->msgid),
+ len);
+
+ plaintext = ikev2_decrypt_sk_payload(sa, ike, &payload);
+
+ if (!plaintext)
+ goto cleanup_and_exit;
+
+ /* process encrypted payload */
+ p = 0;
+ while (p < vec_len(plaintext) && payload != IKEV2_PAYLOAD_NONE)
+ {
+ ikep = (ike_payload_header_t *) &plaintext[p];
+ plen = clib_net_to_host_u16(ikep->length);
+
+ if (plen < sizeof(ike_payload_header_t))
+ goto cleanup_and_exit;
+
+ if (payload == IKEV2_PAYLOAD_SA)
+ {
+ proposal = ikev2_parse_sa_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_NOTIFY)
+ {
+ n = ikev2_parse_notify_payload(ikep);
+ if (n->msg_type == IKEV2_NOTIFY_MSG_REKEY_SA)
+ {
+ rekeying = 1;
+ }
+ }
+ else if (payload == IKEV2_PAYLOAD_DELETE)
+ {
+ sa->del = ikev2_parse_delete_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_VENDOR)
+ {
+ ikev2_parse_vendor_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_NONCE)
+ {
+ memcpy(i_nonce, ikep->payload, plen - sizeof(*ikep));
+ }
+ else if (payload == IKEV2_PAYLOAD_TSI)
+ {
+ tsi = ikev2_parse_ts_payload(ikep);
+ }
+ else if (payload == IKEV2_PAYLOAD_TSR)
+ {
+ tsr = ikev2_parse_ts_payload(ikep);
+ }
+ else
+ {
+ clib_warning("unknown payload %u flags %x length %u data %U",
+ payload, ikep->flags, plen - 4,
+ format_hex_bytes, ikep->payload, plen - 4);
+
+ if (ikep->flags & IKEV2_PAYLOAD_FLAG_CRITICAL) {
+ sa->unsupported_cp = payload;
+ return;
+ }
+ }
+
+ payload = ikep->nextpayload;
+ p += plen;
+ }
+
+ if (rekeying)
+ {
+ ikev2_rekey_t * rekey;
+ child_sa = ikev2_sa_get_child(sa, n->spi, n->protocol_id);
+ if (!child_sa)
+ {
+ clib_warning("child SA spi %lx not found", n->spi);
+ goto cleanup_and_exit;
+ }
+ vec_add2(sa->rekey, rekey, 1);
+ rekey->protocol_id = n->protocol_id;
+ rekey->spi = n->spi;
+ rekey->i_proposal = proposal;
+ rekey->r_proposal = ikev2_select_proposal(proposal, IKEV2_PROTOCOL_ESP);
+ rekey->tsi = tsi;
+ rekey->tsr = tsr;
+ /* update Ni */
+ vec_free(sa->i_nonce);
+ vec_add(sa->i_nonce, i_nonce, IKEV2_NONCE_SIZE);
+ /* generate new Nr */
+ vec_free(sa->r_nonce);
+ sa->r_nonce = vec_new(u8, IKEV2_NONCE_SIZE);
+ RAND_bytes((u8 *) sa->r_nonce, IKEV2_NONCE_SIZE);
+ }
+
+cleanup_and_exit:
+ vec_free(plaintext);
+ vec_free(n);
+}
+
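+/* Build the octets covered by the AUTH payload (RFC 7296, section 2.15):
+   the party's own IKE_SA_INIT message, the other party's nonce, and
+   prf(SK_p, ID') over the party's identity payload. */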
+static u8 *
+ikev2_sa_generate_authmsg(ikev2_sa_t *sa, int is_responder)
+{
+ u8 * authmsg = 0;
+ u8 * data;
+ u8 * nonce;
+ ikev2_id_t * id;
+ u8 * key;
+ u8 * packet_data;
+ ikev2_sa_transform_t * tr_prf;
+
+ tr_prf = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_PRF);
+
+ if (is_responder)
+ {
+ id = &sa->r_id;
+ key = sa->sk_pr;
+ nonce = sa->i_nonce;
+ packet_data = sa->last_sa_init_res_packet_data;
+ }
+ else
+ {
+ id = &sa->i_id;
+ key = sa->sk_pi;
+ nonce = sa->r_nonce;
+ packet_data = sa->last_sa_init_req_packet_data;
+ }
+
+ data = vec_new(u8, 4);
+ data[0] = id->type;
+ vec_append(data, id->data);
+
+ u8 * id_hash = ikev2_calc_prf(tr_prf, key, data);
+ vec_append(authmsg, packet_data);
+ vec_append(authmsg, nonce);
+ vec_append(authmsg, id_hash);
+ vec_free(id_hash);
+ vec_free(data);
+
+ return authmsg;
+}
+
+static int
+ikev2_ts_cmp(ikev2_ts_t * ts1, ikev2_ts_t * ts2)
+{
+ if (ts1->ts_type == ts2->ts_type && ts1->protocol_id == ts2->protocol_id &&
+ ts1->start_port == ts2->start_port && ts1->end_port == ts2->end_port &&
+ ts1->start_addr.as_u32 == ts2->start_addr.as_u32 &&
+ ts1->end_addr.as_u32 == ts2->end_addr.as_u32)
+ return 1;
+
+ return 0;
+}
+
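+/* Narrow the first child SA's traffic selectors to the profile whose
+   remote identity matches the authenticated initiator; if no profile
+   matches both TSi and TSr, the SA is marked TS_UNACCEPTABLE. */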
+static void
+ikev2_sa_match_ts(ikev2_sa_t *sa)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_profile_t * p;
+ ikev2_ts_t * ts, * tsi = 0, * tsr = 0;
+
+ pool_foreach (p, km->profiles, ({
+
+ /* check id */
+ if (p->rem_id.type != sa->i_id.type ||
+ vec_len(p->rem_id.data) != vec_len(sa->i_id.data) ||
+ memcmp(p->rem_id.data, sa->i_id.data, vec_len(p->rem_id.data)))
+ continue;
+
+ vec_foreach(ts, sa->childs[0].tsi)
+ {
+ if (ikev2_ts_cmp(&p->rem_ts, ts))
+ {
+ tsi = vec_dup(ts);
+ break;
+ }
+ }
+
+ vec_foreach(ts, sa->childs[0].tsr)
+ {
+ if (ikev2_ts_cmp(&p->loc_ts, ts))
+ {
+ tsr = vec_dup(ts);
+ break;
+ }
+ }
+
+ break;
+ }));
+
+ if (tsi && tsr)
+ {
+ vec_free(sa->childs[0].tsi);
+ vec_free(sa->childs[0].tsr);
+ sa->childs[0].tsi = tsi;
+ sa->childs[0].tsr = tsr;
+ }
+ else
+ {
+ vec_free(tsi);
+ vec_free(tsr);
+ ikev2_set_state(sa, IKEV2_STATE_TS_UNACCEPTABLE);
+ }
+}
+
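+/* Authenticate the initiator against the configured profiles.  For
+   shared-key authentication, AUTH = prf(prf(Shared Secret, "Key Pad
+   for IKEv2"), <SignedOctets>); for RSA, the signature over
+   <SignedOctets> is verified with the profile's certificate.  On
+   success the responder's AUTH data is generated and transforms for
+   the first child SA are selected. */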
+static void
+ikev2_sa_auth(ikev2_sa_t *sa)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_profile_t * p, * sel_p = 0;
+ u8 * authmsg, * key_pad, * psk = 0, * auth = 0;
+ ikev2_sa_transform_t * tr_prf;
+
+ tr_prf = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_PRF);
+
+ /* only shared key and rsa signature */
+ if (!(sa->i_auth.method == IKEV2_AUTH_METHOD_SHARED_KEY_MIC ||
+ sa->i_auth.method == IKEV2_AUTH_METHOD_RSA_SIG))
+ {
+ clib_warning("unsupported authentication method %u", sa->i_auth.method);
+ ikev2_set_state(sa, IKEV2_STATE_AUTH_FAILED);
+ return;
+ }
+
+ key_pad = format(0, "%s", IKEV2_KEY_PAD);
+ authmsg = ikev2_sa_generate_authmsg(sa, 0);
+
+ pool_foreach (p, km->profiles, ({
+
+ /* check id */
+ if (p->rem_id.type != sa->i_id.type ||
+ vec_len(p->rem_id.data) != vec_len(sa->i_id.data) ||
+ memcmp(p->rem_id.data, sa->i_id.data, vec_len(p->rem_id.data)))
+ continue;
+
+ if (sa->i_auth.method == IKEV2_AUTH_METHOD_SHARED_KEY_MIC)
+ {
+ if (!p->auth.data ||
+ p->auth.method != IKEV2_AUTH_METHOD_SHARED_KEY_MIC)
+ continue;
+
+ psk = ikev2_calc_prf(tr_prf, p->auth.data, key_pad);
+ auth = ikev2_calc_prf(tr_prf, psk, authmsg);
+
+ if (!memcmp(auth, sa->i_auth.data, vec_len(sa->i_auth.data)))
+ {
+ ikev2_set_state(sa, IKEV2_STATE_AUTHENTICATED);
+ vec_free(auth);
+ sel_p = p;
+ break;
+ }
+
+ }
+ else if (sa->i_auth.method == IKEV2_AUTH_METHOD_RSA_SIG)
+ {
+ if (p->auth.method != IKEV2_AUTH_METHOD_RSA_SIG)
+ continue;
+
+ if (ikev2_verify_sign(p->auth.key, sa->i_auth.data, authmsg) == 1)
+ {
+ ikev2_set_state(sa, IKEV2_STATE_AUTHENTICATED);
+ sel_p = p;
+ break;
+ }
+ }
+
+ vec_free(auth);
+ vec_free(psk);
+ }));
+
+ vec_free(authmsg);
+
+ if (sa->state == IKEV2_STATE_AUTHENTICATED)
+ {
+ vec_free(sa->r_id.data);
+ sa->r_id.data = vec_dup(sel_p->loc_id.data);
+ sa->r_id.type = sel_p->loc_id.type;
+
+ /* generate our auth data */
+ authmsg = ikev2_sa_generate_authmsg(sa, 1);
+ if (sel_p->auth.method == IKEV2_AUTH_METHOD_SHARED_KEY_MIC)
+ {
+ sa->r_auth.data = ikev2_calc_prf(tr_prf, psk, authmsg);
+ sa->r_auth.method = IKEV2_AUTH_METHOD_SHARED_KEY_MIC;
+ }
+ else if (sel_p->auth.method == IKEV2_AUTH_METHOD_RSA_SIG)
+ {
+ sa->r_auth.data = ikev2_calc_sign(km->pkey, authmsg);
+ sa->r_auth.method = IKEV2_AUTH_METHOD_RSA_SIG;
+ }
+ vec_free(authmsg);
+
+ /* select transforms for 1st child sa */
+ ikev2_sa_free_proposal_vector(&sa->childs[0].r_proposals);
+ sa->childs[0].r_proposals = ikev2_select_proposal(sa->childs[0].i_proposals,
+ IKEV2_PROTOCOL_ESP);
+ }
+ else
+ {
+ ikev2_set_state(sa, IKEV2_STATE_AUTH_FAILED);
+ }
+ vec_free(psk);
+ vec_free(key_pad);
+}
+
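+/* Instantiate an ipsec tunnel interface from a negotiated child SA.
+   Only AES-CBC (128/192/256-bit) encryption with HMAC-SHA1-96
+   integrity is mapped; anything else results in NO_PROPOSAL_CHOSEN. */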
+static int
+ikev2_create_tunnel_interface(vnet_main_t * vnm, ikev2_sa_t *sa, ikev2_child_sa_t * child)
+{
+ ipsec_add_del_tunnel_args_t a;
+ ikev2_sa_transform_t * tr;
+ u32 hw_if_index;
+ u8 encr_type = 0;
+
+ if (!child->r_proposals)
+ {
+ ikev2_set_state(sa, IKEV2_STATE_NO_PROPOSAL_CHOSEN);
+ return 1;
+ }
+
+ a.is_add = 1;
+ a.local_ip.as_u32 = sa->raddr.as_u32;
+ a.remote_ip.as_u32 = sa->iaddr.as_u32;
+ a.local_spi = child->i_proposals[0].spi;
+ a.remote_spi = child->r_proposals[0].spi;
+ a.anti_replay = 1;
+
+ tr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_ESN);
+ if (tr)
+ a.esn = tr->esn_type;
+ else
+ a.esn = 0;
+
+ tr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ if (tr)
+ {
+ if (tr->encr_type == IKEV2_TRANSFORM_ENCR_TYPE_AES_CBC && tr->key_len)
+ {
+ switch (tr->key_len)
+ {
+ case 16:
+ encr_type = IPSEC_CRYPTO_ALG_AES_CBC_128;
+ break;
+ case 24:
+ encr_type = IPSEC_CRYPTO_ALG_AES_CBC_192;
+ break;
+ case 32:
+ encr_type = IPSEC_CRYPTO_ALG_AES_CBC_256;
+ break;
+ default:
+ ikev2_set_state(sa, IKEV2_STATE_NO_PROPOSAL_CHOSEN);
+ return 1;
+ break;
+ }
+ }
+ else
+ {
+ ikev2_set_state(sa, IKEV2_STATE_NO_PROPOSAL_CHOSEN);
+ return 1;
+ }
+ }
+ else
+ {
+ ikev2_set_state(sa, IKEV2_STATE_NO_PROPOSAL_CHOSEN);
+ return 1;
+ }
+
+ tr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+ if (tr)
+ {
+ if (tr->integ_type != IKEV2_TRANSFORM_INTEG_TYPE_AUTH_HMAC_SHA1_96)
+ {
+ ikev2_set_state(sa, IKEV2_STATE_NO_PROPOSAL_CHOSEN);
+ return 1;
+ }
+ }
+ else
+ {
+ ikev2_set_state(sa, IKEV2_STATE_NO_PROPOSAL_CHOSEN);
+ return 1;
+ }
+
+ hw_if_index = ipsec_add_del_tunnel_if(vnm, &a);
+ if (hw_if_index == VNET_API_ERROR_INVALID_VALUE)
+ {
+ clib_warning("create tunnel interface failed remote-ip %U remote-spi %u",
+ format_ip4_address, &sa->raddr, child->r_proposals[0].spi);
+ ikev2_set_state(sa, IKEV2_STATE_DELETED);
+ return hw_if_index;
+ }
+
+ ikev2_calc_child_keys(sa, child);
+
+ ipsec_set_interface_key(vnm, hw_if_index,
+ IPSEC_IF_SET_KEY_TYPE_LOCAL_CRYPTO,
+ encr_type,
+ child->sk_er);
+
+ ipsec_set_interface_key(vnm, hw_if_index,
+ IPSEC_IF_SET_KEY_TYPE_REMOTE_CRYPTO,
+ encr_type,
+ child->sk_ei);
+
+ ipsec_set_interface_key(vnm, hw_if_index,
+ IPSEC_IF_SET_KEY_TYPE_LOCAL_INTEG,
+ IPSEC_INTEG_ALG_SHA1_96,
+ child->sk_ar);
+
+ ipsec_set_interface_key(vnm, hw_if_index,
+ IPSEC_IF_SET_KEY_TYPE_REMOTE_INTEG,
+ IPSEC_INTEG_ALG_SHA1_96,
+ child->sk_ai);
+
+ return 0;
+}
+
+static int
+ikev2_delete_tunnel_interface(vnet_main_t * vnm, ikev2_sa_t *sa, ikev2_child_sa_t * child)
+{
+ ipsec_add_del_tunnel_args_t a;
+
+ if (!vec_len(child->r_proposals))
+ return 0;
+
+ a.is_add = 0;
+ a.local_ip.as_u32 = sa->raddr.as_u32;
+ a.remote_ip.as_u32 = sa->iaddr.as_u32;
+ a.local_spi = child->i_proposals[0].spi;
+ a.remote_spi = child->r_proposals[0].spi;
+
+ return ipsec_add_del_tunnel_if(vnm, &a);
+}
+
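+/* Build the response for the current exchange as a payload chain.
+   IKE_SA_INIT responses go out in the clear and are cached for PSK
+   authentication; all other exchanges are padded, encrypted into an
+   SK payload and terminated with an integrity checksum over the
+   packet. */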
+static u32
+ikev2_generate_resp(ikev2_sa_t *sa, ike_header_t * ike)
+{
+ v8 * integ = 0;
+ ike_payload_header_t * ph;
+ u16 plen;
+ u32 tlen = 0;
+
+ ikev2_sa_transform_t * tr_encr, *tr_integ;
+ tr_encr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ tr_integ = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+
+ ikev2_payload_chain_t * chain = 0;
+ ikev2_payload_new_chain(chain);
+
+ if (ike->exchange == IKEV2_EXCHANGE_SA_INIT)
+ {
+ if (sa->r_proposals == 0)
+ {
+ ikev2_payload_add_notify(chain, IKEV2_NOTIFY_MSG_NO_PROPOSAL_CHOSEN, 0);
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ }
+ else if (sa->dh_group == IKEV2_TRANSFORM_DH_TYPE_NONE)
+ {
+ u8 * data = vec_new(u8, 2);
+ ikev2_sa_transform_t * tr_dh;
+ tr_dh = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_DH);
+ ASSERT(tr_dh && tr_dh->dh_type);
+
+ data[0] = (tr_dh->dh_type >> 8) & 0xff;
+ data[1] = (tr_dh->dh_type) & 0xff;
+
+ ikev2_payload_add_notify(chain, IKEV2_NOTIFY_MSG_INVALID_KE_PAYLOAD, data);
+ vec_free(data);
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ }
+ else if (sa->state == IKEV2_STATE_NOTIFY_AND_DELETE)
+ {
+ u8 * data = vec_new(u8, 1);
+
+ data[0] = sa->unsupported_cp;
+ ikev2_payload_add_notify(chain,
+ IKEV2_NOTIFY_MSG_UNSUPPORTED_CRITICAL_PAYLOAD,
+ data);
+ vec_free(data);
+ }
+ else
+ {
+ ike->rspi = clib_host_to_net_u64(sa->rspi);
+ ikev2_payload_add_sa(chain, sa->r_proposals);
+ ikev2_payload_add_ke(chain, sa->dh_group, sa->r_dh_data);
+ ikev2_payload_add_nonce(chain, sa->r_nonce);
+ }
+ }
+ else if (ike->exchange == IKEV2_EXCHANGE_IKE_AUTH)
+ {
+ if (sa->state == IKEV2_STATE_AUTHENTICATED)
+ {
+ ikev2_payload_add_id(chain, &sa->r_id, IKEV2_PAYLOAD_IDR);
+ ikev2_payload_add_auth(chain, &sa->r_auth);
+ ikev2_payload_add_sa(chain, sa->childs[0].r_proposals);
+ ikev2_payload_add_ts(chain, sa->childs[0].tsi, IKEV2_PAYLOAD_TSI);
+ ikev2_payload_add_ts(chain, sa->childs[0].tsr, IKEV2_PAYLOAD_TSR);
+ }
+ else if (sa->state == IKEV2_STATE_AUTH_FAILED)
+ {
+ ikev2_payload_add_notify(chain, IKEV2_NOTIFY_MSG_AUTHENTICATION_FAILED, 0);
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ }
+ else if (sa->state == IKEV2_STATE_TS_UNACCEPTABLE)
+ {
+ ikev2_payload_add_notify(chain, IKEV2_NOTIFY_MSG_TS_UNACCEPTABLE, 0);
+ ikev2_payload_add_id(chain, &sa->r_id, IKEV2_PAYLOAD_IDR);
+ ikev2_payload_add_auth(chain, &sa->r_auth);
+ }
+ else if (sa->state == IKEV2_STATE_NO_PROPOSAL_CHOSEN)
+ {
+ ikev2_payload_add_notify(chain, IKEV2_NOTIFY_MSG_NO_PROPOSAL_CHOSEN, 0);
+ ikev2_payload_add_id(chain, &sa->r_id, IKEV2_PAYLOAD_IDR);
+ ikev2_payload_add_auth(chain, &sa->r_auth);
+ ikev2_payload_add_ts(chain, sa->childs[0].tsi, IKEV2_PAYLOAD_TSI);
+ ikev2_payload_add_ts(chain, sa->childs[0].tsr, IKEV2_PAYLOAD_TSR);
+ }
+ else if (sa->state == IKEV2_STATE_NOTIFY_AND_DELETE)
+ {
+ u8 * data = vec_new(u8, 1);
+
+ data[0] = sa->unsupported_cp;
+ ikev2_payload_add_notify(chain,
+ IKEV2_NOTIFY_MSG_UNSUPPORTED_CRITICAL_PAYLOAD,
+ data);
+ vec_free(data);
+ }
+ else
+ {
+ ikev2_set_state(sa, IKEV2_STATE_DELETED);
+ goto done;
+ }
+ }
+ else if (ike->exchange == IKEV2_EXCHANGE_INFORMATIONAL)
+ {
+ /* if pending delete */
+ if (sa->del)
+ {
+ /* The response to a request that deletes the IKE SA is an empty
+ INFORMATIONAL response. */
+ if (sa->del[0].protocol_id == IKEV2_PROTOCOL_IKE)
+ {
+ ikev2_set_state(sa, IKEV2_STATE_NOTIFY_AND_DELETE);
+ }
+ /* The response to a request that deletes ESP or AH SAs will contain
+ delete payloads for the paired SAs going in the other direction. */
+ else
+ {
+ ikev2_payload_add_delete(chain, sa->del);
+ }
+ vec_free(sa->del);
+ sa->del = 0;
+ }
+ /* received N(AUTHENTICATION_FAILED) */
+ else if (sa->state == IKEV2_STATE_AUTH_FAILED)
+ {
+ ikev2_set_state(sa, IKEV2_STATE_DELETED);
+ goto done;
+ }
+ /* received unsupported critical payload */
+ else if (sa->unsupported_cp)
+ {
+ u8 * data = vec_new(u8, 1);
+
+ data[0] = sa->unsupported_cp;
+ ikev2_payload_add_notify(chain,
+ IKEV2_NOTIFY_MSG_UNSUPPORTED_CRITICAL_PAYLOAD,
+ data);
+ vec_free(data);
+ sa->unsupported_cp = 0;
+ }
+ /* else send empty response */
+ }
+ else if (ike->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA)
+ {
+ if (sa->rekey)
+ {
+ ikev2_payload_add_sa(chain, sa->rekey[0].r_proposal);
+ ikev2_payload_add_nonce(chain, sa->r_nonce);
+ ikev2_payload_add_ts(chain, sa->rekey[0].tsi, IKEV2_PAYLOAD_TSI);
+ ikev2_payload_add_ts(chain, sa->rekey[0].tsr, IKEV2_PAYLOAD_TSR);
+ vec_del1(sa->rekey, 0);
+ }
+ else if (sa->unsupported_cp)
+ {
+ u8 * data = vec_new(u8, 1);
+
+ data[0] = sa->unsupported_cp;
+ ikev2_payload_add_notify(chain,
+ IKEV2_NOTIFY_MSG_UNSUPPORTED_CRITICAL_PAYLOAD,
+ data);
+ vec_free(data);
+ sa->unsupported_cp = 0;
+ }
+ else
+ {
+ ikev2_payload_add_notify(chain, IKEV2_NOTIFY_MSG_NO_ADDITIONAL_SAS, 0);
+ }
+ }
+
+ /* IKEv2 header */
+ ike->version = IKE_VERSION_2;
+ ike->flags = IKEV2_HDR_FLAG_RESPONSE;
+ ike->nextpayload = IKEV2_PAYLOAD_SK;
+ tlen = sizeof(*ike);
+
+
+ if (ike->exchange == IKEV2_EXCHANGE_SA_INIT)
+ {
+ tlen += vec_len(chain->data);
+ ike->nextpayload = chain->first_payload_type;
+ ike->length = clib_host_to_net_u32(tlen);
+ memcpy(ike->payload, chain->data, vec_len(chain->data));
+
+ /* store whole IKE payload - needed for PSK auth */
+ vec_free(sa->last_sa_init_res_packet_data);
+ vec_add(sa->last_sa_init_res_packet_data, ike, tlen);
+ }
+ else
+ {
+
+ ikev2_payload_chain_add_padding(chain, tr_encr->block_size);
+
+ /* SK payload */
+ plen = sizeof(*ph);
+ ph = (ike_payload_header_t *) &ike->payload[0];
+ ph->nextpayload = chain->first_payload_type;
+ ph->flags = 0;
+ int enc_len = ikev2_encrypt_data(sa, chain->data, ph->payload);
+ plen += enc_len;
+
+ /* add space for hmac */
+ plen += tr_integ->key_trunc;
+ tlen += plen;
+
+ /* payload and total length */
+ ph->length = clib_host_to_net_u16(plen);
+ ike->length = clib_host_to_net_u32(tlen);
+
+ /* calc integrity data for whole packet except hash itself */
+ integ = ikev2_calc_integr(tr_integ, sa->sk_ar, (u8 *) ike,
+ tlen - tr_integ->key_trunc);
+
+ memcpy(ike->payload + tlen - tr_integ->key_trunc - sizeof(*ike),
+ integ, tr_integ->key_trunc);
+
+ /* store whole IKE payload - needed for retransmit */
+ vec_free(sa->last_res_packet_data);
+ vec_add(sa->last_res_packet_data, ike, tlen);
+ }
+
+done:
+ ikev2_payload_destroy_chain (chain);
+ vec_free(integ);
+ return tlen;
+}
+
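+/* No SA exists yet for an IKE_SA_INIT request (responder SPI is zero),
+   so retransmits are detected by initiator SPI, addresses and nonce.
+   Returns 1 and rewrites the packet with the cached response for a
+   retransmit, -1 for a request to ignore, 0 for a new request. */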
+static int
+ikev2_retransmit_sa_init (ike_header_t * ike,
+ ip4_address_t iaddr,
+ ip4_address_t raddr)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_sa_t * sa;
+
+ pool_foreach (sa, km->sas, ({
+ if (sa->ispi == clib_net_to_host_u64(ike->ispi) &&
+ sa->iaddr.as_u32 == iaddr.as_u32 &&
+ sa->raddr.as_u32 == raddr.as_u32)
+ {
+ int p = 0;
+ u32 len = clib_net_to_host_u32(ike->length);
+ u8 payload = ike->nextpayload;
+
+ while (p < len && payload != IKEV2_PAYLOAD_NONE) {
+ ike_payload_header_t * ikep = (ike_payload_header_t *) &ike->payload[p];
+ u32 plen = clib_net_to_host_u16(ikep->length);
+
+ if (plen < sizeof(ike_payload_header_t))
+ return -1;
+
+ if (payload == IKEV2_PAYLOAD_NONCE)
+ {
+ if (!memcmp(sa->i_nonce, ikep->payload, plen - sizeof(*ikep)))
+ {
+ /* req is retransmit */
+ if (sa->state == IKEV2_STATE_SA_INIT)
+ {
+ ike_header_t * tmp;
+ tmp = (ike_header_t*)sa->last_sa_init_res_packet_data;
+ ike->ispi = tmp->ispi;
+ ike->rspi = tmp->rspi;
+ ike->nextpayload = tmp->nextpayload;
+ ike->version = tmp->version;
+ ike->exchange = tmp->exchange;
+ ike->flags = tmp->flags;
+ ike->msgid = tmp->msgid;
+ ike->length = tmp->length;
+ memcpy(ike->payload, tmp->payload,
+ clib_net_to_host_u32(tmp->length) - sizeof(*ike));
+ clib_warning("IKE_SA_INIT retransmit from %U to %U",
+ format_ip4_address, &raddr,
+ format_ip4_address, &iaddr);
+ return 1;
+ }
+ /* else ignore req */
+ else
+ {
+ clib_warning("IKE_SA_INIT ignore from %U to %U",
+ format_ip4_address, &raddr,
+ format_ip4_address, &iaddr);
+ return -1;
+ }
+ }
+ }
+ payload = ikep->nextpayload;
+ p += plen;
+ }
+ }
+ }));
+
+ /* req is not retransmit */
+ return 0;
+}
+
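+/* Per-SA message-id window of size 1: a higher msgid is a new request,
+   an equal one is answered with the cached response, a lower one is
+   ignored. */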
+static int
+ikev2_retransmit_resp (ikev2_sa_t * sa, ike_header_t * ike)
+{
+ u32 msg_id = clib_net_to_host_u32(ike->msgid);
+
+ /* new req */
+ if (msg_id > sa->last_msg_id)
+ {
+ sa->last_msg_id = msg_id;
+ return 0;
+ }
+ /* retransmitted req */
+ else if (msg_id == sa->last_msg_id)
+ {
+ ike_header_t * tmp;
+ tmp = (ike_header_t*)sa->last_res_packet_data;
+ ike->ispi = tmp->ispi;
+ ike->rspi = tmp->rspi;
+ ike->nextpayload = tmp->nextpayload;
+ ike->version = tmp->version;
+ ike->exchange = tmp->exchange;
+ ike->flags = tmp->flags;
+ ike->msgid = tmp->msgid;
+ ike->length = tmp->length;
+ memcpy(ike->payload, tmp->payload,
+ clib_net_to_host_u32(tmp->length) - sizeof(*ike));
+ clib_warning("IKE msgid %u retransmit from %U to %U",
+ msg_id,
+ format_ip4_address, &sa->raddr,
+ format_ip4_address, &sa->iaddr);
+ return 1;
+ }
+ /* old req ignore */
+ else
+ {
+ clib_warning("IKE msgid %u req ignore from %U to %U",
+ msg_id,
+ format_ip4_address, &sa->raddr,
+ format_ip4_address, &sa->iaddr);
+ return -1;
+ }
+}
+
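+/* Main IKEv2 graph node: dispatches each packet by exchange type,
+   rewrites the request buffer in place with the generated response and
+   forwards it to ip4-lookup; SAs left in DELETED or NOTIFY_AND_DELETE
+   state are reclaimed before the buffer is enqueued. */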
+static uword
+ikev2_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ ikev2_next_t next_index;
+ ikev2_main_t * km = &ikev2_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = IKEV2_NEXT_ERROR_DROP;
+ u32 sw_if_index0;
+ ip4_header_t * ip40;
+ udp_header_t * udp0;
+ ike_header_t * ike0;
+ ikev2_sa_t * sa0 = 0;
+ int len = 0;
+ int r;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ ike0 = vlib_buffer_get_current (b0);
+ vlib_buffer_advance(b0, - sizeof(*udp0));
+ udp0 = vlib_buffer_get_current (b0);
+ vlib_buffer_advance(b0, - sizeof(*ip40));
+ ip40 = vlib_buffer_get_current (b0);
+
+ if (ike0->version != IKE_VERSION_2)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_NOT_IKEV2, 1);
+ goto dispatch0;
+ }
+
+ if (ike0->exchange == IKEV2_EXCHANGE_SA_INIT)
+ {
+ ikev2_sa_t sa; /* temporary store for SA */
+ sa0 = &sa;
+ memset (sa0, 0, sizeof (*sa0));
+
+ if (ike0->rspi == 0)
+ {
+ sa0->raddr.as_u32 = ip40->dst_address.as_u32;
+ sa0->iaddr.as_u32 = ip40->src_address.as_u32;
+
+ r = ikev2_retransmit_sa_init(ike0, sa0->iaddr, sa0->raddr);
+ if (r == 1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_SA_INIT_RETRANSMIT,
+ 1);
+ len = clib_net_to_host_u32(ike0->length);
+ goto dispatch0;
+ }
+ else if (r == -1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_SA_INIT_IGNORE,
+ 1);
+ goto dispatch0;
+ }
+
+ ikev2_process_sa_init_req(vm, sa0, ike0);
+
+ if (sa0->state == IKEV2_STATE_SA_INIT)
+ {
+ ikev2_sa_free_proposal_vector(&sa0->r_proposals);
+ sa0->r_proposals = ikev2_select_proposal(sa0->i_proposals,
+ IKEV2_PROTOCOL_IKE);
+ ikev2_generate_sa_init_data(sa0);
+ }
+
+ if (sa0->state == IKEV2_STATE_SA_INIT ||
+ sa0->state == IKEV2_STATE_NOTIFY_AND_DELETE)
+ {
+ len = ikev2_generate_resp(sa0, ike0);
+ }
+
+ if (sa0->state == IKEV2_STATE_SA_INIT)
+ {
+ /* add SA to the pool */
+ pool_get (km->sas, sa0);
+ memcpy(sa0, &sa, sizeof(*sa0));
+ hash_set (km->sa_by_rspi, sa0->rspi, sa0 - km->sas);
+ }
+ else
+ {
+ ikev2_sa_free_all_vec(sa0);
+ }
+ }
+ }
+ else if (ike0->exchange == IKEV2_EXCHANGE_IKE_AUTH)
+ {
+ uword * p;
+ p = hash_get(km->sa_by_rspi, clib_net_to_host_u64(ike0->rspi));
+ if (p)
+ {
+ sa0 = pool_elt_at_index (km->sas, p[0]);
+
+ r = ikev2_retransmit_resp(sa0, ike0);
+ if (r == 1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_REQ_RETRANSMIT,
+ 1);
+ len = clib_net_to_host_u32(ike0->length);
+ goto dispatch0;
+ }
+ else if (r == -1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_REQ_IGNORE,
+ 1);
+ goto dispatch0;
+ }
+
+ ikev2_process_auth_req(vm, sa0, ike0);
+ ikev2_sa_auth(sa0);
+ if (sa0->state == IKEV2_STATE_AUTHENTICATED)
+ {
+ ikev2_initial_contact_cleanup(sa0);
+ ikev2_sa_match_ts(sa0);
+ if (sa0->state != IKEV2_STATE_TS_UNACCEPTABLE)
+ ikev2_create_tunnel_interface(km->vnet_main, sa0,
+ &sa0->childs[0]);
+ }
+ len = ikev2_generate_resp(sa0, ike0);
+ }
+ }
+ else if (ike0->exchange == IKEV2_EXCHANGE_INFORMATIONAL)
+ {
+ uword * p;
+ p = hash_get(km->sa_by_rspi, clib_net_to_host_u64(ike0->rspi));
+ if (p)
+ {
+ sa0 = pool_elt_at_index (km->sas, p[0]);
+
+ r = ikev2_retransmit_resp(sa0, ike0);
+ if (r == 1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_REQ_RETRANSMIT,
+ 1);
+ len = clib_net_to_host_u32(ike0->length);
+ goto dispatch0;
+ }
+ else if (r == -1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_REQ_IGNORE,
+ 1);
+ goto dispatch0;
+ }
+
+ ikev2_process_informational_req(vm, sa0, ike0);
+ if (sa0->del)
+ {
+ if (sa0->del[0].protocol_id != IKEV2_PROTOCOL_IKE)
+ {
+ ikev2_delete_t * d, * tmp, * resp = 0;
+ vec_foreach(d, sa0->del)
+ {
+ ikev2_child_sa_t * ch_sa;
+ ch_sa = ikev2_sa_get_child(sa0, d->spi,
+ d->protocol_id);
+ if (ch_sa)
+ {
+ ikev2_delete_tunnel_interface(km->vnet_main,
+ sa0, ch_sa);
+ vec_add2(resp, tmp, 1);
+ tmp->protocol_id = d->protocol_id;
+ tmp->spi = ch_sa->r_proposals[0].spi;
+ ikev2_sa_del_child_sa(sa0, ch_sa);
+ }
+ }
+ vec_free(sa0->del);
+ sa0->del = resp;
+ }
+ }
+ len = ikev2_generate_resp(sa0, ike0);
+ }
+ }
+ else if (ike0->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA)
+ {
+ uword * p;
+ p = hash_get(km->sa_by_rspi, clib_net_to_host_u64(ike0->rspi));
+ if (p)
+ {
+ sa0 = pool_elt_at_index (km->sas, p[0]);
+
+ r = ikev2_retransmit_resp(sa0, ike0);
+ if (r == 1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_REQ_RETRANSMIT,
+ 1);
+ len = clib_net_to_host_u32(ike0->length);
+ goto dispatch0;
+ }
+ else if (r == -1)
+ {
+ vlib_node_increment_counter(vm, ikev2_node.index,
+ IKEV2_ERROR_IKE_REQ_IGNORE,
+ 1);
+ goto dispatch0;
+ }
+
+ ikev2_process_create_child_sa_req(vm, sa0, ike0);
+ if (sa0->rekey)
+ {
+ if (sa0->rekey[0].protocol_id != IKEV2_PROTOCOL_IKE)
+ {
+ ikev2_child_sa_t * child;
+ vec_add2(sa0->childs, child, 1);
+ child->r_proposals = sa0->rekey[0].r_proposal;
+ child->i_proposals = sa0->rekey[0].i_proposal;
+ child->tsi = sa0->rekey[0].tsi;
+ child->tsr = sa0->rekey[0].tsr;
+ ikev2_create_tunnel_interface(km->vnet_main, sa0,
+ child);
+ }
+ len = ikev2_generate_resp(sa0, ike0);
+ }
+ }
+ }
+ else
+ {
+ clib_warning("IKEv2 exchange %u packet received from %U to %U",
+ ike0->exchange,
+ format_ip4_address, ip40->src_address.as_u8,
+ format_ip4_address, ip40->dst_address.as_u8);
+ hexdump((u8 *) ip40, b0->current_length);
+ }
+
+dispatch0:
+ /* if we are sending packet back, rewrite headers */
+ if (len)
+ {
+ next0 = IKEV2_NEXT_IP4_LOOKUP;
+ ip40->dst_address.as_u32 = sa0->iaddr.as_u32;
+ ip40->src_address.as_u32 = sa0->raddr.as_u32;
+ udp0->length = clib_host_to_net_u16(len + sizeof(udp_header_t));
+ udp0->checksum = 0;
+ b0->current_length = len + sizeof(ip4_header_t) + sizeof(udp_header_t);
+ ip40->length = clib_host_to_net_u16(b0->current_length);
+ ip40->checksum = ip4_header_checksum (ip40);
+#if 0
+ clib_warning("sending response:");
+ hexdump(vlib_buffer_get_current (b0), b0->current_length);
+#endif
+ }
+ /* delete sa */
+ if (sa0 && (sa0->state == IKEV2_STATE_DELETED ||
+ sa0->state == IKEV2_STATE_NOTIFY_AND_DELETE))
+ {
+ ikev2_child_sa_t * c;
+
+ vec_foreach(c, sa0->childs)
+ ikev2_delete_tunnel_interface(km->vnet_main, sa0, c);
+
+ ikev2_delete_sa(sa0);
+ }
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ ikev2_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ikev2_node.index,
+ IKEV2_ERROR_PROCESSED, frame->n_vectors);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ikev2_node) = {
+ .function = ikev2_node_fn,
+ .name = "ikev2",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ikev2_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ikev2_error_strings),
+ .error_strings = ikev2_error_strings,
+
+ .n_next_nodes = IKEV2_N_NEXT,
+
+ .next_nodes = {
+ [IKEV2_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IKEV2_NEXT_ERROR_DROP] = "error-drop",
+ },
+};
+
+
+static ikev2_profile_t *
+ikev2_profile_index_by_name(u8 * name)
+{
+ ikev2_main_t * km = &ikev2_main;
+ uword * p;
+
+ p = mhash_get (&km->profile_index_by_name, name);
+ if (!p)
+ return 0;
+
+ return pool_elt_at_index(km->profiles, p[0]);
+}
+
+clib_error_t *
+ikev2_set_local_key(vlib_main_t * vm, u8 * file)
+{
+ ikev2_main_t * km = &ikev2_main;
+
+ km->pkey = ikev2_load_key_file(file);
+ if (km->pkey == NULL)
+ return clib_error_return(0, "load key '%s' failed", file);
+
+ return 0;
+}
+
+clib_error_t *
+ikev2_add_del_profile(vlib_main_t * vm, u8 * name, int is_add)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_profile_t * p;
+
+ if (is_add)
+ {
+ if (ikev2_profile_index_by_name(name))
+ return clib_error_return(0, "policy %v already exists", name);
+
+ pool_get (km->profiles, p);
+ memset(p, 0, sizeof(*p));
+ p->name = vec_dup(name);
+ uword index = p - km->profiles;
+ mhash_set_mem (&km->profile_index_by_name, name, &index, 0);
+ }
+ else
+ {
+ p = ikev2_profile_index_by_name(name);
+ if (!p)
+ return clib_error_return(0, "policy %v does not exist", name);
+
+ vec_free (p->name);
+ pool_put (km->profiles, p);
+ mhash_unset (&km->profile_index_by_name, name, 0);
+ }
+ return 0;
+}
+
+clib_error_t *
+ikev2_set_profile_auth(vlib_main_t * vm, u8 * name, u8 auth_method,
+ u8 * auth_data, u8 data_hex_format)
+{
+ ikev2_profile_t * p;
+ clib_error_t * r;
+
+ p = ikev2_profile_index_by_name(name);
+
+ if (!p) {
+ r = clib_error_return(0, "unknown profile %v", name);
+ return r;
+ }
+ vec_free(p->auth.data);
+ p->auth.method = auth_method;
+ p->auth.data = vec_dup(auth_data);
+ p->auth.hex = data_hex_format;
+
+ if (auth_method == IKEV2_AUTH_METHOD_RSA_SIG)
+ {
+ if (p->auth.key)
+ EVP_PKEY_free(p->auth.key);
+ p->auth.key = ikev2_load_cert_file(auth_data);
+ if (p->auth.key == NULL)
+ return clib_error_return(0, "load cert '%s' failed", auth_data);
+ }
+
+ return 0;
+}
+
+clib_error_t *
+ikev2_set_profile_id(vlib_main_t * vm, u8 * name, u8 id_type, u8 * data,
+ int is_local)
+{
+ ikev2_profile_t * p;
+ clib_error_t * r;
+
+ if (id_type > IKEV2_ID_TYPE_ID_RFC822_ADDR && id_type < IKEV2_ID_TYPE_ID_KEY_ID)
+ {
+ r = clib_error_return(0, "unsupported identity type %U",
+ format_ikev2_id_type, id_type);
+ return r;
+ }
+
+ p = ikev2_profile_index_by_name(name);
+
+ if (!p) {
+ r = clib_error_return(0, "unknown profile %v", name);
+ return r;
+ }
+
+ if (is_local)
+ {
+ vec_free(p->loc_id.data);
+ p->loc_id.type = id_type;
+ p->loc_id.data = vec_dup(data);
+ }
+ else
+ {
+ vec_free(p->rem_id.data);
+ p->rem_id.type = id_type;
+ p->rem_id.data = vec_dup(data);
+ }
+
+ return 0;
+}
+
+clib_error_t *
+ikev2_set_profile_ts(vlib_main_t * vm, u8 * name, u8 protocol_id,
+ u16 start_port, u16 end_port, ip4_address_t start_addr,
+ ip4_address_t end_addr, int is_local)
+{
+ ikev2_profile_t * p;
+ clib_error_t * r;
+
+ p = ikev2_profile_index_by_name(name);
+
+ if (!p) {
+ r = clib_error_return(0, "unknown profile %v", name);
+ return r;
+ }
+
+ if (is_local)
+ {
+ p->loc_ts.start_addr.as_u32 = start_addr.as_u32;
+ p->loc_ts.end_addr.as_u32 = end_addr.as_u32;
+ p->loc_ts.start_port = start_port;
+ p->loc_ts.end_port = end_port;
+ p->loc_ts.protocol_id = protocol_id;
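+ /* traffic selector type 7 = TS_IPV4_ADDR_RANGE (RFC 7296) */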
+ p->loc_ts.ts_type = 7;
+ }
+ else
+ {
+ p->rem_ts.start_addr.as_u32 = start_addr.as_u32;
+ p->rem_ts.end_addr.as_u32 = end_addr.as_u32;
+ p->rem_ts.start_port = start_port;
+ p->rem_ts.end_port = end_port;
+ p->rem_ts.protocol_id = protocol_id;
+ p->rem_ts.ts_type = 7;
+ }
+
+ return 0;
+}
+
+
+clib_error_t *
+ikev2_init (vlib_main_t * vm)
+{
+ ikev2_main_t * km = &ikev2_main;
+ clib_error_t * error;
+
+ memset (km, 0, sizeof (ikev2_main_t));
+ km->vnet_main = vnet_get_main();
+ km->vlib_main = vm;
+
+ ikev2_crypto_init(km);
+
+ km->sa_by_rspi = hash_create (0, sizeof (uword));
+ mhash_init_vec_string (&km->profile_index_by_name, sizeof (uword));
+
+ if ((error = vlib_call_init_function (vm, ikev2_cli_init)))
+ return error;
+
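+ /* receive IKE messages on UDP port 500 */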
+ udp_register_dst_port (vm, 500, ikev2_node.index, 1);
+
+ return 0;
+}
+
+
diff --git a/vnet/vnet/ipsec/ikev2.h b/vnet/vnet/ipsec/ikev2.h
new file mode 100644
index 00000000000..fd0d75a62f3
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2.h
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ikev2_h__
+#define __included_ikev2_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+
+#include <vppinfra/error.h>
+
+#define IKEV2_NONCE_SIZE 32
+
+#define IKEV2_KEY_PAD "Key Pad for IKEv2"
+
+typedef u8 v8;
+
+vlib_node_registration_t ikev2_node;
+
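+/* IKE message header (RFC 7296, section 3.1); multi-byte fields are
+   in network byte order on the wire. */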
+typedef CLIB_PACKED (struct {
+ u64 ispi;
+ u64 rspi;
+ u8 nextpayload;
+ u8 version;
+ u8 exchange;
+ u8 flags;
+ u32 msgid;
+ u32 length;
+ u8 payload[0];
+}) ike_header_t;
+
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u16 dh_group;
+ u8 reserved[2];
+ u8 payload[0];
+}) ike_ke_payload_header_t;
+
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u8 payload[0];
+}) ike_payload_header_t;
+
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u8 auth_method;
+ u8 reserved[3];
+ u8 payload[0];
+}) ike_auth_payload_header_t;
+
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u8 id_type;
+ u8 reserved[3];
+ u8 payload[0];
+}) ike_id_payload_header_t;
+
+#define IKE_VERSION_2 0x20
+
+#define IKEV2_EXCHANGE_SA_INIT 34
+#define IKEV2_EXCHANGE_IKE_AUTH 35
+#define IKEV2_EXCHANGE_CREATE_CHILD_SA 36
+#define IKEV2_EXCHANGE_INFORMATIONAL 37
+
+#define IKEV2_HDR_FLAG_INITIATOR (1<<3)
+#define IKEV2_HDR_FLAG_VERSION (1<<4)
+#define IKEV2_HDR_FLAG_RESPONSE (1<<5)
+
+#define IKEV2_PAYLOAD_FLAG_CRITICAL (1<<7)
+
+#define IKEV2_PAYLOAD_NONE 0
+#define IKEV2_PAYLOAD_SA 33
+#define IKEV2_PAYLOAD_KE 34
+#define IKEV2_PAYLOAD_IDI 35
+#define IKEV2_PAYLOAD_IDR 36
+#define IKEV2_PAYLOAD_AUTH 39
+#define IKEV2_PAYLOAD_NONCE 40
+#define IKEV2_PAYLOAD_NOTIFY 41
+#define IKEV2_PAYLOAD_DELETE 42
+#define IKEV2_PAYLOAD_VENDOR 43
+#define IKEV2_PAYLOAD_TSI 44
+#define IKEV2_PAYLOAD_TSR 45
+#define IKEV2_PAYLOAD_SK 46
+
+typedef enum {
+ IKEV2_PROTOCOL_IKE = 1,
+ IKEV2_PROTOCOL_AH = 2,
+ IKEV2_PROTOCOL_ESP = 3,
+} ikev2_protocol_id_t;
+
+#define foreach_ikev2_notify_msg_type \
+ _( 0, NONE) \
+ _( 1, UNSUPPORTED_CRITICAL_PAYLOAD) \
+ _( 4, INVALID_IKE_SPI) \
+ _( 5, INVALID_MAJOR_VERSION) \
+ _( 7, INVALID_SYNTAX) \
+ _( 8, INVALID_MESSAGE_ID) \
+ _( 11, INVALID_SPI) \
+ _( 14, NO_PROPOSAL_CHOSEN) \
+ _( 17, INVALID_KE_PAYLOAD) \
+ _( 24, AUTHENTICATION_FAILED) \
+ _( 34, SINGLE_PAIR_REQUIRED) \
+ _( 35, NO_ADDITIONAL_SAS) \
+ _( 36, INTERNAL_ADDRESS_FAILURE) \
+ _( 37, FAILED_CP_REQUIRED) \
+ _( 38, TS_UNACCEPTABLE) \
+ _( 39, INVALID_SELECTORS) \
+ _( 40, UNACCEPTABLE_ADDRESSES) \
+ _( 41, UNEXPECTED_NAT_DETECTED) \
+ _( 42, USE_ASSIGNED_HoA) \
+ _( 43, TEMPORARY_FAILURE) \
+ _( 44, CHILD_SA_NOT_FOUND) \
+ _( 45, INVALID_GROUP_ID) \
+ _( 46, AUTHORIZATION_FAILED) \
+ _(16384, INITIAL_CONTACT) \
+ _(16385, SET_WINDOW_SIZE) \
+ _(16386, ADDITIONAL_TS_POSSIBLE) \
+ _(16387, IPCOMP_SUPPORTED) \
+ _(16388, NAT_DETECTION_SOURCE_IP) \
+ _(16389, NAT_DETECTION_DESTINATION_IP) \
+ _(16390, COOKIE) \
+ _(16391, USE_TRANSPORT_MODE) \
+ _(16392, HTTP_CERT_LOOKUP_SUPPORTED) \
+ _(16393, REKEY_SA) \
+ _(16394, ESP_TFC_PADDING_NOT_SUPPORTED) \
+ _(16395, NON_FIRST_FRAGMENTS_ALSO) \
+ _(16396, MOBIKE_SUPPORTED) \
+ _(16397, ADDITIONAL_IP4_ADDRESS) \
+ _(16398, ADDITIONAL_IP6_ADDRESS) \
+ _(16399, NO_ADDITIONAL_ADDRESSES) \
+ _(16400, UPDATE_SA_ADDRESSES) \
+ _(16401, COOKIE2) \
+ _(16402, NO_NATS_ALLOWED) \
+ _(16403, AUTH_LIFETIME) \
+ _(16404, MULTIPLE_AUTH_SUPPORTED) \
+ _(16405, ANOTHER_AUTH_FOLLOWS) \
+ _(16406, REDIRECT_SUPPORTED) \
+ _(16407, REDIRECT) \
+ _(16408, REDIRECTED_FROM) \
+ _(16409, TICKET_LT_OPAQUE) \
+ _(16410, TICKET_REQUEST) \
+ _(16411, TICKET_ACK) \
+ _(16412, TICKET_NACK) \
+ _(16413, TICKET_OPAQUE) \
+ _(16414, LINK_ID) \
+ _(16415, USE_WESP_MODE) \
+ _(16416, ROHC_SUPPORTED) \
+ _(16417, EAP_ONLY_AUTHENTICATION) \
+ _(16418, CHILDLESS_IKEV2_SUPPORTED) \
+ _(16419, QUICK_CRASH_DETECTION) \
+ _(16420, IKEV2_MESSAGE_ID_SYNC_SUPPORTED) \
+ _(16421, IPSEC_REPLAY_COUNTER_SYNC_SUPPORTED) \
+ _(16422, IKEV2_MESSAGE_ID_SYNC) \
+ _(16423, IPSEC_REPLAY_COUNTER_SYNC) \
+ _(16424, SECURE_PASSWORD_METHODS) \
+ _(16425, PSK_PERSIST) \
+ _(16426, PSK_CONFIRM) \
+ _(16427, ERX_SUPPORTED) \
+ _(16428, IFOM_CAPABILITY) \
+ _(16429, SENDER_REQUEST_ID) \
+ _(16430, IKEV2_FRAGMENTATION_SUPPORTED) \
+ _(16431, SIGNATURE_HASH_ALGORITHMS)
+
+
+typedef enum {
+#define _(v,f) IKEV2_NOTIFY_MSG_##f = v,
+ foreach_ikev2_notify_msg_type
+#undef _
+} ikev2_notify_msg_type_t;
+
+#define foreach_ikev2_transform_type \
+ _(0, UNDEFINED, "undefined") \
+ _(1, ENCR, "encr") \
+ _(2, PRF, "prf") \
+ _(3, INTEG, "integ") \
+ _(4, DH, "dh-group") \
+ _(5, ESN, "esn")
+
+typedef enum {
+#define _(v,f,s) IKEV2_TRANSFORM_TYPE_##f = v,
+ foreach_ikev2_transform_type
+#undef _
+ IKEV2_TRANSFORM_NUM_TYPES
+} ikev2_transform_type_t;
+
+
+#define foreach_ikev2_transform_encr_type \
+ _(1 , DES_IV64, "des-iv64") \
+ _(2 , DES, "des") \
+ _(3 , 3DES, "3des") \
+ _(4 , RC5, "rc5") \
+ _(5 , IDEA, "idea") \
+ _(6 , CAST, "cast") \
+ _(7 , BLOWFISH, "blowfish") \
+ _(8 , 3IDEA, "3idea") \
+ _(9 , DES_IV32, "des-iv32") \
+ _(11, NULL, "null") \
+ _(12, AES_CBC, "aes-cbc") \
+ _(13, AES_CTR, "aes-ctr")
+
+typedef enum {
+#define _(v,f,str) IKEV2_TRANSFORM_ENCR_TYPE_##f = v,
+ foreach_ikev2_transform_encr_type
+#undef _
+} ikev2_transform_encr_type_t;
+
+#define foreach_ikev2_transform_prf_type \
+ _(1, PRF_HMAC_MD5, "hmac-md5") \
+ _(2, PRF_HMAC_SHA1, "hmac-sha1") \
+ _(3, PRF_MAC_TIGER, "mac-tiger") \
+ _(4, PRF_AES128_XCBC, "aes128-xcbc") \
+ _(5, PRF_HMAC_SHA2_256, "hmac-sha2-256") \
+ _(6, PRF_HMAC_SHA2_384, "hmac-sha2-384") \
+ _(7, PRF_HMAC_SHA2_512, "hmac-sha2-512") \
+ _(8, PRF_AES128_CMAC, "aes128-cmac")
+
+typedef enum {
+#define _(v,f,str) IKEV2_TRANSFORM_PRF_TYPE_##f = v,
+ foreach_ikev2_transform_prf_type
+#undef _
+} ikev2_transform_prf_type_t;
+
+#define foreach_ikev2_transform_integ_type \
+ _(0, NONE, "none") \
+ _(1, AUTH_HMAC_MD5_96, "md5-96") \
+ _(2, AUTH_HMAC_SHA1_96, "sha1-96") \
+ _(3, AUTH_DES_MAC, "des-mac") \
+ _(4, AUTH_KPDK_MD5, "kpdk-md5") \
+ _(5, AUTH_AES_XCBC_96, "aes-xcbc-96") \
+ _(6, AUTH_HMAC_MD5_128, "md5-128") \
+ _(7, AUTH_HMAC_SHA1_160, "sha1-160") \
+ _(8, AUTH_AES_CMAC_96, "cmac-96") \
+ _(9, AUTH_AES_128_GMAC, "aes-128-gmac") \
+ _(10, AUTH_AES_192_GMAC, "aes-192-gmac") \
+ _(11, AUTH_AES_256_GMAC, "aes-256-gmac") \
+ _(12, AUTH_HMAC_SHA2_256_128, "hmac-sha2-256-128") \
+ _(13, AUTH_HMAC_SHA2_384_192, "hmac-sha2-384-192") \
+ _(14, AUTH_HMAC_SHA2_512_256, "hmac-sha2-512-256")
+
+typedef enum {
+#define _(v,f, str) IKEV2_TRANSFORM_INTEG_TYPE_##f = v,
+ foreach_ikev2_transform_integ_type
+#undef _
+} ikev2_transform_integ_type_t;
+
+#if defined(OPENSSL_NO_CISCO_FECDH)
+#define foreach_ikev2_transform_dh_type \
+ _(0, NONE, "none") \
+ _(1, MODP_768, "modp-768") \
+ _(2, MODP_1024, "modp-1024") \
+ _(5, MODP_1536, "modp-1536") \
+ _(14, MODP_2048, "modp-2048") \
+ _(15, MODP_3072, "modp-3072") \
+ _(16, MODP_4096, "modp-4096") \
+ _(17, MODP_6144, "modp-6144") \
+ _(18, MODP_8192, "modp-8192") \
+ _(19, ECP_256, "ecp-256") \
+ _(20, ECP_384, "ecp-384") \
+ _(21, ECP_521, "ecp-521") \
+ _(22, MODP_1024_160, "modp-1024-160") \
+ _(23, MODP_2048_224, "modp-2048-224") \
+ _(24, MODP_2048_256, "modp-2048-256") \
+ _(25, ECP_192, "ecp-192") \
+ _(26, ECP_224, "ecp-224") \
+ _(27, BRAINPOOL_224, "brainpool-224") \
+ _(28, BRAINPOOL_256, "brainpool-256") \
+ _(29, BRAINPOOL_384, "brainpool-384") \
+ _(30, BRAINPOOL_512, "brainpool-512")
+#else
+#define foreach_ikev2_transform_dh_type \
+ _(0, NONE, "none") \
+ _(1, MODP_768, "modp-768") \
+ _(2, MODP_1024, "modp-1024") \
+ _(5, MODP_1536, "modp-1536") \
+ _(14, MODP_2048, "modp-2048") \
+ _(15, MODP_3072, "modp-3072") \
+ _(16, MODP_4096, "modp-4096") \
+ _(17, MODP_6144, "modp-6144") \
+ _(18, MODP_8192, "modp-8192") \
+ _(19, ECP_256, "ecp-256") \
+ _(20, ECP_384, "ecp-384") \
+ _(21, ECP_521, "ecp-521") \
+ _(22, MODP_1024_160, "modp-1024-160") \
+ _(23, MODP_2048_224, "modp-2048-224") \
+ _(24, MODP_2048_256, "modp-2048-256") \
+ _(25, ECP_192, "ecp-192")
+#endif
+
+typedef enum {
+#define _(v,f, str) IKEV2_TRANSFORM_DH_TYPE_##f = v,
+ foreach_ikev2_transform_dh_type
+#undef _
+} ikev2_transform_dh_type_t;
+
+#define foreach_ikev2_transform_esn_type \
+ _(0, NO_ESN, "no") \
+ _(1, ESN, "yes")
+
+typedef enum {
+#define _(v,f,str) IKEV2_TRANSFORM_ESN_TYPE_##f = v,
+ foreach_ikev2_transform_esn_type
+#undef _
+} ikev2_transform_esn_type_t;
+
+#define foreach_ikev2_auth_method \
+ _( 1, RSA_SIG, "rsa-sig") \
+ _( 2, SHARED_KEY_MIC, "shared-key-mic")
+
+typedef enum {
+#define _(v,f,s) IKEV2_AUTH_METHOD_##f = v,
+ foreach_ikev2_auth_method
+#undef _
+} ikev2_auth_method_t;
+
+#define foreach_ikev2_id_type \
+ _( 1, ID_IPV4_ADDR, "ip4-addr") \
+ _( 2, ID_FQDN, "fqdn") \
+ _( 3, ID_RFC822_ADDR, "rfc822") \
+ _( 5, ID_IPV6_ADDR, "ip6-addr") \
+ _( 9, ID_DER_ASN1_DN, "der-asn1-dn") \
+ _(10, ID_DER_ASN1_GN, "der-asn1-gn") \
+ _(11, ID_KEY_ID, "key-id")
+
+typedef enum {
+#define _(v,f,s) IKEV2_ID_TYPE_##f = v,
+ foreach_ikev2_id_type
+#undef _
+} ikev2_id_type_t;
+
+clib_error_t * ikev2_init (vlib_main_t * vm);
+clib_error_t * ikev2_set_local_key(vlib_main_t * vm, u8 * file);
+clib_error_t * ikev2_add_del_profile(vlib_main_t * vm, u8 * name, int is_add);
+clib_error_t * ikev2_set_profile_auth(vlib_main_t * vm, u8 * name,
+ u8 auth_method, u8 * data,
+ u8 data_hex_format);
+clib_error_t * ikev2_set_profile_id(vlib_main_t * vm, u8 * name,
+ u8 id_type, u8 * data, int is_local);
+clib_error_t * ikev2_set_profile_ts(vlib_main_t * vm, u8 * name, u8 protocol_id,
+ u16 start_port, u16 end_port,
+ ip4_address_t start_addr,
+ ip4_address_t end_addr, int is_local);
+/* ikev2_format.c */
+u8 * format_ikev2_auth_method (u8 * s, va_list * args);
+u8 * format_ikev2_id_type (u8 * s, va_list * args);
+u8 * format_ikev2_transform_type (u8 * s, va_list * args);
+u8 * format_ikev2_notify_msg_type (u8 * s, va_list * args);
+u8 * format_ikev2_transform_encr_type(u8 * s, va_list * args);
+u8 * format_ikev2_transform_prf_type(u8 * s, va_list * args);
+u8 * format_ikev2_transform_integ_type(u8 * s, va_list * args);
+u8 * format_ikev2_transform_dh_type(u8 * s, va_list * args);
+u8 * format_ikev2_transform_esn_type(u8 * s, va_list * args);
+u8 * format_ikev2_sa_transform(u8 * s, va_list * args);
+
+uword unformat_ikev2_auth_method (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_id_type (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_transform_type (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_transform_encr_type (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_transform_prf_type (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_transform_integ_type (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_transform_dh_type (unformat_input_t * input, va_list * args);
+uword unformat_ikev2_transform_esn_type (unformat_input_t * input, va_list * args);
+
+#endif /* __included_ikev2_h__ */
+
diff --git a/vnet/vnet/ipsec/ikev2_cli.c b/vnet/vnet/ipsec/ikev2_cli.c
new file mode 100644
index 00000000000..1e6009f0806
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2_cli.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/udp.h>
+#include <vnet/ipsec/ikev2.h>
+#include <vnet/ipsec/ikev2_priv.h>
+
+u8 *
+format_ikev2_id_type_and_data (u8 * s, va_list * args)
+{
+ ikev2_id_t * id = va_arg (*args, ikev2_id_t *);
+
+ if (id->type == 0 || vec_len(id->data) == 0)
+ return format(s, "none");
+
+ s = format(s, "%U", format_ikev2_id_type, id->type);
+
+ if (id->type == IKEV2_ID_TYPE_ID_FQDN ||
+ id->type == IKEV2_ID_TYPE_ID_RFC822_ADDR)
+ {
+ s = format(s, " %v", id->data);
+ }
+ else
+ {
+ s = format(s, " %U", format_hex_bytes, &id->data, (uword) (vec_len(id->data)));
+ }
+
+ return s;
+}
+
+
+static clib_error_t *
+show_ikev2_sa_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_sa_t * sa;
+ ikev2_ts_t * ts;
+ ikev2_child_sa_t * child;
+ ikev2_sa_transform_t * tr;
+
+ pool_foreach (sa, km->sas, ({
+ u8 * s = 0;
+ vlib_cli_output(vm, " iip %U ispi %lx rip %U rspi %lx",
+ format_ip4_address, &sa->iaddr, sa->ispi,
+ format_ip4_address, &sa->raddr, sa->rspi);
+
+ tr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ tr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_PRF);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ tr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ tr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_DH);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ vlib_cli_output(vm, " %v", s);
+ vec_free(s);
+
+ vlib_cli_output(vm, " nonce i:%U\n r:%U",
+ format_hex_bytes, sa->i_nonce, vec_len(sa->i_nonce),
+ format_hex_bytes, sa->r_nonce, vec_len(sa->r_nonce));
+
+ vlib_cli_output(vm, " SK_d %U",
+ format_hex_bytes, sa->sk_d, vec_len(sa->sk_d));
+ vlib_cli_output(vm, " SK_a i:%U\n r:%U",
+ format_hex_bytes, sa->sk_ai, vec_len(sa->sk_ai),
+ format_hex_bytes, sa->sk_ar, vec_len(sa->sk_ar));
+ vlib_cli_output(vm, " SK_e i:%U\n r:%U",
+ format_hex_bytes, sa->sk_ei, vec_len(sa->sk_ei),
+ format_hex_bytes, sa->sk_er, vec_len(sa->sk_er));
+ vlib_cli_output(vm, " SK_p i:%U\n r:%U",
+ format_hex_bytes, sa->sk_pi, vec_len(sa->sk_pi),
+ format_hex_bytes, sa->sk_pr, vec_len(sa->sk_pr));
+
+ vlib_cli_output(vm, " identifier (i) %U",
+ format_ikev2_id_type_and_data, &sa->i_id);
+ vlib_cli_output(vm, " identifier (r) %U",
+ format_ikev2_id_type_and_data, &sa->r_id);
+
+ vec_foreach(child, sa->childs)
+ {
+ vlib_cli_output(vm, " child sa %u:", child - sa->childs);
+
+ tr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ tr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_INTEG);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ tr = ikev2_sa_get_td_for_type(child->r_proposals, IKEV2_TRANSFORM_TYPE_ESN);
+ s = format(s, "%U ", format_ikev2_sa_transform, tr);
+
+ vlib_cli_output(vm, " %v", s);
+ vec_free(s);
+
+ vlib_cli_output(vm, " spi(i) %lx spi(r) %lx",
+ child->i_proposals ? child->i_proposals[0].spi : 0,
+ child->r_proposals ? child->r_proposals[0].spi : 0);
+
+ vlib_cli_output(vm, " SK_e i:%U\n r:%U",
+ format_hex_bytes, child->sk_ei, vec_len(child->sk_ei),
+ format_hex_bytes, child->sk_er, vec_len(child->sk_er));
+ vlib_cli_output(vm, " SK_a i:%U\n r:%U",
+ format_hex_bytes, child->sk_ai, vec_len(child->sk_ai),
+ format_hex_bytes, child->sk_ar, vec_len(child->sk_ar));
+ vlib_cli_output(vm, " traffic selectors (i):");
+ vec_foreach(ts, child->tsi)
+ {
+ vlib_cli_output(vm, " %u type %u protocol_id %u addr "
+ "%U - %U port %u - %u",
+ ts - child->tsi,
+ ts->ts_type, ts->protocol_id,
+ format_ip4_address, &ts->start_addr,
+ format_ip4_address, &ts->end_addr,
+ clib_net_to_host_u16( ts->start_port),
+ clib_net_to_host_u16( ts->end_port));
+ }
+ vlib_cli_output(vm, " traffic selectors (r):");
+ vec_foreach(ts, child->tsr)
+ {
+ vlib_cli_output(vm, " %u type %u protocol_id %u addr "
+ "%U - %U port %u - %u",
+ ts - child->tsr,
+ ts->ts_type, ts->protocol_id,
+ format_ip4_address, &ts->start_addr,
+ format_ip4_address, &ts->end_addr,
+ clib_net_to_host_u16( ts->start_port),
+ clib_net_to_host_u16( ts->end_port));
+ }
+ }
+ vlib_cli_output(vm, "");
+ }));
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_ikev2_sa_command, static) = {
+ .path = "show ikev2 sa",
+ .short_help = "show ikev2 sa",
+ .function = show_ikev2_sa_command_fn,
+};
+
+static clib_error_t *
+ikev2_profile_add_del_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u8 * name = 0;
+ clib_error_t * r = 0;
+ u32 id_type;
+ u8 * data = 0;
+ u32 tmp1, tmp2, tmp3;
+ ip4_address_t ip4;
+ ip4_address_t end_addr;
+
+ const char * valid_chars = "a-zA-Z0-9_";
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "add %U", unformat_token, valid_chars, &name))
+ {
+ r = ikev2_add_del_profile(vm, name, 1);
+ goto done;
+ }
+ else if (unformat (line_input, "del %U", unformat_token, valid_chars, &name))
+ {
+ r = ikev2_add_del_profile(vm, name, 0);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U auth shared-key-mic string %v",
+ unformat_token, valid_chars, &name, &data))
+ {
+ r = ikev2_set_profile_auth(vm, name, IKEV2_AUTH_METHOD_SHARED_KEY_MIC,
+ data, 0);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U auth shared-key-mic hex %U",
+ unformat_token, valid_chars, &name,
+ unformat_hex_string, &data))
+ {
+ r = ikev2_set_profile_auth(vm, name, IKEV2_AUTH_METHOD_SHARED_KEY_MIC,
+ data, 1);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U auth rsa-sig cert-file %v",
+ unformat_token, valid_chars, &name,
+ &data))
+ {
+ r = ikev2_set_profile_auth(vm, name, IKEV2_AUTH_METHOD_RSA_SIG, data, 0);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U id local %U %U",
+ unformat_token, valid_chars, &name,
+ unformat_ikev2_id_type, &id_type,
+ unformat_ip4_address, &ip4))
+ {
+ data = vec_new(u8, 4);
+ memcpy(data, ip4.as_u8, 4);
+ r = ikev2_set_profile_id(vm, name, (u8) id_type, data, /*local*/ 1);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U id local %U 0x%U",
+ unformat_token, valid_chars, &name,
+ unformat_ikev2_id_type, &id_type,
+ unformat_hex_string, &data))
+ {
+ r = ikev2_set_profile_id(vm, name, (u8) id_type, data, /*local*/ 1);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U id local %U %v",
+ unformat_token, valid_chars, &name,
+ unformat_ikev2_id_type, &id_type, &data))
+ {
+ r = ikev2_set_profile_id(vm, name, (u8) id_type, data, /*local*/ 1);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U id remote %U %U",
+ unformat_token, valid_chars, &name,
+ unformat_ikev2_id_type, &id_type,
+ unformat_ip4_address, &ip4))
+ {
+ data = vec_new(u8, 4);
+ memcpy(data, ip4.as_u8, 4);
+ r = ikev2_set_profile_id(vm, name, (u8) id_type, data, /*remote*/ 0);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U id remote %U 0x%U",
+ unformat_token, valid_chars, &name,
+ unformat_ikev2_id_type, &id_type,
+ unformat_hex_string, &data))
+ {
+ r = ikev2_set_profile_id(vm, name, (u8) id_type, data, /*remote*/ 0);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U id remote %U %v",
+ unformat_token, valid_chars, &name,
+ unformat_ikev2_id_type, &id_type, &data))
+ {
+ r = ikev2_set_profile_id(vm, name, (u8) id_type, data, /*remote*/ 0);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U traffic-selector local "
+ "ip-range %U - %U port-range %u - %u protocol %u",
+ unformat_token, valid_chars, &name,
+ unformat_ip4_address, &ip4,
+ unformat_ip4_address, &end_addr,
+ &tmp1, &tmp2, &tmp3))
+ {
+ r = ikev2_set_profile_ts(vm, name, (u8)tmp3, (u16)tmp1, (u16)tmp2,
+ ip4, end_addr, /*local*/ 1);
+ goto done;
+ }
+ else if (unformat (line_input, "set %U traffic-selector remote "
+ "ip-range %U - %U port-range %u - %u protocol %u",
+ unformat_token, valid_chars, &name,
+ unformat_ip4_address, &ip4,
+ unformat_ip4_address, &end_addr,
+ &tmp1, &tmp2, &tmp3))
+ {
+ r = ikev2_set_profile_ts(vm, name, (u8)tmp3, (u16)tmp1, (u16)tmp2,
+ ip4, end_addr, /*remote*/ 0);
+ goto done;
+ }
+ else
+ break;
+ }
+
+ r = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+
+done:
+ vec_free(name);
+ vec_free(data);
+ unformat_free (line_input);
+ return r;
+}
+
+VLIB_CLI_COMMAND (ikev2_profile_add_del_command, static) = {
+ .path = "ikev2 profile",
+ .short_help =
+ "ikev2 profile [add|del] <id>\n"
+ "ikev2 profile set <id> auth [rsa-sig|shared-key-mic] [cert-file|string|hex]"
+ " <data>\n"
+ "ikev2 profile set <id> id <local|remote> <type> <data>\n"
+ "ikev2 profile set <id> traffic-selector <local|remote> ip-range "
+ "<start-addr> - <end-addr> port-range <start-port> - <end-port> "
+ "protocol <protocol-number>",
+ .function = ikev2_profile_add_del_command_fn,
+};
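+/* Example usage (hypothetical profile name and values):
+ *   ikev2 profile add pr1
+ *   ikev2 profile set pr1 auth shared-key-mic string Vpp123
+ *   ikev2 profile set pr1 id local fqdn vpp.example
+ *   ikev2 profile set pr1 traffic-selector local ip-range 192.168.0.0 - 192.168.255.255 port-range 0 - 65535 protocol 0
+ */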
+
+static clib_error_t *
+show_ikev2_profile_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ikev2_main_t * km = &ikev2_main;
+ ikev2_profile_t * p;
+
+ pool_foreach (p, km->profiles, ({
+ vlib_cli_output(vm, "profile %v", p->name);
+
+ if (p->auth.data)
+ {
+ if (p->auth.hex)
+ vlib_cli_output(vm, " auth-method %U auth data 0x%U",
+ format_ikev2_auth_method, p->auth.method,
+ format_hex_bytes, p->auth.data, vec_len(p->auth.data));
+ else
+ vlib_cli_output(vm, " auth-method %U auth data %v",
+ format_ikev2_auth_method, p->auth.method, p->auth.data);
+ }
+
+ if (p->loc_id.data)
+ {
+ if (p->loc_id.type == IKEV2_ID_TYPE_ID_IPV4_ADDR)
+ vlib_cli_output(vm, " local id-type %U data %U",
+ format_ikev2_id_type, p->loc_id.type,
+ format_ip4_address, p->loc_id.data);
+ else if (p->loc_id.type == IKEV2_ID_TYPE_ID_KEY_ID)
+ vlib_cli_output(vm, " local id-type %U data 0x%U",
+ format_ikev2_id_type, p->loc_id.type,
+ format_hex_bytes, p->loc_id.data,
+ vec_len(p->loc_id.data));
+ else
+ vlib_cli_output(vm, " local id-type %U data %v",
+ format_ikev2_id_type, p->loc_id.type, p->loc_id.data);
+ }
+
+ if (p->rem_id.data)
+ {
+ if (p->rem_id.type == IKEV2_ID_TYPE_ID_IPV4_ADDR)
+ vlib_cli_output(vm, " remote id-type %U data %U",
+ format_ikev2_id_type, p->rem_id.type,
+ format_ip4_address, p->rem_id.data);
+ else if (p->rem_id.type == IKEV2_ID_TYPE_ID_KEY_ID)
+ vlib_cli_output(vm, " remote id-type %U data 0x%U",
+ format_ikev2_id_type, p->rem_id.type,
+ format_hex_bytes, p->rem_id.data,
+ vec_len(p->rem_id.data));
+ else
+ vlib_cli_output(vm, " remote id-type %U data %v",
+ format_ikev2_id_type, p->rem_id.type, p->rem_id.data);
+ }
+
+ if (p->loc_ts.end_addr.as_u32)
+ vlib_cli_output(vm, " local traffic-selector addr %U - %U port %u - %u"
+ " protocol %u",
+ format_ip4_address, &p->loc_ts.start_addr,
+ format_ip4_address, &p->loc_ts.end_addr,
+ p->loc_ts.start_port, p->loc_ts.end_port,
+ p->loc_ts.protocol_id);
+
+ if (p->rem_ts.end_addr.as_u32)
+ vlib_cli_output(vm, " remote traffic-selector addr %U - %U port %u - %u"
+ " protocol %u",
+ format_ip4_address, &p->rem_ts.start_addr,
+ format_ip4_address, &p->rem_ts.end_addr,
+ p->rem_ts.start_port, p->rem_ts.end_port,
+ p->rem_ts.protocol_id);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_ikev2_profile_command, static) = {
+ .path = "show ikev2 profile",
+ .short_help = "show ikev2 profile",
+ .function = show_ikev2_profile_command_fn,
+};
+
+static clib_error_t *
+set_ikev2_local_key_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ clib_error_t * r = 0;
+ u8 * data = 0;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "%v", &data))
+ {
+ r = ikev2_set_local_key(vm, data);
+ goto done;
+ }
+ else
+ break;
+ }
+
+ r = clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+
+done:
+ vec_free(data);
+ unformat_free (line_input);
+ return r;
+}
+
+VLIB_CLI_COMMAND (set_ikev2_local_key_command, static) = {
+ .path = "set ikev2 local key",
+ .short_help =
+ "set ikev2 local key <file>",
+ .function = set_ikev2_local_key_command_fn,
+};
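+/* Example usage (hypothetical path): set ikev2 local key /etc/vpp/server-key.pem */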
+
+clib_error_t *
+ikev2_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ikev2_cli_init);
diff --git a/vnet/vnet/ipsec/ikev2_crypto.c b/vnet/vnet/ipsec/ikev2_crypto.c
new file mode 100644
index 00000000000..b8dce034e3f
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2_crypto.c
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/udp.h>
+#include <vnet/ipsec/ikev2.h>
+#include <vnet/ipsec/ikev2_priv.h>
+#include <openssl/obj_mac.h>
+#include <openssl/ec.h>
+#include <openssl/x509.h>
+#include <openssl/pem.h>
+#include <openssl/bn.h>
+
+/* from RFC7296 */
+static const char modp_dh_768_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A63A3620FFFFFFFFFFFFFFFF";
+static const char modp_dh_768_generator[] = "02";
+
+static const char modp_dh_1024_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED"
+"EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE65381"
+"FFFFFFFFFFFFFFFF";
+static const char modp_dh_1024_generator[] = "02";
+
+/* from RFC3526 */
+static const char modp_dh_1536_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED"
+"EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D"
+"C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F"
+"83655D23DCA3AD961C62F356208552BB9ED529077096966D"
+"670C354E4ABC9804F1746C08CA237327FFFFFFFFFFFFFFFF";
+static const char modp_dh_1536_generator[] = "02";
+
+static const char modp_dh_2048_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED"
+"EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D"
+"C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F"
+"83655D23DCA3AD961C62F356208552BB9ED529077096966D"
+"670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B"
+"E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9"
+"DE2BCBF6955817183995497CEA956AE515D2261898FA0510"
+"15728E5A8AACAA68FFFFFFFFFFFFFFFF";
+static const char modp_dh_2048_generator[] = "02";
+
+static const char modp_dh_3072_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED"
+"EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D"
+"C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F"
+"83655D23DCA3AD961C62F356208552BB9ED529077096966D"
+"670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B"
+"E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9"
+"DE2BCBF6955817183995497CEA956AE515D2261898FA0510"
+"15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64"
+"ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7"
+"ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B"
+"F12FFA06D98A0864D87602733EC86A64521F2B18177B200C"
+"BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31"
+"43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF";
+static const char modp_dh_3072_generator[] = "02";
+
+static const char modp_dh_4096_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED"
+"EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D"
+"C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F"
+"83655D23DCA3AD961C62F356208552BB9ED529077096966D"
+"670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B"
+"E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9"
+"DE2BCBF6955817183995497CEA956AE515D2261898FA0510"
+"15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64"
+"ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7"
+"ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B"
+"F12FFA06D98A0864D87602733EC86A64521F2B18177B200C"
+"BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31"
+"43DB5BFCE0FD108E4B82D120A92108011A723C12A787E6D7"
+"88719A10BDBA5B2699C327186AF4E23C1A946834B6150BDA"
+"2583E9CA2AD44CE8DBBBC2DB04DE8EF92E8EFC141FBECAA6"
+"287C59474E6BC05D99B2964FA090C3A2233BA186515BE7ED"
+"1F612970CEE2D7AFB81BDD762170481CD0069127D5B05AA9"
+"93B4EA988D8FDDC186FFB7DC90A6C08F4DF435C934063199"
+"FFFFFFFFFFFFFFFF";
+static const char modp_dh_4096_generator[] = "02";
+
+static const char modp_dh_6144_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD129024E08"
+"8A67CC74020BBEA63B139B22514A08798E3404DDEF9519B3CD3A431B"
+"302B0A6DF25F14374FE1356D6D51C245E485B576625E7EC6F44C42E9"
+"A637ED6B0BFF5CB6F406B7EDEE386BFB5A899FA5AE9F24117C4B1FE6"
+"49286651ECE45B3DC2007CB8A163BF0598DA48361C55D39A69163FA8"
+"FD24CF5F83655D23DCA3AD961C62F356208552BB9ED529077096966D"
+"670C354E4ABC9804F1746C08CA18217C32905E462E36CE3BE39E772C"
+"180E86039B2783A2EC07A28FB5C55DF06F4C52C9DE2BCBF695581718"
+"3995497CEA956AE515D2261898FA051015728E5A8AAAC42DAD33170D"
+"04507A33A85521ABDF1CBA64ECFB850458DBEF0A8AEA71575D060C7D"
+"B3970F85A6E1E4C7ABF5AE8CDB0933D71E8C94E04A25619DCEE3D226"
+"1AD2EE6BF12FFA06D98A0864D87602733EC86A64521F2B18177B200C"
+"BBE117577A615D6C770988C0BAD946E208E24FA074E5AB3143DB5BFC"
+"E0FD108E4B82D120A92108011A723C12A787E6D788719A10BDBA5B26"
+"99C327186AF4E23C1A946834B6150BDA2583E9CA2AD44CE8DBBBC2DB"
+"04DE8EF92E8EFC141FBECAA6287C59474E6BC05D99B2964FA090C3A2"
+"233BA186515BE7ED1F612970CEE2D7AFB81BDD762170481CD0069127"
+"D5B05AA993B4EA988D8FDDC186FFB7DC90A6C08F4DF435C934028492"
+"36C3FAB4D27C7026C1D4DCB2602646DEC9751E763DBA37BDF8FF9406"
+"AD9E530EE5DB382F413001AEB06A53ED9027D831179727B0865A8918"
+"DA3EDBEBCF9B14ED44CE6CBACED4BB1BDB7F1447E6CC254B33205151"
+"2BD7AF426FB8F401378CD2BF5983CA01C64B92ECF032EA15D1721D03"
+"F482D7CE6E74FEF6D55E702F46980C82B5A84031900B1C9E59E7C97F"
+"BEC7E8F323A97A7E36CC88BE0F1D45B7FF585AC54BD407B22B4154AA"
+"CC8F6D7EBF48E1D814CC5ED20F8037E0A79715EEF29BE32806A1D58B"
+"B7C5DA76F550AA3D8A1FBFF0EB19CCB1A313D55CDA56C9EC2EF29632"
+"387FE8D76E3C0468043E8F663F4860EE12BF2D5B0B7474D6E694F91E"
+"6DCC4024FFFFFFFFFFFFFFFF";
+static const char modp_dh_6144_generator[] = "02";
+
+static const char modp_dh_8192_prime[] =
+"FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1"
+"29024E088A67CC74020BBEA63B139B22514A08798E3404DD"
+"EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245"
+"E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED"
+"EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D"
+"C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F"
+"83655D23DCA3AD961C62F356208552BB9ED529077096966D"
+"670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B"
+"E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9"
+"DE2BCBF6955817183995497CEA956AE515D2261898FA0510"
+"15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64"
+"ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7"
+"ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B"
+"F12FFA06D98A0864D87602733EC86A64521F2B18177B200C"
+"BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31"
+"43DB5BFCE0FD108E4B82D120A92108011A723C12A787E6D7"
+"88719A10BDBA5B2699C327186AF4E23C1A946834B6150BDA"
+"2583E9CA2AD44CE8DBBBC2DB04DE8EF92E8EFC141FBECAA6"
+"287C59474E6BC05D99B2964FA090C3A2233BA186515BE7ED"
+"1F612970CEE2D7AFB81BDD762170481CD0069127D5B05AA9"
+"93B4EA988D8FDDC186FFB7DC90A6C08F4DF435C934028492"
+"36C3FAB4D27C7026C1D4DCB2602646DEC9751E763DBA37BD"
+"F8FF9406AD9E530EE5DB382F413001AEB06A53ED9027D831"
+"179727B0865A8918DA3EDBEBCF9B14ED44CE6CBACED4BB1B"
+"DB7F1447E6CC254B332051512BD7AF426FB8F401378CD2BF"
+"5983CA01C64B92ECF032EA15D1721D03F482D7CE6E74FEF6"
+"D55E702F46980C82B5A84031900B1C9E59E7C97FBEC7E8F3"
+"23A97A7E36CC88BE0F1D45B7FF585AC54BD407B22B4154AA"
+"CC8F6D7EBF48E1D814CC5ED20F8037E0A79715EEF29BE328"
+"06A1D58BB7C5DA76F550AA3D8A1FBFF0EB19CCB1A313D55C"
+"DA56C9EC2EF29632387FE8D76E3C0468043E8F663F4860EE"
+"12BF2D5B0B7474D6E694F91E6DBE115974A3926F12FEE5E4"
+"38777CB6A932DF8CD8BEC4D073B931BA3BC832B68D9DD300"
+"741FA7BF8AFC47ED2576F6936BA424663AAB639C5AE4F568"
+"3423B4742BF1C978238F16CBE39D652DE3FDB8BEFC848AD9"
+"22222E04A4037C0713EB57A81A23F0C73473FC646CEA306B"
+"4BCBC8862F8385DDFA9D4B7FA2C087E879683303ED5BDD3A"
+"062B3CF5B3A278A66D2A13F83F44F82DDF310EE074AB6A36"
+"4597E899A0255DC164F31CC50846851DF9AB48195DED7EA1"
+"B1D510BD7EE74D73FAF36BC31ECFA268359046F4EB879F92"
+"4009438B481C6CD7889A002ED5EE382BC9190DA6FC026E47"
+"9558E4475677E9AA9E3050E2765694DFC81F56E880B96E71"
+"60C980DD98EDD3DFFFFFFFFFFFFFFFFF";
+static const char modp_dh_8192_generator[] = "02";
+
+/* from RFC5114 */
+static const char modp_dh_1024_160_prime[] =
+"B10B8F96A080E01DDE92DE5EAE5D54EC52C99FBCFB06A3C6"
+"9A6A9DCA52D23B616073E28675A23D189838EF1E2EE652C0"
+"13ECB4AEA906112324975C3CD49B83BFACCBDD7D90C4BD70"
+"98488E9C219A73724EFFD6FAE5644738FAA31A4FF55BCCC0"
+"A151AF5F0DC8B4BD45BF37DF365C1A65E68CFDA76D4DA708"
+"DF1FB2BC2E4A4371";
+static const char modp_dh_1024_160_generator[] =
+"A4D1CBD5C3FD34126765A442EFB99905F8104DD258AC507F"
+"D6406CFF14266D31266FEA1E5C41564B777E690F5504F213"
+"160217B4B01B886A5E91547F9E2749F4D7FBD7D3B9A92EE1"
+"909D0D2263F80A76A6A24C087A091F531DBF0A0169B6A28A"
+"D662A4D18E73AFA32D779D5918D08BC8858F4DCEF97C2A24"
+"855E6EEB22B3B2E5";
+
+static const char modp_dh_2048_224_prime[] =
+"AD107E1E9123A9D0D660FAA79559C51FA20D64E5683B9FD1"
+"B54B1597B61D0A75E6FA141DF95A56DBAF9A3C407BA1DF15"
+"EB3D688A309C180E1DE6B85A1274A0A66D3F8152AD6AC212"
+"9037C9EDEFDA4DF8D91E8FEF55B7394B7AD5B7D0B6C12207"
+"C9F98D11ED34DBF6C6BA0B2C8BBC27BE6A00E0A0B9C49708"
+"B3BF8A317091883681286130BC8985DB1602E714415D9330"
+"278273C7DE31EFDC7310F7121FD5A07415987D9ADC0A486D"
+"CDF93ACC44328387315D75E198C641A480CD86A1B9E587E8"
+"BE60E69CC928B2B9C52172E413042E9B23F10B0E16E79763"
+"C9B53DCF4BA80A29E3FB73C16B8E75B97EF363E2FFA31F71"
+"CF9DE5384E71B81C0AC4DFFE0C10E64F";
+static const char modp_dh_2048_224_generator[] =
+"AC4032EF4F2D9AE39DF30B5C8FFDAC506CDEBE7B89998CAF"
+"74866A08CFE4FFE3A6824A4E10B9A6F0DD921F01A70C4AFA"
+"AB739D7700C29F52C57DB17C620A8652BE5E9001A8D66AD7"
+"C17669101999024AF4D027275AC1348BB8A762D0521BC98A"
+"E247150422EA1ED409939D54DA7460CDB5F6C6B250717CBE"
+"F180EB34118E98D119529A45D6F834566E3025E316A330EF"
+"BB77A86F0C1AB15B051AE3D428C8F8ACB70A8137150B8EEB"
+"10E183EDD19963DDD9E263E4770589EF6AA21E7F5F2FF381"
+"B539CCE3409D13CD566AFBB48D6C019181E1BCFE94B30269"
+"EDFE72FE9B6AA4BD7B5A0F1C71CFFF4C19C418E1F6EC0179"
+"81BC087F2A7065B384B890D3191F2BFA";
+
+static const char modp_dh_2048_256_prime[] =
+"87A8E61DB4B6663CFFBBD19C651959998CEEF608660DD0F2"
+"5D2CEED4435E3B00E00DF8F1D61957D4FAF7DF4561B2AA30"
+"16C3D91134096FAA3BF4296D830E9A7C209E0C6497517ABD"
+"5A8A9D306BCF67ED91F9E6725B4758C022E0B1EF4275BF7B"
+"6C5BFC11D45F9088B941F54EB1E59BB8BC39A0BF12307F5C"
+"4FDB70C581B23F76B63ACAE1CAA6B7902D52526735488A0E"
+"F13C6D9A51BFA4AB3AD8347796524D8EF6A167B5A41825D9"
+"67E144E5140564251CCACB83E6B486F6B3CA3F7971506026"
+"C0B857F689962856DED4010ABD0BE621C3A3960A54E710C3"
+"75F26375D7014103A4B54330C198AF126116D2276E11715F"
+"693877FAD7EF09CADB094AE91E1A1597";
+static const char modp_dh_2048_256_generator[] =
+"3FB32C9B73134D0B2E77506660EDBD484CA7B18F21EF2054"
+"07F4793A1A0BA12510DBC15077BE463FFF4FED4AAC0BB555"
+"BE3A6C1B0C6B47B1BC3773BF7E8C6F62901228F8C28CBB18"
+"A55AE31341000A650196F931C77A57F2DDF463E5E9EC144B"
+"777DE62AAAB8A8628AC376D282D6ED3864E67982428EBC83"
+"1D14348F6F2F9193B5045AF2767164E1DFC967C1FB3F2E55"
+"A4BD1BFFE83B9C80D052B985D182EA0ADB2A3B7313D3FE14"
+"C8484B1E052588B9B7D2BBD2DF016199ECD06E1557CD0915"
+"B3353BBB64E0EC377FD028370DF92B52C7891428CDC67EB6"
+"184B523D1DB246C32F63078490F00EF8D647D148D4795451"
+"5E2327CFEF98C582664B4C0F6CC41659";
+
+v8 *
+ikev2_calc_prf(ikev2_sa_transform_t * tr, v8 * key, v8 * data)
+{
+ HMAC_CTX ctx;
+ v8 * prf;
+ unsigned int len = 0;
+
+ prf = vec_new(u8, tr->key_trunc);
+ HMAC_CTX_init(&ctx);
+ HMAC_Init_ex(&ctx, key, vec_len(key), tr->md, NULL);
+ HMAC_Update(&ctx, data, vec_len(data));
+ HMAC_Final(&ctx, prf, &len);
+ HMAC_CTX_cleanup(&ctx);
+
+ ASSERT(len == tr->key_trunc);
+
+ return prf;
+}
+u8 *
+ikev2_calc_prfplus(ikev2_sa_transform_t * tr, u8 * key, u8 * seed, int len)
+{
+ v8 * t = 0, * s = 0, * tmp = 0, * ret = 0;
+ u8 x = 0;
+
+ /* prf+ (K,S) = T1 | T2 | T3 | T4 | ...
+
+ where:
+ T1 = prf (K, S | 0x01)
+ T2 = prf (K, T1 | S | 0x02)
+ T3 = prf (K, T2 | S | 0x03)
+ T4 = prf (K, T3 | S | 0x04)
+ */
+
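+  /* See RFC 7296, section 2.13.  The single-octet counter limits prf+ to
+     255 output blocks, hence the x < 255 guard below (on overflow the
+     partial result is freed and 0 is returned). */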
+ while (vec_len(ret) < len && x < 255) {
+ if (t) {
+ vec_append(s, t);
+ vec_free(t);
+ }
+
+ vec_append(s, seed);
+ vec_add2(s, tmp, 1);
+ *tmp = x + 1;
+ t = ikev2_calc_prf(tr, key, s);
+ vec_append(ret, t);
+ vec_free(s);
+ x++;
+ }
+
+ vec_free(t);
+
+ if (x == 255) {
+ vec_free(ret);
+ }
+
+ return ret;
+}
+
+v8 *
+ikev2_calc_integr(ikev2_sa_transform_t * tr, v8 * key, u8 * data, int len)
+{
+ v8 * r;
+ HMAC_CTX hctx;
+ unsigned int l;
+
+ ASSERT(tr->type == IKEV2_TRANSFORM_TYPE_INTEG);
+
+ r = vec_new(u8, tr->key_len);
+
+  /* compute the integrity check value (HMAC) over data; the caller
+     compares it against the received checksum */
+ HMAC_CTX_init(&hctx);
+ HMAC_Init(&hctx, key, vec_len(key), tr->md);
+ HMAC_Update(&hctx, (const u8 *) data, len);
+ HMAC_Final(&hctx, r, &l);
+ HMAC_CTX_cleanup(&hctx);
+
+ ASSERT(l == tr->key_len);
+
+ return r;
+}
+
+v8 *
+ikev2_decrypt_data(ikev2_sa_t * sa, u8 * data, int len)
+{
+ EVP_CIPHER_CTX ctx;
+ v8 * r;
+ int out_len = 0, block_size;
+ ikev2_sa_transform_t * tr_encr;
+
+ tr_encr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ block_size = tr_encr->block_size;
+
+  /* check that the data length is a multiple of the cipher block size */
+ if (len % block_size) {
+ clib_warning("wrong data length");
+ return 0;
+ }
+
+ EVP_CIPHER_CTX_init(&ctx);
+ r = vec_new(u8, len - block_size);
+ EVP_DecryptInit_ex(&ctx, tr_encr->cipher, NULL, sa->sk_ei, data);
+ EVP_DecryptUpdate(&ctx, r, &out_len, data+block_size, len-block_size);
+ EVP_DecryptFinal_ex(&ctx, r + out_len, &out_len);
+
+  /* strip padding: the last plaintext octet holds the Pad Length, so drop
+     that many octets plus the length octet itself */
+ _vec_len(r) -= r[vec_len(r)-1] + 1;
+
+ EVP_CIPHER_CTX_cleanup(&ctx);
+ return r;
+}
+
+int
+ikev2_encrypt_data(ikev2_sa_t * sa, v8 * src, u8 * dst)
+{
+ EVP_CIPHER_CTX ctx;
+ int out_len;
+ int bs;
+ ikev2_sa_transform_t * tr_encr;
+
+ tr_encr = ikev2_sa_get_td_for_type(sa->r_proposals, IKEV2_TRANSFORM_TYPE_ENCR);
+ bs = tr_encr->block_size;
+
+ /* generate IV */
+ RAND_bytes(dst, bs);
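+  /* output layout: random IV in the first cipher block, ciphertext follows */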
+
+ EVP_CIPHER_CTX_init(&ctx);
+
+ EVP_EncryptInit_ex(&ctx, tr_encr->cipher, NULL, sa->sk_er, dst /* dst */ );
+ EVP_EncryptUpdate(&ctx, dst + bs, &out_len, src, vec_len(src));
+
+ EVP_CIPHER_CTX_cleanup(&ctx);
+
+ ASSERT(vec_len(src) == out_len);
+
+ return out_len + bs;
+}
+
+void
+ikev2_generate_dh(ikev2_sa_t * sa, ikev2_sa_transform_t * t)
+{
+ int r;
+
+ if (t->dh_group == IKEV2_DH_GROUP_MODP)
+ {
+ DH * dh = DH_new();
+ BN_hex2bn(&dh->p, t->dh_p);
+ BN_hex2bn(&dh->g, t->dh_g);
+ DH_generate_key(dh);
+
+ sa->r_dh_data = vec_new(u8, t->key_len);
+ r = BN_bn2bin(dh->pub_key, sa->r_dh_data);
+ ASSERT(r == t->key_len);
+
+ BIGNUM *ex;
+ sa->dh_shared_key = vec_new(u8, t->key_len);
+ ex = BN_bin2bn(sa->i_dh_data, vec_len(sa->i_dh_data) , NULL);
+ r = DH_compute_key(sa->dh_shared_key, ex, dh);
+ ASSERT(r == t->key_len);
+ BN_clear_free(ex);
+ DH_free(dh);
+ }
+ else if (t->dh_group == IKEV2_DH_GROUP_ECP)
+ {
+ EC_KEY * ec = EC_KEY_new_by_curve_name(t->nid);
+ ASSERT(ec);
+
+ EC_KEY_generate_key(ec);
+
+ const EC_POINT * r_point = EC_KEY_get0_public_key(ec);
+ const EC_GROUP * group = EC_KEY_get0_group(ec);
+ BIGNUM * x = NULL, * y = NULL;
+ BN_CTX * bn_ctx = BN_CTX_new();
+ u16 x_off, y_off, len;
+ EC_POINT * i_point = EC_POINT_new(group);
+ EC_POINT * shared_point = EC_POINT_new(group);
+
+ x = BN_new();
+ y = BN_new();
+ len = t->key_len / 2;
+
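+      /* RFC 5903: the ECP Diffie-Hellman public value is the affine x and
+         y coordinates concatenated, each left-padded with zeros to len
+         octets. */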
+ EC_POINT_get_affine_coordinates_GFp(group, r_point, x, y, bn_ctx);
+ sa->r_dh_data = vec_new(u8, t->key_len);
+ x_off = len - BN_num_bytes(x);
+ memset(sa->r_dh_data, 0, x_off);
+ BN_bn2bin(x, sa->r_dh_data + x_off);
+ y_off = t->key_len - BN_num_bytes(y);
+ memset(sa->r_dh_data + len, 0, y_off - len);
+ BN_bn2bin(y, sa->r_dh_data + y_off);
+
+ x = BN_bin2bn(sa->i_dh_data, len, x);
+ y = BN_bin2bn(sa->i_dh_data + len, len, y);
+ EC_POINT_set_affine_coordinates_GFp(group, i_point, x, y, bn_ctx);
+ sa->dh_shared_key = vec_new(u8, t->key_len);
+ EC_POINT_mul(group, shared_point, NULL, i_point, EC_KEY_get0_private_key(ec), NULL);
+ EC_POINT_get_affine_coordinates_GFp(group, shared_point, x, y, bn_ctx);
+ x_off = len - BN_num_bytes(x);
+ memset(sa->dh_shared_key, 0, x_off);
+ BN_bn2bin(x, sa->dh_shared_key + x_off);
+ y_off = t->key_len - BN_num_bytes(y);
+ memset(sa->dh_shared_key + len, 0, y_off - len);
+ BN_bn2bin(y, sa->dh_shared_key + y_off);
+
+ EC_KEY_free(ec);
+ BN_free(x);
+ BN_free(y);
+ BN_CTX_free(bn_ctx);
+ EC_POINT_free(i_point);
+ EC_POINT_free(shared_point);
+ }
+}
+
+int
+ikev2_verify_sign (EVP_PKEY *pkey, u8 * sigbuf, u8 * data)
+{
+ EVP_MD_CTX md_ctx;
+
+ EVP_VerifyInit(&md_ctx, EVP_sha1());
+ EVP_VerifyUpdate(&md_ctx, data, vec_len(data));
+
+ return EVP_VerifyFinal(&md_ctx, sigbuf, vec_len(sigbuf), pkey);
+}
+
+u8 *
+ikev2_calc_sign (EVP_PKEY *pkey, u8 * data)
+{
+ EVP_MD_CTX md_ctx;
+ unsigned int sig_len = 0;
+ u8 * sign;
+
+ EVP_SignInit(&md_ctx, EVP_sha1());
+ EVP_SignUpdate(&md_ctx, data, vec_len(data));
+ /* get sign len */
+ EVP_SignFinal(&md_ctx, NULL, &sig_len, pkey);
+ sign = vec_new(u8, sig_len);
+ /* calc sign */
+ EVP_SignFinal(&md_ctx, sign, &sig_len, pkey);
+
+ return sign;
+}
+
+EVP_PKEY *
+ikev2_load_cert_file (u8 * file)
+{
+ FILE * fp;
+ X509 * x509;
+ EVP_PKEY * pkey = NULL;
+
+ fp = fopen((char *)file, "r");
+ if (!fp)
+ {
+ clib_warning("open %s failed", file);
+ goto end;
+ }
+
+ x509 = PEM_read_X509(fp, NULL, NULL, NULL);
+ fclose(fp);
+ if (x509 == NULL)
+ {
+ clib_warning("read cert %s failed", file);
+ goto end;
+ }
+
+ pkey = X509_get_pubkey(x509);
+ if (pkey == NULL)
+ clib_warning("get pubkey %s failed", file);
+
+end:
+ return pkey;
+}
+
+EVP_PKEY *
+ikev2_load_key_file (u8 * file)
+{
+ FILE *fp;
+ EVP_PKEY * pkey = NULL;
+
+ fp = fopen((char *)file, "r");
+ if (!fp)
+ {
+ clib_warning("open %s failed", file);
+ goto end;
+ }
+
+ pkey = PEM_read_PrivateKey(fp, NULL, NULL, NULL);
+ fclose(fp);
+ if (pkey == NULL)
+ clib_warning("read %s failed", file);
+
+end:
+ return pkey;
+}
+
+void
+ikev2_crypto_init (ikev2_main_t * km)
+{
+ ikev2_sa_transform_t * tr;
+
+ /* vector of supported transforms - in order of preference */
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_ENCR;
+ tr->encr_type = IKEV2_TRANSFORM_ENCR_TYPE_AES_CBC;
+ tr->key_len = 256/8;
+ tr->block_size = 128/8;
+ tr->cipher = EVP_aes_256_cbc();
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_ENCR;
+ tr->encr_type = IKEV2_TRANSFORM_ENCR_TYPE_AES_CBC;
+ tr->key_len = 192/8;
+ tr->block_size = 128/8;
+ tr->cipher = EVP_aes_192_cbc();
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_ENCR;
+ tr->encr_type = IKEV2_TRANSFORM_ENCR_TYPE_AES_CBC;
+ tr->key_len = 128/8;
+ tr->block_size = 128/8;
+ tr->cipher = EVP_aes_128_cbc();
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_PRF;
+ tr->prf_type = IKEV2_TRANSFORM_PRF_TYPE_PRF_HMAC_SHA1;
+ tr->key_len = 160/8;
+ tr->key_trunc = 160/8;
+ tr->md = EVP_sha1();
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_INTEG;
+ tr->integ_type = IKEV2_TRANSFORM_INTEG_TYPE_AUTH_HMAC_SHA1_96;
+ tr->key_len = 160/8;
+ tr->key_trunc = 96/8;
+ tr->md = EVP_sha1();
+
+#if defined(OPENSSL_NO_CISCO_FECDH)
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_BRAINPOOL_512;
+ tr->key_len = (512 * 2)/8;
+ tr->nid = NID_brainpoolP512r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_BRAINPOOL_384;
+ tr->key_len = (384 * 2)/8;
+ tr->nid = NID_brainpoolP384r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_BRAINPOOL_256;
+ tr->key_len = (256 * 2)/8;
+ tr->nid = NID_brainpoolP256r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_BRAINPOOL_224;
+ tr->key_len = (224 * 2)/8;
+ tr->nid = NID_brainpoolP224r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_ECP_224;
+ tr->key_len = (224 * 2)/8;
+ tr->nid = NID_secp224r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+#endif
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_ECP_521;
+ tr->key_len = (528 * 2)/8;
+ tr->nid = NID_secp521r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_ECP_384;
+ tr->key_len = (384 * 2)/8;
+ tr->nid = NID_secp384r1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_ECP_256;
+ tr->key_len = (256 * 2)/8;
+ tr->nid = NID_X9_62_prime256v1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_ECP_192;
+ tr->key_len = (192 * 2)/8;
+ tr->nid = NID_X9_62_prime192v1;
+ tr->dh_group = IKEV2_DH_GROUP_ECP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_2048_256;
+ tr->key_len = 2048/8;
+ tr->dh_p = (const char *) &modp_dh_2048_256_prime;
+ tr->dh_g = (const char *) &modp_dh_2048_256_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_2048_224;
+ tr->key_len = 2048/8;
+ tr->dh_p = (const char *) &modp_dh_2048_224_prime;
+ tr->dh_g = (const char *) &modp_dh_2048_224_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_1024_160;
+ tr->key_len = 1024/8;
+ tr->dh_p = (const char *) &modp_dh_1024_160_prime;
+ tr->dh_g = (const char *) &modp_dh_1024_160_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_8192;
+ tr->key_len = 8192/8;
+ tr->dh_p = (const char *) &modp_dh_8192_prime;
+ tr->dh_g = (const char *) &modp_dh_8192_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_6144;
+ tr->key_len = 6144/8;
+ tr->dh_p = (const char *) &modp_dh_6144_prime;
+ tr->dh_g = (const char *) &modp_dh_6144_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_4096;
+ tr->key_len = 4096/8;
+ tr->dh_p = (const char *) &modp_dh_4096_prime;
+ tr->dh_g = (const char *) &modp_dh_4096_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_3072;
+ tr->key_len = 3072/8;
+ tr->dh_p = (const char *) &modp_dh_3072_prime;
+ tr->dh_g = (const char *) &modp_dh_3072_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_2048;
+ tr->key_len = 2048/8;
+ tr->dh_p = (const char *) &modp_dh_2048_prime;
+ tr->dh_g = (const char *) &modp_dh_2048_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_1536;
+ tr->key_len = 1536/8;
+ tr->dh_p = (const char *) &modp_dh_1536_prime;
+ tr->dh_g = (const char *) &modp_dh_1536_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_1024;
+ tr->key_len = 1024/8;
+ tr->dh_p = (const char *) &modp_dh_1024_prime;
+ tr->dh_g = (const char *) &modp_dh_1024_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_DH;
+ tr->dh_type = IKEV2_TRANSFORM_DH_TYPE_MODP_768;
+ tr->key_len = 768/8;
+ tr->dh_p = (const char *) &modp_dh_768_prime;
+ tr->dh_g = (const char *) &modp_dh_768_generator;
+ tr->dh_group = IKEV2_DH_GROUP_MODP;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_ESN;
+ tr->esn_type = IKEV2_TRANSFORM_ESN_TYPE_ESN;
+
+ vec_add2(km->supported_transforms, tr, 1);
+ tr->type = IKEV2_TRANSFORM_TYPE_ESN;
+ tr->esn_type = IKEV2_TRANSFORM_ESN_TYPE_NO_ESN;
+}
+
+
diff --git a/vnet/vnet/ipsec/ikev2_format.c b/vnet/vnet/ipsec/ikev2_format.c
new file mode 100644
index 00000000000..b5f047f3f79
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2_format.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+#include <vnet/interface.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/ikev2.h>
+#include <vnet/ipsec/ikev2_priv.h>
+
+u8 * format_ikev2_sa_transform(u8 * s, va_list * args)
+{
+ ikev2_sa_transform_t * tr = va_arg (*args, ikev2_sa_transform_t *);
+
+ if (!tr)
+ return s;
+
+ if (tr->type >= IKEV2_TRANSFORM_NUM_TYPES)
+ return s;
+
+ s = format(s,"%U:", format_ikev2_transform_type, tr->type);
+
+ switch (tr->type)
+ {
+ case IKEV2_TRANSFORM_TYPE_ENCR:
+ s = format(s, "%U", format_ikev2_transform_encr_type, tr->encr_type);
+ break;
+ case IKEV2_TRANSFORM_TYPE_PRF:
+ s = format(s, "%U", format_ikev2_transform_prf_type, tr->prf_type);
+ break;
+ case IKEV2_TRANSFORM_TYPE_INTEG:
+ s = format(s, "%U", format_ikev2_transform_integ_type, tr->integ_type);
+ break;
+ case IKEV2_TRANSFORM_TYPE_DH:
+ s = format(s, "%U", format_ikev2_transform_dh_type, tr->dh_type);
+ break;
+ case IKEV2_TRANSFORM_TYPE_ESN:
+ s = format(s, "%U", format_ikev2_transform_esn_type, tr->esn_type);
+ break;
+ default:
+ break;
+ }
+
+ if (tr->type == IKEV2_TRANSFORM_TYPE_ENCR &&
+ tr->encr_type == IKEV2_TRANSFORM_ENCR_TYPE_AES_CBC && tr->key_len)
+ s = format(s, "-%u", tr->key_len * 8);
+ else if (vec_len(tr->attrs) == 4 && tr->attrs[0] == 0x80 && tr->attrs[1] == 0x0e)
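+    /* Key Length attribute: type 14 in TV format (0x800e); RFC 7296, section 3.3.5 */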
+ s = format(s, "-%u", tr->attrs[2] * 256 + tr->attrs[3]);
+ else if (vec_len(tr->attrs))
+ s = format(s, "(unknown attr %U)", format_hex_bytes,
+ tr->attrs, vec_len(tr->attrs));
+
+ return s;
+}
+
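+/* X-macro helpers: paired with a _() definition per value/name/string
+ * table, these expand into a format_ikev2_<name> and an
+ * unformat_ikev2_<name> function for each foreach_ikev2_<name> list. */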
+#define MACRO_FORMAT(lc) \
+u8 * format_ikev2_##lc (u8 * s, va_list * args) \
+{ \
+ u32 i = va_arg (*args, u32); \
+ char * t = 0; \
+ switch (i) { \
+ foreach_ikev2_##lc \
+ default: \
+ return format (s, "unknown (%u)", i); \
+ } \
+ s = format (s, "%s", t); \
+ return s; \
+}
+
+#define MACRO_UNFORMAT(lc) \
+uword \
+unformat_ikev2_##lc (unformat_input_t * input, \
+ va_list * args) \
+{ \
+ u32 * r = va_arg (*args, u32 *); \
+ if (0) ; \
+ foreach_ikev2_##lc \
+ else \
+ return 0; \
+ return 1; \
+}
+
+#define _(v,f,str) case IKEV2_AUTH_METHOD_##f: t = str; break;
+MACRO_FORMAT(auth_method)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_AUTH_METHOD_##f;
+MACRO_UNFORMAT(auth_method)
+#undef _
+
+#define _(v,f,str) case IKEV2_TRANSFORM_TYPE_##f: t = str; break;
+MACRO_FORMAT(transform_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_TRANSFORM_TYPE_##f;
+MACRO_UNFORMAT(transform_type)
+#undef _
+
+#define _(v,f) case IKEV2_NOTIFY_MSG_##f: t = #f; break;
+MACRO_FORMAT(notify_msg_type)
+#undef _
+
+#define _(v,f,str) case IKEV2_ID_TYPE_##f: t = str; break;
+MACRO_FORMAT(id_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_ID_TYPE_##f;
+MACRO_UNFORMAT(id_type)
+#undef _
+
+#define _(v,f,str) case IKEV2_TRANSFORM_ENCR_TYPE_##f: t = str; break;
+MACRO_FORMAT(transform_encr_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_TRANSFORM_ENCR_TYPE_##f;
+MACRO_UNFORMAT(transform_encr_type)
+#undef _
+
+#define _(v,f,str) case IKEV2_TRANSFORM_PRF_TYPE_##f: t = str; break;
+MACRO_FORMAT(transform_prf_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_TRANSFORM_PRF_TYPE_##f;
+MACRO_UNFORMAT(transform_prf_type)
+#undef _
+
+#define _(v,f,str) case IKEV2_TRANSFORM_INTEG_TYPE_##f: t = str; break;
+MACRO_FORMAT(transform_integ_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_TRANSFORM_INTEG_TYPE_##f;
+MACRO_UNFORMAT(transform_integ_type)
+#undef _
+
+#define _(v,f,str) case IKEV2_TRANSFORM_DH_TYPE_##f: t = str; break;
+MACRO_FORMAT(transform_dh_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_TRANSFORM_DH_TYPE_##f;
+MACRO_UNFORMAT(transform_dh_type)
+#undef _
+
+#define _(v,f,str) case IKEV2_TRANSFORM_ESN_TYPE_##f: t = str; break;
+MACRO_FORMAT(transform_esn_type)
+#undef _
+#define _(v,f,str) else if (unformat (input, str)) *r = IKEV2_TRANSFORM_ESN_TYPE_##f;
+MACRO_UNFORMAT(transform_esn_type)
+#undef _
+
diff --git a/vnet/vnet/ipsec/ikev2_payload.c b/vnet/vnet/ipsec/ikev2_payload.c
new file mode 100644
index 00000000000..f523fa81cba
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2_payload.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+#include <vnet/interface.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/ikev2.h>
+#include <vnet/ipsec/ikev2_priv.h>
+#include <ctype.h> /* isprint(), used by ikev2_parse_vendor_payload() */
+
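+/* The packed structures below mirror RFC 7296 wire formats: Notify
+ * (section 3.10), Traffic Selector (3.13), Proposal substructure (3.3.1),
+ * Transform substructure (3.3.2) and Delete (3.11) payloads. */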
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u8 protocol_id;
+ u8 spi_size;
+ u16 msg_type;
+ u8 payload[0];
+}) ike_notify_payload_header_t;
+
+typedef CLIB_PACKED (struct {
+ u8 ts_type;
+ u8 protocol_id;
+ u16 selector_len;
+ u16 start_port;
+ u16 end_port;
+ ip4_address_t start_addr;
+ ip4_address_t end_addr;
+}) ikev2_ts_payload_entry_t;
+
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u8 num_ts;
+ u8 reserved[3];
+ ikev2_ts_payload_entry_t ts[0];
+}) ike_ts_payload_header_t;
+
+typedef CLIB_PACKED (struct {
+ u8 last_or_more;
+ u8 reserved;
+ u16 proposal_len;
+ u8 proposal_num;
+ u8 protocol_id;
+ u8 spi_size;
+ u8 num_transforms;
+ u32 spi[0];
+}) ike_sa_proposal_data_t;
+
+typedef CLIB_PACKED (struct {
+ u8 last_or_more;
+ u8 reserved;
+ u16 transform_len;
+ u8 transform_type;
+ u8 reserved2;
+ u16 transform_id;
+ u8 attributes[0];
+}) ike_sa_transform_data_t;
+
+typedef CLIB_PACKED (struct {
+ u8 nextpayload;
+ u8 flags;
+ u16 length;
+ u8 protocol_id;
+ u8 spi_size;
+ u16 num_of_spi;
+ u32 spi[0];
+}) ike_delete_payload_header_t;
+
+static ike_payload_header_t *
+ikev2_payload_add_hdr(ikev2_payload_chain_t * c, u8 payload_type, int len)
+{
+ ike_payload_header_t * hdr = (ike_payload_header_t *) &c->data[c->last_hdr_off];
+ u8 * tmp;
+
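+  /* link from the previous payload header, or record the first payload
+     type when the chain is still empty */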
+ if (c->data)
+ hdr->nextpayload = payload_type;
+ else
+ c->first_payload_type = payload_type;
+
+ c->last_hdr_off = vec_len(c->data);
+ vec_add2(c->data, tmp, len);
+ hdr = (ike_payload_header_t *) tmp;
+ memset(hdr, 0, len);
+
+ hdr->length = clib_host_to_net_u16(len);
+
+ return hdr;
+}
+
+static void
+ikev2_payload_add_data(ikev2_payload_chain_t * c, u8 * data)
+{
+ u16 len;
+ ike_payload_header_t * hdr;
+
+ vec_append(c->data, data);
+ hdr = (ike_payload_header_t *) &c->data[c->last_hdr_off];
+ len = clib_net_to_host_u16(hdr->length);
+ hdr->length = clib_host_to_net_u16(len + vec_len(data));
+}
+
+void
+ikev2_payload_add_notify(ikev2_payload_chain_t * c, u16 msg_type, u8 * data)
+{
+ ike_notify_payload_header_t * n;
+
+ n = (ike_notify_payload_header_t *) ikev2_payload_add_hdr(c, IKEV2_PAYLOAD_NOTIFY, sizeof (*n));
+ n->msg_type = clib_host_to_net_u16(msg_type);
+ ikev2_payload_add_data(c, data);
+}
+
+void
+ikev2_payload_add_sa(ikev2_payload_chain_t * c, ikev2_sa_proposal_t * proposals)
+{
+ ike_payload_header_t * ph;
+ ike_sa_proposal_data_t * prop;
+ ike_sa_transform_data_t * tr;
+ ikev2_sa_proposal_t * p;
+ ikev2_sa_transform_t * t;
+
+ u8 * tmp;
+ u8 * pr_data = 0;
+ u8 * tr_data = 0;
+
+ ikev2_payload_add_hdr(c, IKEV2_PAYLOAD_SA, sizeof (*ph));
+
+ vec_foreach(p, proposals)
+ {
+ int spi_size = (p->protocol_id == IKEV2_PROTOCOL_ESP) ? 4 : 0;
+ pr_data = vec_new(u8, sizeof(ike_sa_proposal_data_t) + spi_size);
+ prop = (ike_sa_proposal_data_t *) pr_data;
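+    /* Last Substruc field (RFC 7296, section 3.3.1): 0 = last proposal,
+       2 = more proposals follow; transform substructures use 3 for
+       "more" (section 3.3.2). */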
+ prop->last_or_more = proposals - p + 1 < vec_len(proposals) ? 2 : 0;
+ prop->protocol_id = p->protocol_id;
+ prop->proposal_num = p->proposal_num;
+ prop->spi_size = spi_size;
+ prop->num_transforms = vec_len(p->transforms);
+
+ if (spi_size)
+ prop->spi[0] = clib_host_to_net_u32(p->spi);
+
+ DBG_PLD("proposal num %u protocol_id %u last_or_more %u spi_size %u%s%U",
+ prop->proposal_num, prop->protocol_id, prop->last_or_more,
+ prop->spi_size, prop->spi_size ? " spi_data " : "",
+ format_hex_bytes, prop->spi, prop->spi_size);
+
+ vec_foreach(t, p->transforms)
+ {
+ vec_add2(tr_data, tmp, sizeof(*tr) + vec_len(t->attrs));
+ tr = (ike_sa_transform_data_t *) tmp;
+ tr->last_or_more = ((t - p->transforms) + 1 < vec_len(p->transforms)) ? 3 : 0;
+ tr->transform_type = t->type;
+ tr->transform_id = clib_host_to_net_u16(t->transform_id);
+ tr->transform_len = clib_host_to_net_u16(sizeof(*tr) + vec_len(t->attrs));
+
+ if (vec_len(t->attrs) > 0)
+ memcpy(tr->attributes, t->attrs, vec_len(t->attrs));
+
+ DBG_PLD("transform type %U transform_id %u last_or_more %u attr_size %u%s%U",
+ format_ikev2_transform_type, tr->transform_type,
+ t->transform_id, tr->last_or_more, vec_len(t->attrs),
+ vec_len(t->attrs) ? " attrs " : "",
+ format_hex_bytes, tr->attributes, vec_len(t->attrs));
+ }
+
+ prop->proposal_len = clib_host_to_net_u16(vec_len(tr_data) + vec_len(pr_data));
+ ikev2_payload_add_data(c, pr_data);
+ ikev2_payload_add_data(c, tr_data);
+ vec_free(pr_data);
+ vec_free(tr_data);
+ }
+}
+
+void
+ikev2_payload_add_ke(ikev2_payload_chain_t * c, u16 dh_group, u8 * dh_data)
+{
+ ike_ke_payload_header_t * ke;
+ ke = (ike_ke_payload_header_t *) ikev2_payload_add_hdr(c, IKEV2_PAYLOAD_KE,
+ sizeof (*ke));
+
+ ke->dh_group = clib_host_to_net_u16(dh_group);
+ ikev2_payload_add_data(c, dh_data);
+}
+
+void
+ikev2_payload_add_nonce(ikev2_payload_chain_t * c, u8 * nonce)
+{
+ ikev2_payload_add_hdr(c, IKEV2_PAYLOAD_NONCE, sizeof (ike_payload_header_t));
+ ikev2_payload_add_data(c, nonce);
+}
+
+void
+ikev2_payload_add_id(ikev2_payload_chain_t *c, ikev2_id_t * id, u8 type)
+{
+ ike_id_payload_header_t * idp;
+ idp = (ike_id_payload_header_t *) ikev2_payload_add_hdr(c, type, sizeof (*idp));
+
+ idp->id_type = id->type;
+ ikev2_payload_add_data(c, id->data);
+}
+
+void
+ikev2_payload_add_delete(ikev2_payload_chain_t *c, ikev2_delete_t * d)
+{
+ ike_delete_payload_header_t * dp;
+ u16 num_of_spi = vec_len(d);
+ ikev2_delete_t * d2;
+ dp = (ike_delete_payload_header_t *) ikev2_payload_add_hdr(c, IKEV2_PAYLOAD_DELETE,
+ sizeof (*dp));
+
+ if (d[0].protocol_id == IKEV2_PROTOCOL_IKE)
+ {
+ dp->protocol_id = 1;
+ }
+ else
+ {
+ dp->protocol_id = d[0].protocol_id;
+ dp->spi_size = 4;
+ dp->num_of_spi = clib_host_to_net_u16(num_of_spi);
+ vec_foreach(d2, d)
+ {
+ u8 * data = vec_new(u8, 4);
+ u32 spi = clib_host_to_net_u32(d2->spi);
+ memcpy(data, &spi, 4);
+ ikev2_payload_add_data(c, data);
+ vec_free(data);
+ }
+ }
+}
+
+void
+ikev2_payload_add_auth(ikev2_payload_chain_t *c, ikev2_auth_t * auth)
+{
+ ike_auth_payload_header_t * ap;
+ ap = (ike_auth_payload_header_t *) ikev2_payload_add_hdr(c, IKEV2_PAYLOAD_AUTH,
+ sizeof (*ap));
+
+ ap->auth_method = auth->method;
+ ikev2_payload_add_data(c, auth->data);
+}
+
+void
+ikev2_payload_add_ts(ikev2_payload_chain_t * c, ikev2_ts_t * ts, u8 type)
+{
+ ike_ts_payload_header_t * tsh;
+ ikev2_ts_t *ts2;
+ u8 * data = 0, * tmp;
+
+ tsh = (ike_ts_payload_header_t *) ikev2_payload_add_hdr(c, type, sizeof (*tsh));
+ tsh->num_ts = vec_len(ts);
+
+ vec_foreach(ts2, ts)
+ {
+    ASSERT(ts2->ts_type == 7); /* TS_IPV4_ADDR_RANGE */
+ ikev2_ts_payload_entry_t * entry;
+ vec_add2(data, tmp, sizeof(*entry));
+ entry = (ikev2_ts_payload_entry_t *) tmp;
+ entry->ts_type = ts2->ts_type;
+ entry->protocol_id = ts2->protocol_id;
+ entry->selector_len = clib_host_to_net_u16(16);
+ entry->start_port = clib_host_to_net_u16(ts2->start_port);
+ entry->end_port = clib_host_to_net_u16(ts2->end_port);
+ entry->start_addr.as_u32 = ts2->start_addr.as_u32;
+ entry->end_addr.as_u32 = ts2->end_addr.as_u32;
+ }
+
+ ikev2_payload_add_data(c, data);
+ vec_free(data);
+}
+
+void
+ikev2_payload_chain_add_padding(ikev2_payload_chain_t * c, int bs)
+{
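+  /* RFC 7296, section 3.14: pad to the cipher block size; the final
+     octet carries the Pad Length (number of padding octets before it). */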
+ u8 * tmp __attribute__((unused));
+ u8 pad_len = (vec_len(c->data) / bs + 1) * bs - vec_len(c->data);
+ vec_add2(c->data, tmp, pad_len);
+ c->data[vec_len(c->data)-1] = pad_len - 1;
+}
+
+ikev2_sa_proposal_t *
+ikev2_parse_sa_payload(ike_payload_header_t * ikep)
+{
+ ikev2_sa_proposal_t * v = 0;
+ ikev2_sa_proposal_t * proposal;
+ ikev2_sa_transform_t * transform;
+
+ u32 plen = clib_net_to_host_u16(ikep->length);
+
+ ike_sa_proposal_data_t * sap;
+ int proposal_ptr = 0;
+
+ do
+ {
+ sap = (ike_sa_proposal_data_t *) &ikep->payload[proposal_ptr];
+ int i;
+ int transform_ptr;
+
+ DBG_PLD("proposal num %u len %u last_or_more %u id %u "
+ "spi_size %u num_transforms %u",
+ sap->proposal_num, clib_net_to_host_u16(sap->proposal_len),
+ sap->last_or_more, sap->protocol_id, sap->spi_size,
+ sap->num_transforms);
+
+ /* IKE proposal should not have SPI */
+ if (sap->protocol_id == IKEV2_PROTOCOL_IKE && sap->spi_size != 0)
+ goto data_corrupted;
+
+      /* ESP proposal must carry a 4-octet SPI */
+ if (sap->protocol_id == IKEV2_PROTOCOL_ESP && sap->spi_size != 4)
+ goto data_corrupted;
+
+ transform_ptr = proposal_ptr + sizeof(*sap) + sap->spi_size;
+
+ vec_add2(v, proposal, 1);
+ proposal->proposal_num = sap->proposal_num;
+ proposal->protocol_id = sap->protocol_id;
+
+ if (sap->spi_size == 4) {
+ proposal->spi = clib_net_to_host_u32(sap->spi[0]);
+ }
+
+ for(i=0; i< sap->num_transforms; i++)
+ {
+ ike_sa_transform_data_t * tr = (ike_sa_transform_data_t *) &ikep->payload[transform_ptr];
+ u16 tlen = clib_net_to_host_u16(tr->transform_len);
+
+ if (tlen < sizeof(*tr))
+ goto data_corrupted;
+
+ vec_add2(proposal->transforms, transform, 1);
+
+ transform->type = tr->transform_type;
+ transform->transform_id = clib_net_to_host_u16(tr->transform_id);
+ if (tlen > sizeof(*tr))
+ vec_add(transform->attrs, tr->attributes, tlen - sizeof(*tr));
+
+ DBG_PLD("transform num %u len %u last_or_more %u type %U id %u%s%U",
+ i, tlen, tr->last_or_more,
+ format_ikev2_sa_transform, transform,
+ clib_net_to_host_u16(tr->transform_id),
+ tlen > sizeof(*tr) ? " attrs " : "",
+ format_hex_bytes, tr->attributes, tlen - sizeof (*tr));
+
+ transform_ptr += tlen;
+ }
+
+ proposal_ptr += clib_net_to_host_u16(sap->proposal_len);
+ }
+ while (proposal_ptr < (plen - sizeof(*ikep)) && sap->last_or_more == 2);
+
+ /* data validation */
+ if (proposal_ptr != (plen - sizeof(*ikep)) || sap->last_or_more)
+ goto data_corrupted;
+
+ return v;
+
+data_corrupted:
+ DBG_PLD("SA payload data corrupted");
+ ikev2_sa_free_proposal_vector(&v);
+ return 0;
+}
+
+ikev2_ts_t *
+ikev2_parse_ts_payload(ike_payload_header_t * ikep)
+{
+ ike_ts_payload_header_t * tsp = (ike_ts_payload_header_t *) ikep;
+ ikev2_ts_t * r = 0, *ts;
+ u8 i;
+
+ for (i = 0; i < tsp->num_ts; i++)
+ {
+ if (tsp->ts[i].ts_type != 7) /* TS_IPV4_ADDR_RANGE */
+ {
+ DBG_PLD("unsupported TS type received (%u)", tsp->ts[i].ts_type);
+ continue;
+ }
+
+ vec_add2(r, ts, 1);
+ ts->ts_type = tsp->ts[i].ts_type;
+ ts->protocol_id = tsp->ts[i].protocol_id;
+ ts->start_port = tsp->ts[i].start_port;
+ ts->end_port = tsp->ts[i].end_port;
+ ts->start_addr.as_u32 = tsp->ts[i].start_addr.as_u32;
+ ts->end_addr.as_u32 = tsp->ts[i].end_addr.as_u32;
+ }
+ return r;
+}
+
+ikev2_notify_t *
+ikev2_parse_notify_payload(ike_payload_header_t * ikep)
+{
+ ike_notify_payload_header_t * n = (ike_notify_payload_header_t *) ikep;
+ u32 plen = clib_net_to_host_u16(ikep->length);
+ ikev2_notify_t * r = 0;
+ u32 spi;
+
+ DBG_PLD("msg_type %U len %u%s%U",
+ format_ikev2_notify_msg_type, clib_net_to_host_u16(n->msg_type),
+ plen, plen > sizeof(*n) ? " data ":"",
+ format_hex_bytes, n->payload, plen - sizeof(*n));
+
+ r = vec_new(ikev2_notify_t, 1);
+ r->msg_type = clib_net_to_host_u16(n->msg_type);
+ r->protocol_id = n->protocol_id;
+
+ if (n->spi_size == 4)
+ {
+ memcpy(&spi, n->payload, n->spi_size);
+ r->spi = clib_net_to_host_u32(spi);
+ DBG_PLD("spi %lx", r->spi);
+ }
+ else if (n->spi_size == 0)
+ {
+ r->spi = 0;
+ }
+ else
+ {
+ clib_warning("invalid SPI Size %d", n->spi_size);
+ }
+
+ if (plen > (sizeof(*n) + n->spi_size))
+ {
+ vec_add(r->data, n->payload + n->spi_size, plen - sizeof(*n) - n->spi_size);
+ }
+
+ return r;
+}
+
+void
+ikev2_parse_vendor_payload(ike_payload_header_t * ikep)
+{
+ u32 plen = clib_net_to_host_u16(ikep->length);
+ int i;
+ int is_string = 1;
+
+ for(i=0; i < plen - 4; i++)
+ if (!isprint(ikep->payload[i]))
+ is_string = 0;
+
+ DBG_PLD("len %u data %s:%U",
+ plen,
+ is_string ? "string":"hex",
+ is_string ? format_ascii_bytes : format_hex_bytes,
+ ikep->payload, plen - sizeof(*ikep));
+}
+
+ikev2_delete_t *
+ikev2_parse_delete_payload(ike_payload_header_t * ikep)
+{
+ ike_delete_payload_header_t * d = (ike_delete_payload_header_t *) ikep;
+ u32 plen = clib_net_to_host_u16(ikep->length);
+ ikev2_delete_t * r = 0, * del;
+ u16 num_of_spi = clib_net_to_host_u16(d->num_of_spi);
+ u16 i = 0;
+
+ DBG_PLD("protocol_id %u spi_size %u num_of_spi %u len %u%s%U",
+ d->protocol_id, d->spi_size, num_of_spi,
+ plen, plen > sizeof(d) ? " data ":"",
+ format_hex_bytes, d->spi, plen - sizeof(*d));
+
+ if (d->protocol_id == IKEV2_PROTOCOL_IKE)
+ {
+ r = vec_new(ikev2_delete_t, 1);
+ r->protocol_id = 1;
+ }
+ else
+ {
+ r = vec_new(ikev2_delete_t, num_of_spi);
+ vec_foreach(del, r)
+ {
+ del->protocol_id = d->protocol_id;
+ del->spi = clib_net_to_host_u32(d->spi[i++]);
+ }
+ }
+
+ return r;
+}
diff --git a/vnet/vnet/ipsec/ikev2_priv.h b/vnet/vnet/ipsec/ikev2_priv.h
new file mode 100644
index 00000000000..4f05a60a621
--- /dev/null
+++ b/vnet/vnet/ipsec/ikev2_priv.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_ikev2_priv_h__
+#define __included_ikev2_priv_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vnet/ipsec/ikev2.h>
+
+#include <vppinfra/hash.h>
+#include <vppinfra/elog.h>
+#include <vppinfra/error.h>
+
+#include <openssl/rand.h>
+#include <openssl/dh.h>
+#include <openssl/hmac.h>
+#include <openssl/evp.h>
+
+#define IKEV2_DEBUG_PAYLOAD 1
+
+#if IKEV2_DEBUG_PAYLOAD == 1
+#define DBG_PLD(my_args...) clib_warning(my_args)
+#else
+#define DBG_PLD(my_args...)
+#endif
+
+typedef enum {
+ IKEV2_STATE_UNKNOWN,
+ IKEV2_STATE_SA_INIT,
+ IKEV2_STATE_DELETED,
+ IKEV2_STATE_AUTH_FAILED,
+ IKEV2_STATE_AUTHENTICATED,
+ IKEV2_STATE_NOTIFY_AND_DELETE,
+ IKEV2_STATE_TS_UNACCEPTABLE,
+ IKEV2_STATE_NO_PROPOSAL_CHOSEN,
+} ikev2_state_t;
+
+typedef struct {
+ ikev2_auth_method_t method:8;
+ u8 * data;
+ u8 hex; /* hex encoding of the shared secret */
+ EVP_PKEY * key;
+} ikev2_auth_t;
+
+typedef enum {
+ IKEV2_DH_GROUP_MODP = 0,
+ IKEV2_DH_GROUP_ECP = 1,
+} ikev2_dh_group_t;
+
+typedef struct {
+ ikev2_transform_type_t type;
+ union {
+ u16 transform_id;
+ ikev2_transform_encr_type_t encr_type:16;
+ ikev2_transform_prf_type_t prf_type:16;
+ ikev2_transform_integ_type_t integ_type:16;
+ ikev2_transform_dh_type_t dh_type:16;
+ ikev2_transform_esn_type_t esn_type:16;
+ };
+ u8 * attrs;
+ u16 key_len;
+ u16 key_trunc;
+ u16 block_size;
+ u8 dh_group;
+ int nid;
+ const char * dh_p;
+ const char * dh_g;
+ const void * md;
+ const void * cipher;
+} ikev2_sa_transform_t;
+
+typedef struct {
+ u8 proposal_num;
+ ikev2_protocol_id_t protocol_id:8;
+ u32 spi;
+ ikev2_sa_transform_t * transforms;
+} ikev2_sa_proposal_t;
+
+typedef struct {
+ u8 ts_type;
+ u8 protocol_id;
+ u16 selector_len;
+ u16 start_port;
+ u16 end_port;
+ ip4_address_t start_addr;
+ ip4_address_t end_addr;
+} ikev2_ts_t;
+
+typedef struct {
+ ikev2_id_type_t type:8;
+ u8 * data;
+} ikev2_id_t;
+
+typedef struct {
+ /* sa proposals vectors */
+ ikev2_sa_proposal_t * i_proposals;
+ ikev2_sa_proposal_t * r_proposals;
+
+ /* Traffic Selectors */
+ ikev2_ts_t * tsi;
+ ikev2_ts_t * tsr;
+
+ /* keys */
+ u8 * sk_ai;
+ u8 * sk_ar;
+ u8 * sk_ei;
+ u8 * sk_er;
+} ikev2_child_sa_t;
+
+typedef struct {
+ u8 protocol_id;
+  u32 spi; /* for ESP and AH the SPI size is 4, for IKE it is 0 */
+} ikev2_delete_t;
+
+typedef struct {
+ u8 protocol_id;
+ u32 spi;
+ ikev2_sa_proposal_t * i_proposal;
+ ikev2_sa_proposal_t * r_proposal;
+ ikev2_ts_t * tsi;
+ ikev2_ts_t * tsr;
+} ikev2_rekey_t;
+
+typedef struct {
+ u16 msg_type;
+ u8 protocol_id;
+ u32 spi;
+ u8 * data;
+} ikev2_notify_t;
+
+
+typedef struct {
+ ikev2_state_t state;
+ u8 unsupported_cp;
+ u8 initial_contact;
+ ip4_address_t iaddr;
+ ip4_address_t raddr;
+ u64 ispi;
+ u64 rspi;
+ u8 * i_nonce;
+ u8 * r_nonce;
+
+ /* DH data */
+ u16 dh_group;
+ u8 * dh_shared_key;
+ u8 * i_dh_data;
+ u8 * r_dh_data;
+
+ /* sa proposals vectors */
+ ikev2_sa_proposal_t * i_proposals;
+ ikev2_sa_proposal_t * r_proposals;
+
+ /* keys */
+ u8 * sk_d;
+ u8 * sk_ai;
+ u8 * sk_ar;
+ u8 * sk_ei;
+ u8 * sk_er;
+ u8 * sk_pi;
+ u8 * sk_pr;
+
+ /* auth */
+ ikev2_auth_t i_auth;
+ ikev2_auth_t r_auth;
+
+ /* ID */
+ ikev2_id_t i_id;
+ ikev2_id_t r_id;
+
+ /* pending deletes */
+ ikev2_delete_t * del;
+
+ /* pending rekeyings */
+ ikev2_rekey_t * rekey;
+
+ /* packet data */
+ u8 * last_sa_init_req_packet_data;
+ u8 * last_sa_init_res_packet_data;
+
+ /* retransmit */
+ u32 last_msg_id;
+ u8 * last_res_packet_data;
+
+ ikev2_child_sa_t * childs;
+} ikev2_sa_t;
+
+typedef struct {
+ u8 * name;
+ u8 is_enabled;
+
+ ikev2_auth_t auth;
+ ikev2_id_t loc_id;
+ ikev2_id_t rem_id;
+ ikev2_ts_t loc_ts;
+ ikev2_ts_t rem_ts;
+} ikev2_profile_t;
+
+typedef struct {
+ /* pool of IKEv2 Security Associations */
+ ikev2_sa_t * sas;
+
+ /* pool of IKEv2 profiles */
+ ikev2_profile_t * profiles;
+
+ /* vector of supported transform types */
+ ikev2_sa_transform_t * supported_transforms;
+
+ /* hashes */
+ uword * sa_by_rspi;
+ mhash_t profile_index_by_name;
+
+ /* local private key */
+ EVP_PKEY * pkey;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} ikev2_main_t;
+
+ikev2_main_t ikev2_main;
+
+void ikev2_sa_free_proposal_vector(ikev2_sa_proposal_t ** v);
+ikev2_sa_transform_t * ikev2_sa_get_td_for_type(ikev2_sa_proposal_t * p,
+ ikev2_transform_type_t type);
+
+/* ikev2_crypto.c */
+v8 * ikev2_calc_prf(ikev2_sa_transform_t * tr, v8 * key, v8 * data);
+u8 * ikev2_calc_prfplus(ikev2_sa_transform_t * tr, u8 * key, u8 * seed, int len);
+v8 * ikev2_calc_integr(ikev2_sa_transform_t * tr, v8 * key, u8 * data, int len);
+v8 * ikev2_decrypt_data(ikev2_sa_t * sa, u8 * data, int len);
+int ikev2_encrypt_data(ikev2_sa_t * sa, v8 * src, u8 * dst);
+void ikev2_generate_dh(ikev2_sa_t * sa, ikev2_sa_transform_t * t);
+int ikev2_verify_sign(EVP_PKEY *pkey, u8 * sigbuf, u8 * data);
+u8 * ikev2_calc_sign(EVP_PKEY *pkey, u8 * data);
+EVP_PKEY * ikev2_load_cert_file(u8 * file);
+EVP_PKEY * ikev2_load_key_file(u8 * file);
+void ikev2_crypto_init (ikev2_main_t * km);
+
+/* ikev2_payload.c */
+typedef struct {
+ u8 first_payload_type;
+ u16 last_hdr_off;
+ u8 * data;
+} ikev2_payload_chain_t;
+
+#define ikev2_payload_new_chain(V) vec_validate (V, 0)
+#define ikev2_payload_destroy_chain(V) do { \
+ vec_free((V)->data); \
+ vec_free(V); \
+} while (0)
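+
+/* Typical chain lifecycle (a sketch; the nonce and transform locals are
+ * illustrative, not part of this header):
+ *
+ *   ikev2_payload_chain_t * c = 0;
+ *   ikev2_payload_new_chain (c);
+ *   ikev2_payload_add_nonce (c, nonce);
+ *   ikev2_payload_chain_add_padding (c, tr_encr->block_size);
+ *   ... encrypt c->data into the outgoing SK payload ...
+ *   ikev2_payload_destroy_chain (c);
+ */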
+
+void ikev2_payload_add_notify(ikev2_payload_chain_t * c, u16 msg_type, u8 * data);
+void ikev2_payload_add_sa(ikev2_payload_chain_t * c, ikev2_sa_proposal_t * proposals);
+void ikev2_payload_add_ke(ikev2_payload_chain_t * c, u16 dh_group, u8 * dh_data);
+void ikev2_payload_add_nonce(ikev2_payload_chain_t * c, u8 * nonce);
+void ikev2_payload_add_id(ikev2_payload_chain_t *c, ikev2_id_t * id, u8 type);
+void ikev2_payload_add_auth(ikev2_payload_chain_t *c, ikev2_auth_t * auth);
+void ikev2_payload_add_ts(ikev2_payload_chain_t * c, ikev2_ts_t * ts, u8 type);
+void ikev2_payload_add_delete(ikev2_payload_chain_t *c, ikev2_delete_t * d);
+void ikev2_payload_chain_add_padding(ikev2_payload_chain_t * c, int bs);
+void ikev2_parse_vendor_payload(ike_payload_header_t * ikep);
+ikev2_sa_proposal_t * ikev2_parse_sa_payload(ike_payload_header_t * ikep);
+ikev2_ts_t * ikev2_parse_ts_payload(ike_payload_header_t * ikep);
+ikev2_delete_t * ikev2_parse_delete_payload(ike_payload_header_t * ikep);
+ikev2_notify_t * ikev2_parse_notify_payload(ike_payload_header_t * ikep);
+
+#endif /* __included_ikev2_priv_h__ */
+
diff --git a/vnet/vnet/ipsec/ipsec.c b/vnet/vnet/ipsec/ipsec.c
new file mode 100644
index 00000000000..c6a83557ce1
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec.c
@@ -0,0 +1,535 @@
+/*
+ * ipsec.c : IPSec SPD, policy and SA management
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+#include <vnet/interface.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+#include <vnet/ipsec/ikev2.h>
+
+int
+ipsec_set_interface_spd(vlib_main_t * vm, u32 sw_if_index, u32 spd_id, int is_add)
+{
+ ipsec_main_t *im = &ipsec_main;
+ ip_lookup_main_t * lm;
+ ip_config_main_t * rx_cm;
+ ip4_ipsec_config_t config;
+
+ u32 spd_index, ci;
+ uword *p;
+
+ p = hash_get (im->spd_index_by_spd_id, spd_id);
+ if (!p)
+ return VNET_API_ERROR_SYSCALL_ERROR_1; /* no such spd-id */
+
+ spd_index = p[0];
+
+ p = hash_get (im->spd_index_by_sw_if_index, sw_if_index);
+ if (p && is_add)
+ return VNET_API_ERROR_SYSCALL_ERROR_1; /* spd already assigned */
+
+ if (is_add)
+ {
+ hash_set (im->spd_index_by_sw_if_index, sw_if_index, spd_index);
+ }
+ else
+ {
+ hash_unset (im->spd_index_by_sw_if_index, sw_if_index);
+ }
+
+ clib_warning("sw_if_index %u spd_id %u spd_index %u",
+ sw_if_index, spd_id, spd_index);
+
+ /* enable IPsec on TX */
+ vnet_interface_add_del_feature(im->vnet_main, vm, sw_if_index,
+ INTF_OUTPUT_FEAT_IPSEC, is_add);
+
+ /* enable IPsec on RX */
+ config.spd_index = spd_index;
+
+ /* IPv4 */
+ lm = &ip4_main.lookup_main;
+ rx_cm = &lm->rx_config_mains[VNET_UNICAST];
+
+ ci = rx_cm->config_index_by_sw_if_index[sw_if_index];
+
+ ci = (is_add ? vnet_config_add_feature : vnet_config_del_feature)
+ (vm, &rx_cm->config_main,
+ ci,
+ IP4_RX_FEATURE_IPSEC,
+ &config,
+ sizeof (config));
+ rx_cm->config_index_by_sw_if_index[sw_if_index] = ci;
+
+ /* IPv6 */
+ lm = &ip6_main.lookup_main;
+ rx_cm = &lm->rx_config_mains[VNET_UNICAST];
+
+ ci = rx_cm->config_index_by_sw_if_index[sw_if_index];
+
+ ci = (is_add ? vnet_config_add_feature : vnet_config_del_feature)
+ (vm, &rx_cm->config_main,
+ ci,
+ IP6_RX_FEATURE_IPSEC,
+ &config,
+ sizeof (config));
+ rx_cm->config_index_by_sw_if_index[sw_if_index] = ci;
+
+ return 0;
+}
+
+int
+ipsec_add_del_spd(vlib_main_t * vm, u32 spd_id, int is_add)
+{
+ ipsec_main_t *im = &ipsec_main;
+ ipsec_spd_t * spd = 0;
+ uword *p;
+ u32 spd_index, k, v;
+
+ p = hash_get (im->spd_index_by_spd_id, spd_id);
+ if (p && is_add)
+ return VNET_API_ERROR_INVALID_VALUE;
+ if (!p && !is_add)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ if (!is_add) /* delete */
+ {
+ spd_index = p[0];
+ spd = pool_elt_at_index(im->spds, spd_index);
+ if (!spd)
+ return VNET_API_ERROR_INVALID_VALUE;
+ hash_foreach (k, v, im->spd_index_by_sw_if_index, ({
+ if (v == spd_index)
+ ipsec_set_interface_spd(vm, k, spd_id, 0);
+ }));
+ hash_unset (im->spd_index_by_spd_id, spd_id);
+ pool_free (spd->policies);
+ vec_free (spd->ipv4_outbound_policies);
+ vec_free (spd->ipv6_outbound_policies);
+ vec_free (spd->ipv4_inbound_protect_policy_indices);
+ vec_free (spd->ipv4_inbound_policy_discard_and_bypass_indices);
+ pool_put (im->spds, spd);
+ }
+ else /* create new SPD */
+ {
+ pool_get (im->spds, spd);
+ memset (spd, 0, sizeof (*spd));
+ spd_index = spd - im->spds;
+ spd->id = spd_id;
+ hash_set (im->spd_index_by_spd_id, spd_id, spd_index);
+ }
+ return 0;
+}
+
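+/* Comparison callback for the per-SPD policy index vectors: it looks up
+ * the two policies the indices refer to and orders higher priority first
+ * (p2->priority - p1->priority). */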
+static int
+ipsec_spd_entry_sort(void * a1, void * a2)
+{
+ ipsec_main_t *im = &ipsec_main;
+ u32 * id1 = a1;
+ u32 * id2 = a2;
+ ipsec_spd_t * spd;
+ ipsec_policy_t * p1, * p2;
+
+ pool_foreach (spd, im->spds, ({
+ p1 = pool_elt_at_index(spd->policies, *id1);
+ p2 = pool_elt_at_index(spd->policies, *id2);
+ if (p1 && p2)
+ return p2->priority - p1->priority;
+ }));
+
+ return 0;
+}
+
+int
+ipsec_add_del_policy(vlib_main_t * vm, ipsec_policy_t * policy, int is_add)
+{
+ ipsec_main_t *im = &ipsec_main;
+ ipsec_spd_t * spd = 0;
+ ipsec_policy_t * vp;
+ uword *p;
+ u32 spd_index;
+
+ clib_warning("policy-id %u priority %d is_outbound %u",policy->id, policy->priority, policy->is_outbound);
+
+ if (policy->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ p = hash_get(im->sa_index_by_sa_id, policy->sa_id);
+ if (!p)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+ policy->sa_index = p[0];
+ }
+
+ p = hash_get (im->spd_index_by_spd_id, policy->id);
+
+ if (!p)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+
+ spd_index = p[0];
+ spd = pool_elt_at_index(im->spds, spd_index);
+ if (!spd)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+
+ if (is_add)
+ {
+ u32 policy_index;
+
+ pool_get (spd->policies, vp);
+ memcpy (vp, policy, sizeof (*vp));
+ policy_index = vp - spd->policies;
+
+ if (policy->is_outbound)
+ {
+ if (policy->is_ipv6)
+ {
+ vec_add1 (spd->ipv6_outbound_policies, policy_index);
+ memcpy(vp, policy, sizeof(ipsec_policy_t));
+ vec_sort_with_function (spd->ipv6_outbound_policies,
+ ipsec_spd_entry_sort);
+ }
+ else
+ {
+ vec_add1 (spd->ipv4_outbound_policies, policy_index);
+ memcpy(vp, policy, sizeof(ipsec_policy_t));
+ vec_sort_with_function (spd->ipv4_outbound_policies,
+ ipsec_spd_entry_sort);
+ }
+ }
+ else
+ {
+ if (policy->is_ipv6)
+ {
+ if (policy->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ vec_add1 (spd->ipv6_inbound_protect_policy_indices,
+ policy_index);
+ memcpy(vp, policy, sizeof(ipsec_policy_t));
+ vec_sort_with_function (
+ spd->ipv6_inbound_protect_policy_indices,
+ ipsec_spd_entry_sort);
+ }
+ else
+ {
+ vec_add1 (spd->ipv6_inbound_policy_discard_and_bypass_indices,
+ policy_index);
+ memcpy(vp, policy, sizeof(ipsec_policy_t));
+ vec_sort_with_function (
+ spd->ipv6_inbound_policy_discard_and_bypass_indices,
+ ipsec_spd_entry_sort);
+ }
+ }
+ else
+ {
+ if (policy->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ vec_add1 (spd->ipv4_inbound_protect_policy_indices,
+ policy_index);
+ memcpy(vp, policy, sizeof(ipsec_policy_t));
+ vec_sort_with_function (
+ spd->ipv4_inbound_protect_policy_indices,
+ ipsec_spd_entry_sort);
+ }
+ else
+ {
+ vec_add1 (spd->ipv4_inbound_policy_discard_and_bypass_indices,
+ policy_index);
+ memcpy(vp, policy, sizeof(ipsec_policy_t));
+ vec_sort_with_function (
+ spd->ipv4_inbound_policy_discard_and_bypass_indices,
+ ipsec_spd_entry_sort);
+ }
+ }
+ }
+
+ }
+ else
+ {
+ u32 i, j;
+ pool_foreach_index(i, spd->policies, ({
+ vp = pool_elt_at_index(spd->policies, i);
+ if (vp->priority != policy->priority)
+ continue;
+ if (vp->is_outbound != policy->is_outbound)
+ continue;
+ if (vp->policy != policy->policy)
+ continue;
+ if (vp->sa_id != policy->sa_id)
+ continue;
+ if (vp->protocol != policy->protocol)
+ continue;
+ if (vp->lport.start != policy->lport.start)
+ continue;
+ if (vp->lport.stop != policy->lport.stop)
+ continue;
+ if (vp->rport.start != policy->rport.start)
+ continue;
+ if (vp->rport.stop != policy->rport.stop)
+ continue;
+ if (vp->is_ipv6 != policy->is_ipv6)
+ continue;
+ if (policy->is_ipv6)
+ {
+ if (vp->laddr.start.ip6.as_u64[0] != policy->laddr.start.ip6.as_u64[0])
+ continue;
+ if (vp->laddr.start.ip6.as_u64[1] != policy->laddr.start.ip6.as_u64[1])
+ continue;
+ if (vp->laddr.stop.ip6.as_u64[0] != policy->laddr.stop.ip6.as_u64[0])
+ continue;
+ if (vp->laddr.stop.ip6.as_u64[1] != policy->laddr.stop.ip6.as_u64[1])
+ continue;
+ if (vp->raddr.start.ip6.as_u64[0] != policy->raddr.start.ip6.as_u64[0])
+ continue;
+ if (vp->raddr.start.ip6.as_u64[1] != policy->raddr.start.ip6.as_u64[1])
+ continue;
+ if (vp->raddr.stop.ip6.as_u64[0] != policy->raddr.stop.ip6.as_u64[0])
+ continue;
+ if (vp->laddr.stop.ip6.as_u64[1] != policy->laddr.stop.ip6.as_u64[1])
+ continue;
+ if (policy->is_outbound)
+ {
+ vec_foreach_index(j, spd->ipv6_outbound_policies) {
+ if (vec_elt(spd->ipv6_outbound_policies, j) == i) {
+ vec_del1 (spd->ipv6_outbound_policies, j);
+ break;
+ }
+ }
+ }
+ else
+ {
+ if (policy->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ vec_foreach_index(j, spd->ipv6_inbound_protect_policy_indices) {
+ if (vec_elt(spd->ipv6_inbound_protect_policy_indices, j) == i) {
+ vec_del1 (spd->ipv6_inbound_protect_policy_indices, j);
+ break;
+ }
+ }
+ }
+ else
+ {
+ vec_foreach_index(j, spd->ipv6_inbound_policy_discard_and_bypass_indices) {
+ if (vec_elt(spd->ipv6_inbound_policy_discard_and_bypass_indices, j) == i) {
+ vec_del1 (spd->ipv6_inbound_policy_discard_and_bypass_indices, j);
+ break;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (vp->laddr.start.ip4.as_u32 != policy->laddr.start.ip4.as_u32)
+ continue;
+ if (vp->laddr.stop.ip4.as_u32 != policy->laddr.stop.ip4.as_u32)
+ continue;
+ if (vp->raddr.start.ip4.as_u32 != policy->raddr.start.ip4.as_u32)
+ continue;
+ if (vp->raddr.stop.ip4.as_u32 != policy->raddr.stop.ip4.as_u32)
+ continue;
+ if (policy->is_outbound)
+ {
+ vec_foreach_index(j, spd->ipv4_outbound_policies) {
+ if (vec_elt(spd->ipv4_outbound_policies, j) == i) {
+ vec_del1 (spd->ipv4_outbound_policies, j);
+ break;
+ }
+ }
+ }
+ else
+ {
+ if (policy->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ vec_foreach_index(j, spd->ipv4_inbound_protect_policy_indices) {
+ if (vec_elt(spd->ipv4_inbound_protect_policy_indices, j) == i) {
+ vec_del1 (spd->ipv4_inbound_protect_policy_indices, j);
+ break;
+ }
+ }
+ }
+ else
+ {
+ vec_foreach_index(j, spd->ipv4_inbound_policy_discard_and_bypass_indices) {
+ if (vec_elt(spd->ipv4_inbound_policy_discard_and_bypass_indices, j) == i) {
+ vec_del1 (spd->ipv4_inbound_policy_discard_and_bypass_indices, j);
+ break;
+ }
+ }
+ }
+ }
+ pool_put (spd->policies, vp);
+ break;
+ }
+ }));
+ }
+
+ return 0;
+}
+
+static u8
+ipsec_is_sa_used(u32 sa_index)
+{
+ ipsec_main_t * im = &ipsec_main;
+ ipsec_spd_t * spd;
+ ipsec_policy_t * p;
+
+ pool_foreach(spd, im->spds, ({
+ pool_foreach(p, spd->policies, ({
+ if (p->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ if (p->sa_index == sa_index)
+ return 1;
+ }
+ }));
+ }));
+
+ return 0;
+}
+
+int
+ipsec_add_del_sa(vlib_main_t * vm, ipsec_sa_t * new_sa, int is_add)
+{
+ ipsec_main_t *im = &ipsec_main;
+ ipsec_sa_t * sa = 0;
+ uword *p;
+ u32 sa_index;
+
+ clib_warning("id %u spi %u", new_sa->id, new_sa->spi);
+
+ p = hash_get (im->sa_index_by_sa_id, new_sa->id);
+ if (p && is_add)
+ return VNET_API_ERROR_SYSCALL_ERROR_1; /* already exists */
+ if (!p && !is_add)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+
+ if (!is_add) /* delete */
+ {
+ sa_index = p[0];
+ sa = pool_elt_at_index(im->sad, sa_index);
+ if (ipsec_is_sa_used(sa_index))
+ {
+ clib_warning("sa_id %u used in policy", sa->id);
+ return VNET_API_ERROR_SYSCALL_ERROR_1; /* sa used in policy */
+ }
+ hash_unset (im->sa_index_by_sa_id, sa->id);
+ pool_put (im->sad, sa);
+ }
+ else /* create new SA */
+ {
+ pool_get (im->sad, sa);
+ memcpy (sa, new_sa, sizeof (*sa));
+ sa_index = sa - im->sad;
+ hash_set (im->sa_index_by_sa_id, sa->id, sa_index);
+ }
+ return 0;
+}
+
+int
+ipsec_set_sa_key(vlib_main_t * vm, ipsec_sa_t * sa_update)
+{
+ ipsec_main_t *im = &ipsec_main;
+ uword *p;
+ u32 sa_index;
+ ipsec_sa_t * sa = 0;
+
+ p = hash_get (im->sa_index_by_sa_id, sa_update->id);
+ if (!p)
+ return VNET_API_ERROR_SYSCALL_ERROR_1; /* no such sa-id */
+
+ sa_index = p[0];
+ sa = pool_elt_at_index(im->sad, sa_index);
+
+ /* new crypto key */
+ if (0 < sa_update->crypto_key_len)
+ {
+ memcpy(sa->crypto_key, sa_update->crypto_key, sa_update->crypto_key_len);
+ sa->crypto_key_len = sa_update->crypto_key_len;
+ }
+
+ /* new integ key */
+ if (0 < sa_update->integ_key_len)
+ {
+ memcpy(sa->integ_key, sa_update->integ_key, sa_update->integ_key_len);
+ sa->integ_key_len = sa_update->integ_key_len;
+ }
+
+ return 0;
+}
+
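+/* Mix a little runtime entropy (wall clock, pid, a stack address) into
+ * OpenSSL's PRNG pool; RAND_seed() adds this to whatever entropy the
+ * library has already gathered on its own. */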
+static void
+ipsec_rand_seed(void)
+{
+ struct {
+ time_t time;
+ pid_t pid;
+ void * p;
+ } seed_data;
+
+ seed_data.time = time(NULL);
+ seed_data.pid = getpid();
+ seed_data.p = (void *)&seed_data;
+
+ RAND_seed((const void *)&seed_data, sizeof(seed_data));
+}
+
+static clib_error_t *
+ipsec_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+ ipsec_main_t * im = &ipsec_main;
+ vlib_node_t * node;
+
+ ipsec_rand_seed();
+
+ memset (im, 0, sizeof (im[0]));
+
+ im->vnet_main = vnet_get_main();
+ im->vlib_main = vm;
+
+ im->spd_index_by_spd_id = hash_create (0, sizeof (uword));
+ im->sa_index_by_sa_id = hash_create (0, sizeof (uword));
+ im->spd_index_by_sw_if_index = hash_create (0, sizeof (uword));
+
+ node = vlib_get_node_by_name (vm, (u8 *) "error-drop");
+ ASSERT(node);
+ im->error_drop_node_index = node->index;
+
+ node = vlib_get_node_by_name (vm, (u8 *) "esp-encrypt");
+ ASSERT(node);
+ im->esp_encrypt_node_index = node->index;
+
+ node = vlib_get_node_by_name (vm, (u8 *) "ip4-lookup");
+ ASSERT(node);
+ im->ip4_lookup_node_index = node->index;
+
+ if ((error = vlib_call_init_function (vm, ipsec_cli_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ipsec_tunnel_if_init)))
+ return error;
+
+ esp_init();
+
+ if ((error = ikev2_init (vm)))
+ return error;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ipsec_init);
diff --git a/vnet/vnet/ipsec/ipsec.h b/vnet/vnet/ipsec/ipsec.h
new file mode 100644
index 00000000000..6ef36d02855
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec.h
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/devices/dpdk/dpdk.h>
+
+#define foreach_ipsec_policy_action \
+ _(0, BYPASS, "bypass") \
+ _(1, DISCARD, "discard") \
+ _(2, RESOLVE, "resolve") \
+ _(3, PROTECT, "protect")
+
+typedef enum {
+#define _(v,f,s) IPSEC_POLICY_ACTION_##f = v,
+ foreach_ipsec_policy_action
+#undef _
+ IPSEC_POLICY_N_ACTION,
+} ipsec_policy_action_t;
+
+#define foreach_ipsec_crypto_alg \
+ _(0, NONE, "none") \
+ _(1, AES_CBC_128, "aes-cbc-128") \
+ _(2, AES_CBC_192, "aes-cbc-192") \
+ _(3, AES_CBC_256, "aes-cbc-256")
+
+typedef enum {
+#define _(v,f,s) IPSEC_CRYPTO_ALG_##f = v,
+ foreach_ipsec_crypto_alg
+#undef _
+ IPSEC_CRYPTO_N_ALG,
+} ipsec_crypto_alg_t;
+
+#define foreach_ipsec_integ_alg \
+ _(0, NONE, "none") \
+ _(1, MD5_96, "md5-96") /* RFC2403 */ \
+ _(2, SHA1_96, "sha1-96") /* RFC2404 */ \
+ _(3, SHA_256_96, "sha-256-96") /* draft-ietf-ipsec-ciph-sha-256-00 */ \
+ _(4, SHA_256_128, "sha-256-128") /* RFC4868 */ \
+ _(5, SHA_384_192, "sha-384-192") /* RFC4868 */ \
+ _(6, SHA_512_256, "sha-512-256") /* RFC4868 */
+
+typedef enum {
+#define _(v,f,s) IPSEC_INTEG_ALG_##f = v,
+ foreach_ipsec_integ_alg
+#undef _
+ IPSEC_INTEG_N_ALG,
+} ipsec_integ_alg_t;
+
+typedef enum {
+ IPSEC_PROTOCOL_AH = 0,
+ IPSEC_PROTOCOL_ESP = 1
+} ipsec_protocol_t;
+
+typedef struct {
+ u32 id;
+ u32 spi;
+ ipsec_protocol_t protocol;
+
+ ipsec_crypto_alg_t crypto_alg;
+ u8 crypto_key_len;
+ u8 crypto_key[128];
+
+ ipsec_integ_alg_t integ_alg;
+ u8 integ_key_len;
+ u8 integ_key[128];
+
+ u8 use_esn;
+ u8 use_anti_replay;
+
+ u8 is_tunnel;
+ u8 is_tunnel_ip6;
+ ip46_address_t tunnel_src_addr;
+ ip46_address_t tunnel_dst_addr;
+
+ /* runtime */
+ u32 seq;
+ u32 seq_hi;
+ u32 last_seq;
+ u32 last_seq_hi;
+ u64 replay_window;
+} ipsec_sa_t;
+
+typedef struct {
+ ip46_address_t start, stop;
+} ip46_address_range_t;
+
+typedef struct {
+ u16 start, stop;
+} port_range_t;
+
+typedef struct {
+ u8 is_add;
+ u8 esn;
+ u8 anti_replay;
+ ip4_address_t local_ip, remote_ip;
+ u32 local_spi;
+ u32 remote_spi;
+} ipsec_add_del_tunnel_args_t;
+
+typedef enum {
+ IPSEC_IF_SET_KEY_TYPE_NONE,
+ IPSEC_IF_SET_KEY_TYPE_LOCAL_CRYPTO,
+ IPSEC_IF_SET_KEY_TYPE_REMOTE_CRYPTO,
+ IPSEC_IF_SET_KEY_TYPE_LOCAL_INTEG,
+ IPSEC_IF_SET_KEY_TYPE_REMOTE_INTEG,
+} ipsec_if_set_key_type_t;
+
+typedef struct {
+ u32 id;
+ i32 priority;
+ u8 is_outbound;
+
+ // Selector
+ u8 is_ipv6;
+ ip46_address_range_t laddr;
+ ip46_address_range_t raddr;
+ u8 protocol;
+ port_range_t lport;
+ port_range_t rport;
+
+ // Policy
+ u8 policy;
+ u32 sa_id;
+ u32 sa_index;
+
+ // Counter
+ vlib_counter_t counter;
+} ipsec_policy_t;
+
+typedef struct {
+ u32 id;
+ /* pool of policies */
+ ipsec_policy_t * policies;
+ /* vectors of policy indices */
+ u32 * ipv4_outbound_policies;
+ u32 * ipv6_outbound_policies;
+ u32 * ipv4_inbound_protect_policy_indices;
+ u32 * ipv4_inbound_policy_discard_and_bypass_indices;
+ u32 * ipv6_inbound_protect_policy_indices;
+ u32 * ipv6_inbound_policy_discard_and_bypass_indices;
+} ipsec_spd_t;
+
+typedef struct {
+ u32 spd_index;
+} ip4_ipsec_config_t;
+
+typedef struct {
+ u32 spd_index;
+} ip6_ipsec_config_t;
+
+typedef struct {
+ u32 input_sa_index;
+ u32 output_sa_index;
+ u32 hw_if_index;
+} ipsec_tunnel_if_t;
+
+typedef struct {
+ /* pool of tunnel instances */
+ ipsec_spd_t * spds;
+ ipsec_sa_t * sad;
+
+ /* pool of tunnel interfaces */
+ ipsec_tunnel_if_t * tunnel_interfaces;
+ u32 * free_tunnel_if_indices;
+
+ u32 * empty_buffers;
+
+ uword * tunnel_index_by_key;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+
+ /* next node indices */
+ u32 feature_next_node_index[32];
+
+ /* hashes */
+ uword * spd_index_by_spd_id;
+ uword * spd_index_by_sw_if_index;
+ uword * sa_index_by_sa_id;
+ uword * ipsec_if_pool_index_by_key;
+
+ /* node indexes */
+ u32 error_drop_node_index;
+ u32 ip4_lookup_node_index;
+ u32 esp_encrypt_node_index;
+
+} ipsec_main_t;
+
+ipsec_main_t ipsec_main;
+
+vlib_node_registration_t ipsec_input_ip4_node;
+vlib_node_registration_t ipsec_input_ip6_node;
+vlib_node_registration_t ipsec_output_node;
+vlib_node_registration_t esp_encrypt_node;
+vlib_node_registration_t esp_decrypt_node;
+vlib_node_registration_t ipsec_if_output_node;
+vlib_node_registration_t ipsec_if_input_node;
+
+
+/*
+ * functions
+ */
+int ipsec_set_interface_spd(vlib_main_t * vm, u32 sw_if_index, u32 spd_id, int is_add);
+int ipsec_add_del_spd(vlib_main_t * vm, u32 spd_id, int is_add);
+int ipsec_add_del_policy(vlib_main_t * vm, ipsec_policy_t * policy, int is_add);
+int ipsec_add_del_sa(vlib_main_t * vm, ipsec_sa_t * new_sa, int is_add);
+int ipsec_set_sa_key(vlib_main_t * vm, ipsec_sa_t * sa_update);
+
+u8 * format_ipsec_if_output_trace (u8 * s, va_list * args);
+u8 * format_ipsec_policy_action (u8 * s, va_list * args);
+u8 * format_ipsec_crypto_alg (u8 * s, va_list * args);
+u8 * format_ipsec_integ_alg (u8 * s, va_list * args);
+u8 * format_ipsec_replay_window(u8 * s, va_list * args);
+uword unformat_ipsec_policy_action (unformat_input_t * input, va_list * args);
+uword unformat_ipsec_crypto_alg (unformat_input_t * input, va_list * args);
+uword unformat_ipsec_integ_alg (unformat_input_t * input, va_list * args);
+
+u32 ipsec_add_del_tunnel_if (vnet_main_t * vnm, ipsec_add_del_tunnel_args_t * args);
+int ipsec_set_interface_key(vnet_main_t * vnm, u32 hw_if_index, ipsec_if_set_key_type_t type, u8 alg, u8 * key);
+
+
+/*
+ * inline functions
+ */
+
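+/* Top im->empty_buffers back up to 2 * VLIB_FRAME_SIZE whenever it drops
+ * below one frame's worth, so the ESP nodes can take pre-allocated buffers
+ * off the back of the vector instead of allocating per packet. */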
+always_inline void
+ipsec_alloc_empty_buffers(vlib_main_t * vm, ipsec_main_t *im)
+{
+ dpdk_main_t * dm = &dpdk_main;
+ u32 free_list_index = dm->vlib_buffer_free_list_index;
+
+ uword l = vec_len (im->empty_buffers);
+ uword n_alloc = 0;
+
+ if (PREDICT_FALSE(l < VLIB_FRAME_SIZE))
+ {
+ if (!im->empty_buffers) {
+ vec_alloc (im->empty_buffers, 2 * VLIB_FRAME_SIZE );
+ }
+
+ n_alloc = vlib_buffer_alloc_from_free_list (vm, im->empty_buffers + l,
+ 2 * VLIB_FRAME_SIZE - l,
+ free_list_index);
+
+ _vec_len (im->empty_buffers) = l + n_alloc;
+ }
+}
+
+static_always_inline u32 /* FIXME move to interface???.h */
+get_next_output_feature_node_index( vnet_main_t * vnm,
+ vlib_buffer_t * b)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_node_t * node;
+ u32 r;
+ intf_output_feat_t next_feature;
+
+ u8 * node_names[] = {
+#define _(sym, str) (u8 *) str,
+ foreach_intf_output_feat
+#undef _
+ };
+
+ count_trailing_zeros(next_feature, vnet_buffer(b)->output_features.bitmap);
+
+ if (next_feature >= INTF_OUTPUT_FEAT_DONE)
+ {
+ u32 sw_if_index = vnet_buffer(b)->sw_if_index[VLIB_TX];
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface(vnm, sw_if_index);
+ r = hw->output_node_index;
+ }
+ else
+ {
+ vnet_buffer(b)->output_features.bitmap &= ~(1 << next_feature);
+ /* FIXME */
+ node = vlib_get_node_by_name(vm, node_names[next_feature]);
+ r = node->index;
+ }
+
+ return r;
+}
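+
+/* Worked example (illustrative): with only the IPsec feature bit set in
+ * output_features.bitmap, count_trailing_zeros() yields that feature, the
+ * bit is cleared and the buffer is steered to the matching feature node;
+ * once the lowest set bit is INTF_OUTPUT_FEAT_DONE or beyond, the
+ * interface's own output node takes over. */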
diff --git a/vnet/vnet/ipsec/ipsec_cli.c b/vnet/vnet/ipsec/ipsec_cli.c
new file mode 100644
index 00000000000..0205d8bc877
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_cli.c
@@ -0,0 +1,710 @@
+/*
+ * ipsec_cli.c : IPSec CLI commands
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+#include <vnet/interface.h>
+
+#include <vnet/ipsec/ipsec.h>
+
+static clib_error_t *
+set_interface_spd_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ipsec_main_t *im = &ipsec_main;
+ u32 sw_if_index = (u32) ~0;
+ u32 spd_id;
+ int is_add = 1;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ if (unformat (line_input, "%U %u", unformat_vnet_sw_interface, im->vnet_main,
+ &sw_if_index, &spd_id))
+ ;
+ else if (unformat (line_input, "del"))
+ is_add = 0;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+
+ unformat_free (line_input);
+
+ ipsec_set_interface_spd(vm, sw_if_index, spd_id, is_add);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_interface_spd_command, static) = {
+ .path = "set interface ipsec spd",
+ .short_help =
+ "set interface ipsec spd <int> <id>",
+ .function = set_interface_spd_command_fn,
+};
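+
+/* Example use (interface name and SPD id are illustrative):
+ *   set interface ipsec spd GigabitEthernet0/8/0 1
+ */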
+
+static clib_error_t *
+ipsec_sa_add_del_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ipsec_sa_t sa;
+ int is_add = ~0;
+  u8 * ck = 0, * ik = 0;
+
+ memset(&sa, 0, sizeof(sa));
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "add %u", &sa.id))
+ is_add = 1;
+ else if (unformat (line_input, "del %u", &sa.id))
+ is_add = 0;
+ else if (unformat (line_input, "spi %u", &sa.spi))
+ ;
+ else if (unformat (line_input, "esp"))
+ sa.protocol = IPSEC_PROTOCOL_ESP;
+ else if (unformat (line_input, "ah"))
+ //sa.protocol = IPSEC_PROTOCOL_AH;
+ return clib_error_return(0, "unsupported security protocol 'AH'");
+ else if (unformat (line_input, "crypto-key %U", unformat_hex_string, &ck))
+ sa.crypto_key_len = vec_len (ck);
+ else if (unformat (line_input, "crypto-alg %U", unformat_ipsec_crypto_alg,
+ &sa.crypto_alg))
+ {
+ if (sa.crypto_alg < IPSEC_CRYPTO_ALG_AES_CBC_128 ||
+ sa.crypto_alg > IPSEC_CRYPTO_ALG_AES_CBC_256)
+ return clib_error_return(0, "unsupported crypto-alg: '%U'",
+ format_ipsec_crypto_alg, sa.crypto_alg);
+ }
+ else if (unformat (line_input, "integ-key %U", unformat_hex_string, &ik))
+ sa.integ_key_len = vec_len (ik);
+ else if (unformat (line_input, "integ-alg %U", unformat_ipsec_integ_alg,
+ &sa.integ_alg))
+ {
+ if (sa.integ_alg < IPSEC_INTEG_ALG_SHA1_96 ||
+ sa.integ_alg > IPSEC_INTEG_ALG_SHA_512_256)
+ return clib_error_return(0, "unsupported integ-alg: '%U'",
+ format_ipsec_integ_alg, sa.integ_alg);
+ }
+ else if (unformat (line_input, "tunnel-src %U",
+ unformat_ip4_address, &sa.tunnel_src_addr.ip4))
+ sa.is_tunnel = 1;
+ else if (unformat (line_input, "tunnel-dst %U",
+ unformat_ip4_address, &sa.tunnel_dst_addr.ip4))
+ sa.is_tunnel = 1;
+ else if (unformat (line_input, "tunnel-src %U",
+ unformat_ip6_address, &sa.tunnel_src_addr.ip6))
+ { sa.is_tunnel = 1; sa.is_tunnel_ip6 = 1; }
+ else if (unformat (line_input, "tunnel-dst %U",
+ unformat_ip6_address, &sa.tunnel_dst_addr.ip6))
+ { sa.is_tunnel = 1; sa.is_tunnel_ip6 = 1; }
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (sa.crypto_key_len > sizeof(sa.crypto_key))
+ sa.crypto_key_len = sizeof(sa.crypto_key);
+
+ if (sa.integ_key_len > sizeof(sa.integ_key))
+ sa.integ_key_len = sizeof(sa.integ_key);
+
+  if (ck)
+    memcpy(sa.crypto_key, ck, sa.crypto_key_len); /* keys are raw bytes and may contain NULs */
+
+  if (ik)
+    memcpy(sa.integ_key, ik, sa.integ_key_len);
+
+ ipsec_add_del_sa(vm, &sa, is_add);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ipsec_sa_add_del_command, static) = {
+ .path = "ipsec sa",
+ .short_help =
+ "ipsec sa [add|del]",
+ .function = ipsec_sa_add_del_command_fn,
+};
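+
+/* Example use, entered as one line (all ids and keys illustrative):
+ *   ipsec sa add 10 spi 1001 esp crypto-alg aes-cbc-128
+ *       crypto-key 4a506a794f574265564551694d653768
+ *       integ-alg sha1-96 integ-key 4339314b55523947594d6d3547666b45764e6a58
+ */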
+
+static clib_error_t *
+ipsec_spd_add_del_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 spd_id;
+ int is_add = ~0;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "add"))
+ is_add = 1;
+ else if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "%u", &spd_id))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ ipsec_add_del_spd(vm, spd_id, is_add);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ipsec_spd_add_del_command, static) = {
+ .path = "ipsec spd",
+ .short_help =
+ "ipsec spd [add|del] <id>",
+ .function = ipsec_spd_add_del_command_fn,
+};
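+
+/* Example use: "ipsec spd add 1" creates SPD 1; "ipsec spd del 1" removes
+ * it, detaching it from any interfaces first. */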
+
+
+static clib_error_t *
+ipsec_policy_add_del_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ipsec_policy_t p;
+ int is_add = 0;
+ int is_ip_any = 1;
+ u32 tmp, tmp2;
+
+ memset(&p, 0, sizeof(p));
+ p.lport.stop = p.rport.stop = ~0;
+ p.laddr.stop.ip4.as_u32 = p.raddr.stop.ip4.as_u32 = (u32) ~0;
+ p.laddr.stop.ip6.as_u64[0] = p.laddr.stop.ip6.as_u64[1] = (u64) ~0;
+ p.raddr.stop.ip6.as_u64[0] = p.raddr.stop.ip6.as_u64[1] = (u64) ~0;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "add"))
+ is_add = 1;
+ else if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "spd %u", &p.id))
+ ;
+ else if (unformat (line_input, "inbound"))
+ p.is_outbound = 0;
+ else if (unformat (line_input, "outbound"))
+ p.is_outbound = 1;
+ else if (unformat (line_input, "priority %d", &p.priority))
+ ;
+ else if (unformat (line_input, "protocol %u", &tmp))
+ p.protocol = (u8) tmp;
+ else if (unformat (line_input, "action %U", unformat_ipsec_policy_action,
+ &p.policy))
+ {
+ if (p.policy == IPSEC_POLICY_ACTION_RESOLVE)
+ return clib_error_return(0, "unsupported action: 'resolve'");
+ }
+ else if (unformat (line_input, "sa %u", &p.sa_id))
+ ;
+ else if (unformat (line_input, "local-ip-range %U - %U",
+ unformat_ip4_address, &p.laddr.start.ip4,
+ unformat_ip4_address, &p.laddr.stop.ip4))
+ is_ip_any = 0;
+ else if (unformat (line_input, "remote-ip-range %U - %U",
+ unformat_ip4_address, &p.raddr.start.ip4,
+ unformat_ip4_address, &p.raddr.stop.ip4))
+ is_ip_any = 0;
+ else if (unformat (line_input, "local-ip-range %U - %U",
+ unformat_ip6_address, &p.laddr.start.ip6,
+ unformat_ip6_address, &p.laddr.stop.ip6))
+ {
+ p.is_ipv6 = 1;
+ is_ip_any = 0;
+ }
+ else if (unformat (line_input, "remote-ip-range %U - %U",
+ unformat_ip6_address, &p.raddr.start.ip6,
+ unformat_ip6_address, &p.raddr.stop.ip6))
+ {
+ p.is_ipv6 = 1;
+ is_ip_any = 0;
+ }
+ else if (unformat (line_input, "local-port-range %u - %u", &tmp, &tmp2))
+ { p.lport.start = tmp; p.lport.stop = tmp2; }
+ else if (unformat (line_input, "remote-port-range %u - %u", &tmp, &tmp2))
+ { p.rport.start = tmp; p.rport.stop = tmp2; }
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ ipsec_add_del_policy(vm, &p, is_add);
+ if (is_ip_any)
+ {
+ p.is_ipv6 = 1;
+ ipsec_add_del_policy(vm, &p, is_add);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (ipsec_policy_add_del_command, static) = {
+ .path = "ipsec policy",
+ .short_help =
+ "ipsec policy [add|del] spd <id> priority <n> ",
+ .function = ipsec_policy_add_del_command_fn,
+};
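+
+/* Example use (ids and ranges illustrative):
+ *   ipsec policy add spd 1 priority 100 inbound action protect sa 10
+ *   ipsec policy add spd 1 priority 10 outbound action bypass
+ *       local-ip-range 10.0.0.0 - 10.0.0.255 local-port-range 0 - 65535
+ */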
+
+static clib_error_t *
+set_ipsec_sa_key_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ipsec_sa_t sa;
+  u8 * ck = 0, * ik = 0;
+
+ memset(&sa, 0, sizeof(sa));
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "%u", &sa.id))
+ ;
+ else if (unformat (line_input, "crypto-key %U", unformat_hex_string, &ck))
+ sa.crypto_key_len = vec_len (ck);
+ else if (unformat (line_input, "integ-key %U", unformat_hex_string, &ik))
+ sa.integ_key_len = vec_len (ik);
+ else
+ return clib_error_return (0, "parse error: '%U'", format_unformat_error,
+ line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (sa.crypto_key_len > sizeof(sa.crypto_key))
+ sa.crypto_key_len = sizeof(sa.crypto_key);
+
+ if (sa.integ_key_len > sizeof(sa.integ_key))
+ sa.integ_key_len = sizeof(sa.integ_key);
+
+  if (ck)
+    memcpy(sa.crypto_key, ck, sa.crypto_key_len); /* keys are raw bytes and may contain NULs */
+
+  if (ik)
+    memcpy(sa.integ_key, ik, sa.integ_key_len);
+
+ ipsec_set_sa_key(vm, &sa);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ipsec_sa_key_command, static) = {
+ .path = "set ipsec sa",
+ .short_help =
+ "set ipsec sa <id> crypto-key <key> integ-key <key>",
+ .function = set_ipsec_sa_key_command_fn,
+};
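+
+/* Example use (id and key illustrative):
+ *   set ipsec sa 10 crypto-key 4a506a794f574265564551694d653768
+ */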
+
+static clib_error_t *
+show_ipsec_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ipsec_spd_t * spd;
+ ipsec_sa_t * sa;
+ ipsec_policy_t * p;
+ ipsec_main_t * im = &ipsec_main;
+ u32 * i;
+ ipsec_tunnel_if_t * t;
+ vnet_hw_interface_t * hi;
+
+ pool_foreach (sa, im->sad, ({
+ if (sa->id) {
+ vlib_cli_output(vm, "sa %u spi %u mode %s protocol %s", sa->id, sa->spi,
+ sa->is_tunnel ? "tunnel" : "transport",
+ sa->protocol ? "esp" : "ah");
+ if (sa->protocol == IPSEC_PROTOCOL_ESP) {
+ vlib_cli_output(vm, " crypto alg %U%s%U integrity alg %U%s%U",
+ format_ipsec_crypto_alg, sa->crypto_alg,
+ sa->crypto_alg ? " key " : "",
+ format_hex_bytes, sa->crypto_key, sa->crypto_key_len,
+ format_ipsec_integ_alg, sa->integ_alg,
+ sa->integ_alg ? " key " : "",
+ format_hex_bytes, sa->integ_key, sa->integ_key_len);
+ }
+ if (sa->is_tunnel && sa->is_tunnel_ip6) {
+ vlib_cli_output(vm, " tunnel src %U dst %U",
+ format_ip6_address, &sa->tunnel_src_addr.ip6,
+ format_ip6_address, &sa->tunnel_dst_addr.ip6);
+ } else if (sa->is_tunnel) {
+ vlib_cli_output(vm, " tunnel src %U dst %U",
+ format_ip4_address, &sa->tunnel_src_addr.ip4,
+ format_ip4_address, &sa->tunnel_dst_addr.ip4);
+ }
+ }
+ }));
+
+ pool_foreach (spd, im->spds, ({
+ vlib_cli_output(vm, "spd %u", spd->id);
+
+ vlib_cli_output(vm, " outbound policies");
+ vec_foreach(i, spd->ipv4_outbound_policies)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ vlib_cli_output(vm, " priority %d action %U protocol %s%s",
+ p->priority,
+ format_ipsec_policy_action, p->policy,
+ p->protocol ?
+ format(0, "%U", format_ip_protocol, p->protocol) :
+ (u8 *) "any",
+ p->policy == IPSEC_POLICY_ACTION_PROTECT ?
+ format(0, " sa %u", p->sa_id) :
+ (u8 *) "");
+ vlib_cli_output(vm, " local addr range %U - %U port range %u - %u",
+ format_ip4_address, &p->laddr.start.ip4,
+ format_ip4_address, &p->laddr.stop.ip4,
+ p->lport.start, p->lport.stop);
+ vlib_cli_output(vm, " remte addr range %U - %U port range %u - %u",
+ format_ip4_address, &p->raddr.start.ip4,
+ format_ip4_address, &p->raddr.stop.ip4,
+ p->rport.start, p->rport.stop);
+ vlib_cli_output(vm, " packets %u bytes %u", p->counter.packets,
+ p->counter.bytes);
+ };
+ vec_foreach(i, spd->ipv6_outbound_policies)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ vlib_cli_output(vm, " priority %d action %U protocol %s%s",
+ p->priority,
+ format_ipsec_policy_action, p->policy,
+ p->protocol ?
+ format(0, "%U", format_ip_protocol, p->protocol) :
+ (u8 *) "any",
+ p->policy == IPSEC_POLICY_ACTION_PROTECT ?
+ format(0, " sa %u", p->sa_id) :
+ (u8 *) "");
+ vlib_cli_output(vm, " local addr range %U - %U port range %u - %u",
+ format_ip6_address, &p->laddr.start.ip6,
+ format_ip6_address, &p->laddr.stop.ip6,
+ p->lport.start, p->lport.stop);
+ vlib_cli_output(vm, " remote addr range %U - %U port range %u - %u",
+ format_ip6_address, &p->raddr.start.ip6,
+ format_ip6_address, &p->raddr.stop.ip6,
+ p->rport.start, p->rport.stop);
+ vlib_cli_output(vm, " packets %u bytes %u", p->counter.packets,
+ p->counter.bytes);
+ };
+ vlib_cli_output(vm, " inbound policies");
+ vec_foreach(i, spd->ipv4_inbound_protect_policy_indices)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ vlib_cli_output(vm, " priority %d action %U protocol %s%s",
+ p->priority,
+ format_ipsec_policy_action, p->policy,
+ p->protocol ?
+ format(0, "%U", format_ip_protocol, p->protocol) :
+ (u8 *) "any",
+ p->policy == IPSEC_POLICY_ACTION_PROTECT ?
+ format(0, " sa %u", p->sa_id) :
+ (u8 *) "");
+ vlib_cli_output(vm, " local addr range %U - %U port range %u - %u",
+ format_ip4_address, &p->laddr.start.ip4,
+ format_ip4_address, &p->laddr.stop.ip4,
+ p->lport.start, p->lport.stop);
+ vlib_cli_output(vm, " remte addr range %U - %U port range %u - %u",
+ format_ip4_address, &p->raddr.start.ip4,
+ format_ip4_address, &p->raddr.stop.ip4,
+ p->rport.start, p->rport.stop);
+ vlib_cli_output(vm, " packets %u bytes %u", p->counter.packets,
+ p->counter.bytes);
+ };
+ vec_foreach(i, spd->ipv4_inbound_policy_discard_and_bypass_indices)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ vlib_cli_output(vm, " priority %d action %U protocol %s%s",
+ p->priority,
+ format_ipsec_policy_action, p->policy,
+ p->protocol ?
+ format(0, "%U", format_ip_protocol, p->protocol) :
+ (u8 *) "any",
+ p->policy == IPSEC_POLICY_ACTION_PROTECT ?
+ format(0, " sa %u", p->sa_id) :
+ (u8 *) "");
+ vlib_cli_output(vm, " local addr range %U - %U port range %u - %u",
+ format_ip4_address, &p->laddr.start.ip4,
+ format_ip4_address, &p->laddr.stop.ip4,
+ p->lport.start, p->lport.stop);
+ vlib_cli_output(vm, " remte addr range %U - %U port range %u - %u",
+ format_ip4_address, &p->raddr.start.ip4,
+ format_ip4_address, &p->raddr.stop.ip4,
+ p->rport.start, p->rport.stop);
+ vlib_cli_output(vm, " packets %u bytes %u", p->counter.packets,
+ p->counter.bytes);
+ };
+ vec_foreach(i, spd->ipv6_inbound_protect_policy_indices)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ vlib_cli_output(vm, " priority %d action %U protocol %s%s",
+ p->priority,
+ format_ipsec_policy_action, p->policy,
+ p->protocol ?
+ format(0, "%U", format_ip_protocol, p->protocol) :
+ (u8 *) "any",
+ p->policy == IPSEC_POLICY_ACTION_PROTECT ?
+ format(0, " sa %u", p->sa_id) :
+ (u8 *) "");
+ vlib_cli_output(vm, " local addr range %U - %U port range %u - %u",
+ format_ip6_address, &p->laddr.start.ip6,
+ format_ip6_address, &p->laddr.stop.ip6,
+ p->lport.start, p->lport.stop);
+ vlib_cli_output(vm, " remote addr range %U - %U port range %u - %u",
+ format_ip6_address, &p->raddr.start.ip6,
+ format_ip6_address, &p->raddr.stop.ip6,
+ p->rport.start, p->rport.stop);
+ vlib_cli_output(vm, " packets %u bytes %u", p->counter.packets,
+ p->counter.bytes);
+ };
+ vec_foreach(i, spd->ipv6_inbound_policy_discard_and_bypass_indices)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ vlib_cli_output(vm, " priority %d action %U protocol %s%s",
+ p->priority,
+ format_ipsec_policy_action, p->policy,
+ p->protocol ?
+ format(0, "%U", format_ip_protocol, p->protocol) :
+ (u8 *) "any",
+ p->policy == IPSEC_POLICY_ACTION_PROTECT ?
+ format(0, " sa %u", p->sa_id) :
+ (u8 *) "");
+ vlib_cli_output(vm, " local addr range %U - %U port range %u - %u",
+ format_ip6_address, &p->laddr.start.ip6,
+ format_ip6_address, &p->laddr.stop.ip6,
+ p->lport.start, p->lport.stop);
+ vlib_cli_output(vm, " remote addr range %U - %U port range %u - %u",
+ format_ip6_address, &p->raddr.start.ip6,
+ format_ip6_address, &p->raddr.stop.ip6,
+ p->rport.start, p->rport.stop);
+ vlib_cli_output(vm, " packets %u bytes %u", p->counter.packets,
+ p->counter.bytes);
+ };
+ }));
+
+ vlib_cli_output(vm, "tunnel interfaces");
+ pool_foreach (t, im->tunnel_interfaces, ({
+ hi = vnet_get_hw_interface (im->vnet_main, t->hw_if_index);
+ vlib_cli_output(vm, " %s seq", hi->name);
+ sa = pool_elt_at_index(im->sad, t->output_sa_index);
+ vlib_cli_output(vm, " seq %u seq-hi %u esn %u anti-replay %u",
+ sa->seq, sa->seq_hi, sa->use_esn, sa->use_anti_replay);
+ vlib_cli_output(vm, " local-spi %u local-ip %U", sa->spi,
+ format_ip4_address, &sa->tunnel_src_addr.ip4);
+ vlib_cli_output(vm, " local-crypto %U %U",
+ format_ipsec_crypto_alg, sa->crypto_alg,
+ format_hex_bytes, sa->crypto_key, sa->crypto_key_len);
+ vlib_cli_output(vm, " local-integrity %U %U",
+ format_ipsec_integ_alg, sa->integ_alg,
+ format_hex_bytes, sa->integ_key, sa->integ_key_len);
+ sa = pool_elt_at_index(im->sad, t->input_sa_index);
+ vlib_cli_output(vm, " last-seq %u last-seq-hi %u esn %u anti-replay %u window %U",
+ sa->last_seq, sa->last_seq_hi, sa->use_esn,
+ sa->use_anti_replay,
+ format_ipsec_replay_window, sa->replay_window);
+ vlib_cli_output(vm, " remote-spi %u remote-ip %U", sa->spi,
+ format_ip4_address, &sa->tunnel_src_addr.ip4);
+ vlib_cli_output(vm, " remote-crypto %U %U",
+ format_ipsec_crypto_alg, sa->crypto_alg,
+ format_hex_bytes, sa->crypto_key, sa->crypto_key_len);
+ vlib_cli_output(vm, " remote-integrity %U %U",
+ format_ipsec_integ_alg, sa->integ_alg,
+ format_hex_bytes, sa->integ_key, sa->integ_key_len);
+ }));
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_ipsec_command, static) = {
+ .path = "show ipsec",
+ .short_help = "show ipsec",
+ .function = show_ipsec_command_fn,
+};
+
+static clib_error_t *
+clear_ipsec_counters_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ipsec_main_t * im = &ipsec_main;
+ ipsec_spd_t * spd;
+ ipsec_policy_t * p;
+
+ pool_foreach (spd, im->spds, ({
+ pool_foreach(p, spd->policies, ({
+ p->counter.packets = p->counter.bytes = 0;
+ }));
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (clear_ipsec_counters_command, static) = {
+ .path = "clear ipsec counters",
+ .short_help = "clear ipsec counters",
+ .function = clear_ipsec_counters_command_fn,
+};
+
+static clib_error_t *
+create_ipsec_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ipsec_add_del_tunnel_args_t a;
+ ipsec_main_t *im = &ipsec_main;
+ int rv;
+ u32 num_m_args = 0;
+ a.is_add = 1;
+ a.anti_replay = 0;
+ a.esn = 0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "local-ip %U", unformat_ip4_address, &a.local_ip))
+ num_m_args++;
+ else if (unformat (line_input, "remote-ip %U", unformat_ip4_address, &a.remote_ip))
+ num_m_args++;
+ else if (unformat (line_input, "local-spi %u", &a.local_spi))
+ num_m_args++;
+ else if (unformat (line_input, "remote-spi %u", &a.remote_spi))
+ num_m_args++;
+ else if (unformat (line_input, "del"))
+ a.is_add = 0;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free (line_input);
+
+ if (num_m_args < 4)
+ return clib_error_return (0, "mandatory argument(s) missing");
+
+ rv = ipsec_add_del_tunnel_if (im->vnet_main, &a);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_VALUE:
+ if (a.is_add)
+ return clib_error_return (0, "IPSec tunnel interface already exists...");
+ else
+ return clib_error_return (0, "IPSec tunnel interface not exists...");
+ default:
+ return clib_error_return (0, "ipsec_register_interface returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_ipsec_tunnel_command, static) = {
+ .path = "create ipsec tunnel",
+ .short_help = "create ipsec tunnel local-ip <addr> local-spi <spi> remote-ip <addr> remote-spi <spi>",
+ .function = create_ipsec_tunnel_command_fn,
+};
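+
+/* Example use (addresses and SPIs illustrative):
+ *   create ipsec tunnel local-ip 192.168.1.1 local-spi 1000
+ *       remote-ip 192.168.1.2 remote-spi 1001
+ */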
+
+static clib_error_t *
+set_interface_key_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ipsec_main_t *im = &ipsec_main;
+ ipsec_if_set_key_type_t type = IPSEC_IF_SET_KEY_TYPE_NONE;
+ u32 hw_if_index = (u32) ~0;
+ u32 alg;
+ u8 * key = 0;
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "%U",
+ unformat_vnet_hw_interface, im->vnet_main, &hw_if_index))
+ ;
+ else if (unformat (line_input, "local crypto %U", unformat_ipsec_crypto_alg, &alg))
+ type = IPSEC_IF_SET_KEY_TYPE_LOCAL_CRYPTO;
+ else if (unformat (line_input, "remote crypto %U", unformat_ipsec_crypto_alg, &alg))
+ type = IPSEC_IF_SET_KEY_TYPE_REMOTE_CRYPTO;
+ else if (unformat (line_input, "local integ %U", unformat_ipsec_integ_alg, &alg))
+ type = IPSEC_IF_SET_KEY_TYPE_LOCAL_INTEG;
+ else if (unformat (line_input, "remote integ %U", unformat_ipsec_integ_alg, &alg))
+ type = IPSEC_IF_SET_KEY_TYPE_REMOTE_INTEG;
+ else if (unformat (line_input, "%U", unformat_hex_string, &key))
+ ;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (type == IPSEC_IF_SET_KEY_TYPE_NONE)
+ return clib_error_return (0, "unknown key type");
+
+ if (alg > 0 && vec_len(key)==0)
+ return clib_error_return (0, "key is not specified");
+
+ if (hw_if_index == (u32) ~0)
+ return clib_error_return (0, "interface not specified");
+
+ ipsec_set_interface_key(im->vnet_main, hw_if_index, type, alg, key);
+ vec_free(key);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_interface_key_command, static) = {
+ .path = "set interface ipsec key",
+ .short_help =
+ "set interface ipsec key <int> <local|remote> <crypto|integ> <key type> <key>",
+ .function = set_interface_key_command_fn,
+};
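+
+/* Example use (interface name and key illustrative; "ipsec0" follows the
+ * ipsec%d device naming above):
+ *   set interface ipsec key ipsec0 local crypto aes-cbc-128
+ *       4a506a794f574265564551694d653768
+ */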
+
+
+clib_error_t *
+ipsec_cli_init (vlib_main_t * vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ipsec_cli_init);
+
diff --git a/vnet/vnet/ipsec/ipsec_format.c b/vnet/vnet/ipsec/ipsec_format.c
new file mode 100644
index 00000000000..f3720abf6c3
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_format.c
@@ -0,0 +1,133 @@
+/*
+ * ipsec_format.c : IPSec format/unformat functions
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+#include <vnet/interface.h>
+
+#include <vnet/ipsec/ipsec.h>
+
+u8 *
+format_ipsec_policy_action (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ char * t = 0;
+
+ switch (i)
+ {
+#define _(v,f,str) case IPSEC_POLICY_ACTION_##f: t = str; break;
+ foreach_ipsec_policy_action
+#undef _
+    default:
+      /* return here so the "%s" format below never sees t == 0 */
+      return format (s, "unknown");
+ }
+ s = format (s, "%s", t);
+ return s;
+}
+
+uword
+unformat_ipsec_policy_action (unformat_input_t * input, va_list * args)
+{
+ u32 * r = va_arg (*args, u32 *);
+
+ if (0) ;
+#define _(v,f,s) else if (unformat (input, s)) *r = IPSEC_POLICY_ACTION_##f;
+ foreach_ipsec_policy_action
+#undef _
+ else
+ return 0;
+ return 1;
+}
+
+u8 *
+format_ipsec_crypto_alg (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ u8 * t = 0;
+
+ switch (i)
+ {
+#define _(v,f,str) case IPSEC_CRYPTO_ALG_##f: t = (u8 *) str; break;
+ foreach_ipsec_crypto_alg
+#undef _
+    default:
+      return format (s, "unknown");
+ }
+ s = format (s, "%s", t);
+ return s;
+}
+
+uword
+unformat_ipsec_crypto_alg (unformat_input_t * input, va_list * args)
+{
+ u32 * r = va_arg (*args, u32 *);
+
+ if (0) ;
+#define _(v,f,s) else if (unformat (input, s)) *r = IPSEC_CRYPTO_ALG_##f;
+ foreach_ipsec_crypto_alg
+#undef _
+ else
+ return 0;
+ return 1;
+}
+
+u8 *
+format_ipsec_integ_alg (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ u8 * t = 0;
+
+ switch (i)
+ {
+#define _(v,f,str) case IPSEC_INTEG_ALG_##f: t = (u8 *) str; break;
+ foreach_ipsec_integ_alg
+#undef _
+    default:
+      return format (s, "unknown");
+ }
+ s = format (s, "%s", t);
+ return s;
+}
+
+uword
+unformat_ipsec_integ_alg (unformat_input_t * input, va_list * args)
+{
+ u32 * r = va_arg (*args, u32 *);
+
+ if (0) ;
+#define _(v,f,s) else if (unformat (input, s)) *r = IPSEC_INTEG_ALG_##f;
+ foreach_ipsec_integ_alg
+#undef _
+ else
+ return 0;
+ return 1;
+}
+
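+/* Render the 64-bit anti-replay window LSB first; e.g. w == 0x5 prints
+ * "1010" followed by sixty zeros. */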
+u8 *
+format_ipsec_replay_window(u8 * s, va_list * args)
+{
+ u64 w = va_arg (*args, u64);
+ u8 i;
+
+ for (i = 0; i < 64; i++)
+ {
+ s = format (s, "%u", w & (1ULL<<i) ? 1 : 0);
+ }
+
+ return s;
+}
diff --git a/vnet/vnet/ipsec/ipsec_if.c b/vnet/vnet/ipsec/ipsec_if.c
new file mode 100644
index 00000000000..f4c535840d2
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_if.c
@@ -0,0 +1,199 @@
+/*
+ * ipsec_if.c : IPSec interface support
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+
+static u8 * format_ipsec_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "ipsec%d", dev_instance);
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (ipsec_device_class,static) = {
+ .name = "IPSec",
+ .format_device_name = format_ipsec_name,
+ .format_tx_trace = format_ipsec_if_output_trace,
+ .tx_function = dummy_interface_tx,
+};
+
+VNET_HW_INTERFACE_CLASS (ipsec_hw_class) = {
+ .name = "IPSec",
+};
+
+u32
+ipsec_add_del_tunnel_if (vnet_main_t * vnm, ipsec_add_del_tunnel_args_t * args)
+{
+ ipsec_tunnel_if_t * t;
+ ipsec_main_t * im = &ipsec_main;
+ vnet_hw_interface_t * hi;
+ u32 hw_if_index = ~0;
+ uword *p;
+ ipsec_sa_t * sa;
+
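+  /* Tunnel lookup key: remote IPv4 address in the upper 32 bits,
+     remote SPI in the lower 32 bits. The input node rebuilds the same
+     key from the packet's outer source address and SPI. */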
+ u64 key = (u64) args->remote_ip.as_u32 << 32 | (u64) args->remote_spi;
+ p = hash_get (im->ipsec_if_pool_index_by_key, key);
+
+ if (args->is_add)
+ {
+ /* check if same src/dst pair exists */
+ if (p)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ pool_get_aligned (im->tunnel_interfaces, t, CLIB_CACHE_LINE_BYTES);
+ memset (t, 0, sizeof (*t));
+
+ pool_get (im->sad, sa);
+ memset (sa, 0, sizeof (*sa));
+ t->input_sa_index = sa - im->sad;
+ sa->spi = args->remote_spi;
+ sa->tunnel_src_addr.ip4.as_u32 = args->remote_ip.as_u32;
+ sa->tunnel_dst_addr.ip4.as_u32 = args->local_ip.as_u32;
+ sa->is_tunnel = 1;
+ sa->use_esn = args->esn;
+ sa->use_anti_replay = args->anti_replay;
+
+ pool_get (im->sad, sa);
+ memset (sa, 0, sizeof (*sa));
+ t->output_sa_index = sa - im->sad;
+ sa->spi = args->local_spi;
+ sa->tunnel_src_addr.ip4.as_u32 = args->local_ip.as_u32;
+ sa->tunnel_dst_addr.ip4.as_u32 = args->remote_ip.as_u32;
+ sa->is_tunnel = 1;
+ sa->seq = 1;
+ sa->use_esn = args->esn;
+ sa->use_anti_replay = args->anti_replay;
+
+ hash_set (im->ipsec_if_pool_index_by_key, key, t - im->tunnel_interfaces);
+
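+      /* Reuse a hw interface left behind by a deleted tunnel if one is
+         available; otherwise register a fresh one. */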
+ if (vec_len (im->free_tunnel_if_indices) > 0)
+ {
+ hw_if_index =
+ im->free_tunnel_if_indices[vec_len(im->free_tunnel_if_indices)-1];
+ _vec_len (im->free_tunnel_if_indices) -= 1;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface(vnm, ipsec_device_class.index,
+ t - im->tunnel_interfaces,
+ ipsec_hw_class.index,
+ t - im->tunnel_interfaces);
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->output_node_index = ipsec_if_output_node.index;
+ }
+ t->hw_if_index = hw_if_index;
+
+      /* first tunnel interface: register to receive ESP packets */
+ if (pool_elts(im->tunnel_interfaces) == 1)
+ ip4_register_protocol(IP_PROTOCOL_IPSEC_ESP, ipsec_if_input_node.index);
+
+ return hw_if_index;
+ }
+ else
+ {
+ /* check if exists */
+ if (!p)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ t = pool_elt_at_index(im->tunnel_interfaces, p[0]);
+ hi = vnet_get_hw_interface (vnm, t->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index, 0); /* admin down */
+ vec_add1 (im->free_tunnel_if_indices, t->hw_if_index);
+
+ /* delete input and output SA */
+ sa = pool_elt_at_index(im->sad, t->input_sa_index);
+ pool_put (im->sad, sa);
+ sa = pool_elt_at_index(im->sad, t->output_sa_index);
+ pool_put (im->sad, sa);
+
+ hash_unset (im->ipsec_if_pool_index_by_key, key);
+ pool_put (im->tunnel_interfaces, t);
+ }
+ return 0;
+}
+
+int
+ipsec_set_interface_key(vnet_main_t * vnm, u32 hw_if_index,
+ ipsec_if_set_key_type_t type, u8 alg, u8 * key)
+{
+ ipsec_main_t * im = &ipsec_main;
+ vnet_hw_interface_t * hi;
+ ipsec_tunnel_if_t * t;
+ ipsec_sa_t * sa;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ t = pool_elt_at_index (im->tunnel_interfaces, hi->dev_instance);
+
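+  /* Note: the key is copied verbatim; this assumes vec_len(key) does
+     not exceed the fixed-size key buffers in ipsec_sa_t. */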
+ if (type == IPSEC_IF_SET_KEY_TYPE_LOCAL_CRYPTO)
+ {
+ sa = pool_elt_at_index(im->sad, t->output_sa_index);
+ sa->crypto_alg = alg;
+ sa->crypto_key_len = vec_len(key);
+ memcpy(sa->crypto_key, key, vec_len(key));
+ }
+ else if (type == IPSEC_IF_SET_KEY_TYPE_LOCAL_INTEG)
+ {
+ sa = pool_elt_at_index(im->sad, t->output_sa_index);
+ sa->integ_alg = alg;
+ sa->integ_key_len = vec_len(key);
+ memcpy(sa->integ_key, key, vec_len(key));
+ }
+ else if (type == IPSEC_IF_SET_KEY_TYPE_REMOTE_CRYPTO)
+ {
+ sa = pool_elt_at_index(im->sad, t->input_sa_index);
+ sa->crypto_alg = alg;
+ sa->crypto_key_len = vec_len(key);
+ memcpy(sa->crypto_key, key, vec_len(key));
+ }
+ else if (type == IPSEC_IF_SET_KEY_TYPE_REMOTE_INTEG)
+ {
+ sa = pool_elt_at_index(im->sad, t->input_sa_index);
+ sa->integ_alg = alg;
+ sa->integ_key_len = vec_len(key);
+ memcpy(sa->integ_key, key, vec_len(key));
+ }
+ else
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ return 0;
+}
+
+
+clib_error_t *
+ipsec_tunnel_if_init (vlib_main_t * vm)
+{
+ ipsec_main_t * im = &ipsec_main;
+
+ im->ipsec_if_pool_index_by_key = hash_create (0, sizeof (uword));
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ipsec_tunnel_if_init);
+
diff --git a/vnet/vnet/ipsec/ipsec_if_in.c b/vnet/vnet/ipsec/ipsec_if_in.c
new file mode 100644
index 00000000000..517f8bff7b2
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_if_in.c
@@ -0,0 +1,151 @@
+/*
+ * ipsec_if_in.c : IPSec interface input node
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+/* Statistics (not really errors) */
+#define foreach_ipsec_if_input_error \
+_(RX, "good packets received")
+
+static char * ipsec_if_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ipsec_if_input_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) IPSEC_IF_INPUT_ERROR_##sym,
+ foreach_ipsec_if_input_error
+#undef _
+ IPSEC_IF_INPUT_N_ERROR,
+} ipsec_if_input_error_t;
+
+typedef enum {
+ IPSEC_IF_INPUT_NEXT_ESP_DECRYPT,
+ IPSEC_IF_INPUT_NEXT_DROP,
+ IPSEC_IF_INPUT_N_NEXT,
+} ipsec_if_input_next_t;
+
+typedef struct {
+ u32 spi;
+ u32 seq;
+} ipsec_if_input_trace_t;
+
+
+u8 * format_ipsec_if_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ipsec_if_input_trace_t * t
+ = va_arg (*args, ipsec_if_input_trace_t *);
+
+ s = format (s, "IPSec: spi %u seq %u", t->spi, t->seq);
+ return s;
+}
+
+static uword
+ipsec_if_input_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ ipsec_main_t *im = &ipsec_main;
+ u32 * from, * to_next = 0, next_index;
+ u32 n_left_from;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t * b0;
+ ip4_header_t *ip0;
+ esp_header_t *esp0;
+ uword * p;
+
+ bi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+ b0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (b0);
+ esp0 = (esp_header_t *) ((u8 *) ip0 + ip4_header_bytes (ip0));
+
+ next0 = IPSEC_IF_INPUT_NEXT_DROP;
+
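+          /* Rebuild the tunnel key used by ipsec_add_del_tunnel_if():
+             outer source address in the upper 32 bits, SPI (host
+             order) in the lower 32 bits. */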
+ u64 key = (u64) ip0->src_address.as_u32 << 32 |
+ (u64) clib_net_to_host_u32(esp0->spi);
+
+ p = hash_get (im->ipsec_if_pool_index_by_key, key);
+
+ if (p)
+ {
+ ipsec_tunnel_if_t * t;
+ t = pool_elt_at_index(im->tunnel_interfaces, p[0]);
+ vnet_buffer(b0)->output_features.ipsec_sad_index = t->input_sa_index;
+ vlib_buffer_advance(b0, ip4_header_bytes (ip0));
+ next0 = IPSEC_IF_INPUT_NEXT_ESP_DECRYPT;
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ipsec_if_input_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+            tr->spi = clib_net_to_host_u32(esp0->spi);
+            tr->seq = clib_net_to_host_u32(esp0->seq);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ipsec_if_input_node.index,
+ IPSEC_IF_INPUT_ERROR_RX,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ipsec_if_input_node) = {
+ .function = ipsec_if_input_node_fn,
+ .name = "ipsec-if-input",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ipsec_if_input_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ipsec_if_input_error_strings),
+ .error_strings = ipsec_if_input_error_strings,
+
+ .n_next_nodes = IPSEC_IF_INPUT_N_NEXT,
+
+ .next_nodes = {
+ [IPSEC_IF_INPUT_NEXT_ESP_DECRYPT] = "esp-decrypt",
+ [IPSEC_IF_INPUT_NEXT_DROP] = "error-drop",
+ },
+};
\ No newline at end of file
diff --git a/vnet/vnet/ipsec/ipsec_if_out.c b/vnet/vnet/ipsec/ipsec_if_out.c
new file mode 100644
index 00000000000..1e1dd52854b
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_if_out.c
@@ -0,0 +1,140 @@
+/*
+ * ipsec_if_out.c : IPSec interface output node
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+
+
+/* Statistics (not really errors) */
+#define foreach_ipsec_if_output_error \
+_(TX, "good packets transmitted")
+
+static char * ipsec_if_output_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ipsec_if_output_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) IPSEC_IF_OUTPUT_ERROR_##sym,
+ foreach_ipsec_if_output_error
+#undef _
+ IPSEC_IF_OUTPUT_N_ERROR,
+} ipsec_if_output_error_t;
+
+typedef enum {
+ IPSEC_IF_OUTPUT_NEXT_ESP_ENCRYPT,
+ IPSEC_IF_OUTPUT_NEXT_DROP,
+ IPSEC_IF_OUTPUT_N_NEXT,
+} ipsec_if_output_next_t;
+
+typedef struct {
+ u32 spi;
+ u32 seq;
+} ipsec_if_output_trace_t;
+
+
+u8 * format_ipsec_if_output_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ipsec_if_output_trace_t * t
+ = va_arg (*args, ipsec_if_output_trace_t *);
+
+ s = format (s, "IPSec: spi %u seq %u", t->spi, t->seq);
+ return s;
+}
+
+static uword
+ipsec_if_output_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ ipsec_main_t *im = &ipsec_main;
+ vnet_main_t * vnm = im->vnet_main;
+ u32 * from, * to_next = 0, next_index;
+ u32 n_left_from, sw_if_index0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t * b0;
+ ipsec_tunnel_if_t * t0;
+ vnet_hw_interface_t * hi0;
+
+ bi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+ b0 = vlib_get_buffer (vm, bi0);
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+ t0 = pool_elt_at_index (im->tunnel_interfaces, hi0->dev_instance);
+ vnet_buffer(b0)->output_features.ipsec_sad_index = t0->output_sa_index;
+ next0 = IPSEC_IF_OUTPUT_NEXT_ESP_ENCRYPT;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ipsec_if_output_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ ipsec_sa_t * sa0 = pool_elt_at_index(im->sad, t0->output_sa_index);
+ tr->spi = sa0->spi;
+ tr->seq = sa0->seq;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, ipsec_if_output_node.index,
+ IPSEC_IF_OUTPUT_ERROR_TX,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ipsec_if_output_node) = {
+ .function = ipsec_if_output_node_fn,
+ .name = "ipsec-if-output",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ipsec_if_output_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ipsec_if_output_error_strings),
+ .error_strings = ipsec_if_output_error_strings,
+
+ .n_next_nodes = IPSEC_IF_OUTPUT_N_NEXT,
+
+ .next_nodes = {
+ [IPSEC_IF_OUTPUT_NEXT_ESP_ENCRYPT] = "esp-encrypt",
+ [IPSEC_IF_OUTPUT_NEXT_DROP] = "error-drop",
+ },
+};
+
diff --git a/vnet/vnet/ipsec/ipsec_input.c b/vnet/vnet/ipsec/ipsec_input.c
new file mode 100644
index 00000000000..abb4a47485a
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_input.c
@@ -0,0 +1,406 @@
+/*
+ * ipsec_input.c : IPSec input node (tunnel decapsulation)
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+#include <vnet/ipsec/esp.h>
+
+#define foreach_ipsec_input_next \
+_(DROP, "error-drop") \
+_(ESP_DECRYPT, "esp-decrypt")
+
+#define _(v, s) IPSEC_INPUT_NEXT_##v,
+typedef enum {
+ foreach_ipsec_input_next
+#undef _
+ IPSEC_INPUT_N_NEXT,
+} ipsec_input_next_t;
+
+
+#define foreach_ipsec_input_error \
+ _(RX_PKTS, "IPSEC pkts received") \
+ _(DECRYPTION_FAILED, "IPSEC decryption failed")
+
+
+typedef enum {
+#define _(sym,str) IPSEC_INPUT_ERROR_##sym,
+ foreach_ipsec_input_error
+#undef _
+ IPSEC_INPUT_N_ERROR,
+} ipsec_input_error_t;
+
+static char * ipsec_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ipsec_input_error
+#undef _
+};
+
+vlib_node_registration_t ipsec_input_node;
+
+typedef struct {
+ u32 tunnel_index;
+ u32 spi;
+ u32 seq;
+} ipsec_input_trace_t;
+
+/* packet trace format function */
+static u8 * format_ipsec_input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ipsec_input_trace_t * t = va_arg (*args, ipsec_input_trace_t *);
+
+ if (t->tunnel_index != ~0)
+ {
+ s = format (s, "esp: tunnel %u spi %u seq %u", t->tunnel_index, t->spi, t->seq);
+ }
+ else
+ {
+ s = format (s, "esp: no tunnel spi %u seq %u",t->spi, t->seq);
+ }
+ return s;
+}
+
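+/* Match an inbound ESP packet against the SPD's PROTECT policies.
+   The SPI must match the policy's SA; for tunnel SAs the outer
+   addresses must equal the tunnel endpoints exactly, otherwise the
+   policy's local/remote address ranges are checked. */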
+always_inline ipsec_policy_t *
+ipsec_input_protect_policy_match(ipsec_spd_t * spd, u32 sa, u32 da, u32 spi)
+{
+ ipsec_main_t *im = &ipsec_main;
+ ipsec_policy_t * p;
+ ipsec_sa_t * s;
+ u32 * i;
+
+ vec_foreach(i, spd->ipv4_inbound_protect_policy_indices)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ s = pool_elt_at_index(im->sad, p->sa_index);
+
+ if (spi != s->spi)
+ continue;
+
+ if (s->is_tunnel)
+ {
+ if (da != clib_net_to_host_u32(s->tunnel_dst_addr.ip4.as_u32))
+ continue;
+
+ if (sa != clib_net_to_host_u32(s->tunnel_src_addr.ip4.as_u32))
+ continue;
+
+ return p;
+ }
+
+ if (da < clib_net_to_host_u32(p->laddr.start.ip4.as_u32))
+ continue;
+
+ if (da > clib_net_to_host_u32(p->laddr.stop.ip4.as_u32))
+ continue;
+
+ if (sa < clib_net_to_host_u32(p->raddr.start.ip4.as_u32))
+ continue;
+
+ if (sa > clib_net_to_host_u32(p->raddr.stop.ip4.as_u32))
+ continue;
+
+ return p;
+ }
+ return 0;
+}
+
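+/* Inclusive range check: memcmp over the big-endian (network order)
+   address bytes yields lexicographic order, which equals numeric
+   order for IPv6 addresses. */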
+always_inline uword
+ip6_addr_match_range (ip6_address_t * a, ip6_address_t * la, ip6_address_t * ua)
+{
+ if ((memcmp(a->as_u64, la->as_u64, 2 * sizeof(u64)) >= 0) &&
+ (memcmp(a->as_u64, ua->as_u64, 2 * sizeof(u64)) <= 0))
+ return 1;
+ return 0;
+}
+
+always_inline ipsec_policy_t *
+ipsec_input_ip6_protect_policy_match (ipsec_spd_t * spd,
+ ip6_address_t * sa,
+ ip6_address_t * da,
+ u32 spi)
+{
+ ipsec_main_t *im = &ipsec_main;
+ ipsec_policy_t * p;
+ ipsec_sa_t * s;
+ u32 * i;
+
+ vec_foreach(i, spd->ipv6_inbound_protect_policy_indices)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ s = pool_elt_at_index(im->sad, p->sa_index);
+
+ if (spi != s->spi)
+ continue;
+
+ if (s->is_tunnel)
+ {
+ if (!ip6_address_is_equal(sa, &s->tunnel_src_addr.ip6))
+ continue;
+
+ if (!ip6_address_is_equal(da, &s->tunnel_dst_addr.ip6))
+ continue;
+
+ return p;
+ }
+
+ if (!ip6_addr_match_range(sa, &p->raddr.start.ip6, &p->raddr.stop.ip6))
+ continue;
+
+ if (!ip6_addr_match_range(da, &p->laddr.start.ip6, &p->laddr.stop.ip6))
+ continue;
+
+ return p;
+ }
+ return 0;
+}
+
+static uword
+ipsec_input_ip4_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ ip4_main_t * i4m = &ip4_main;
+ ip_lookup_main_t * lm = &i4m->lookup_main;
+ ip_config_main_t * cm = &lm->rx_config_mains[VNET_UNICAST];
+ u32 n_left_from, *from, next_index, *to_next;
+ ipsec_main_t *im = &ipsec_main;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t * b0;
+ ip4_header_t *ip0;
+ esp_header_t *esp0;
+ ip4_ipsec_config_t * c0;
+ u32 tunnel_index0 = ~0;
+ ipsec_spd_t * spd0;
+
+ bi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ c0 = vnet_get_config_data (&cm->config_main,
+ &vnet_buffer (b0)->ip.current_config_index,
+ &next0, sizeof (c0[0]));
+
+ spd0 = pool_elt_at_index(im->spds, c0->spd_index);
+
+ ip0 = vlib_buffer_get_current (b0);
+ esp0 = (esp_header_t *) ((u8 *) ip0 + ip4_header_bytes (ip0));
+
+ if (PREDICT_TRUE(ip0->protocol == IP_PROTOCOL_IPSEC_ESP))
+ {
+#if 0
+ clib_warning("packet received from %U to %U spi %u size %u spd_id %u",
+ format_ip4_address, ip0->src_address.as_u8,
+ format_ip4_address, ip0->dst_address.as_u8,
+ clib_net_to_host_u32(esp0->spi),
+ clib_net_to_host_u16(ip0->length),
+ spd0->id);
+#endif
+ ipsec_policy_t * p0;
+ p0 = ipsec_input_protect_policy_match(spd0,
+ clib_net_to_host_u32(ip0->src_address.as_u32),
+ clib_net_to_host_u32(ip0->dst_address.as_u32),
+ clib_net_to_host_u32(esp0->spi));
+
+ if (PREDICT_TRUE(p0 != 0))
+ {
+ p0->counter.packets++;
+ p0->counter.bytes += clib_net_to_host_u16(ip0->length);
+ vnet_buffer(b0)->output_features.ipsec_sad_index = p0->sa_index;
+ next0 = IPSEC_INPUT_NEXT_ESP_DECRYPT;
+ vlib_buffer_advance(b0, ip4_header_bytes (ip0));
+ goto trace0;
+ }
+ }
+
+ /* FIXME bypass and discard */
+
+trace0:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ipsec_input_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = tunnel_index0;
+        tr->spi = clib_net_to_host_u32(esp0->spi);
+        tr->seq = clib_net_to_host_u32(esp0->seq);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, ipsec_input_ip4_node.index,
+ IPSEC_INPUT_ERROR_RX_PKTS,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (ipsec_input_ip4_node) = {
+ .function = ipsec_input_ip4_node_fn,
+ .name = "ipsec-input-ip4",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ipsec_input_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ipsec_input_error_strings),
+ .error_strings = ipsec_input_error_strings,
+
+ .n_next_nodes = IPSEC_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [IPSEC_INPUT_NEXT_##s] = n,
+ foreach_ipsec_input_next
+#undef _
+ },
+};
+
+
+static uword
+ipsec_input_ip6_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ ip6_main_t * i6m = &ip6_main;
+ ip_lookup_main_t * lm = &i6m->lookup_main;
+ ip_config_main_t * cm = &lm->rx_config_mains[VNET_UNICAST];
+ u32 n_left_from, *from, next_index, *to_next;
+ ipsec_main_t *im = &ipsec_main;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, next0;
+ vlib_buffer_t * b0;
+ ip6_header_t *ip0;
+ esp_header_t *esp0;
+ ip4_ipsec_config_t * c0;
+ u32 tunnel_index0 = ~0;
+ ipsec_spd_t * spd0;
+ u32 header_size = sizeof(ip0[0]);
+
+ bi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ c0 = vnet_get_config_data (&cm->config_main,
+ &vnet_buffer (b0)->ip.current_config_index,
+ &next0, sizeof (c0[0]));
+
+ spd0 = pool_elt_at_index(im->spds, c0->spd_index);
+
+ ip0 = vlib_buffer_get_current (b0);
+ esp0 = (esp_header_t *) ((u8 *) ip0 + header_size);
+
+ if (PREDICT_TRUE(ip0->protocol == IP_PROTOCOL_IPSEC_ESP))
+ {
+#if 0
+ clib_warning("packet received from %U to %U spi %u size %u spd_id %u",
+ format_ip6_address, &ip0->src_address,
+ format_ip6_address, &ip0->dst_address,
+ clib_net_to_host_u32(esp0->spi),
+ clib_net_to_host_u16(ip0->payload_length) + header_size,
+ spd0->id);
+#endif
+ ipsec_policy_t * p0;
+ p0 = ipsec_input_ip6_protect_policy_match(spd0,
+ &ip0->src_address,
+ &ip0->dst_address,
+ clib_net_to_host_u32(esp0->spi));
+
+ if (PREDICT_TRUE(p0 != 0))
+ {
+ p0->counter.packets++;
+ p0->counter.bytes += clib_net_to_host_u16(ip0->payload_length);
+ p0->counter.bytes += header_size;
+ vnet_buffer(b0)->output_features.ipsec_sad_index = p0->sa_index;
+ next0 = IPSEC_INPUT_NEXT_ESP_DECRYPT;
+ vlib_buffer_advance(b0, header_size);
+ goto trace0;
+ }
+ }
+
+trace0:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ipsec_input_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = tunnel_index0;
+        tr->spi = clib_net_to_host_u32(esp0->spi);
+        tr->seq = clib_net_to_host_u32(esp0->seq);
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
+ n_left_to_next, bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, ipsec_input_ip6_node.index,
+ IPSEC_INPUT_ERROR_RX_PKTS,
+ from_frame->n_vectors);
+
+ return from_frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (ipsec_input_ip6_node) = {
+ .function = ipsec_input_ip6_node_fn,
+ .name = "ipsec-input-ip6",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ipsec_input_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ipsec_input_error_strings),
+ .error_strings = ipsec_input_error_strings,
+
+ .n_next_nodes = IPSEC_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [IPSEC_INPUT_NEXT_##s] = n,
+ foreach_ipsec_input_next
+#undef _
+ },
+};
diff --git a/vnet/vnet/ipsec/ipsec_output.c b/vnet/vnet/ipsec/ipsec_output.c
new file mode 100644
index 00000000000..77b39fa9ee4
--- /dev/null
+++ b/vnet/vnet/ipsec/ipsec_output.c
@@ -0,0 +1,405 @@
+/*
+ * ipsec_output.c : IPSec output node
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/api_errno.h>
+#include <vnet/ip/ip.h>
+
+#include <vnet/ipsec/ipsec.h>
+
+
+#define foreach_ipsec_output_next \
+_(DROP, "error-drop") \
+_(ESP_ENCRYPT, "esp-encrypt")
+
+#define _(v, s) IPSEC_OUTPUT_NEXT_##v,
+typedef enum {
+ foreach_intf_output_feat
+ foreach_ipsec_output_next
+#undef _
+ IPSEC_OUTPUT_N_NEXT,
+} ipsec_output_next_t;
+
+
+#define foreach_ipsec_output_error \
+ _(RX_PKTS, "IPSec pkts received") \
+ _(POLICY_DISCARD, "IPSec policy discard") \
+ _(POLICY_NO_MATCH, "IPSec policy (no match)") \
+ _(POLICY_PROTECT, "IPSec policy protect") \
+ _(POLICY_BYPASS, "IPSec policy bypass") \
+ _(ENCAPS_FAILED, "IPSec encapsulation failed")
+
+
+typedef enum {
+#define _(sym,str) IPSEC_OUTPUT_ERROR_##sym,
+ foreach_ipsec_output_error
+#undef _
+  IPSEC_OUTPUT_N_ERROR,
+} ipsec_output_error_t;
+
+static char * ipsec_output_error_strings[] = {
+#define _(sym,string) string,
+ foreach_ipsec_output_error
+#undef _
+};
+
+vlib_node_registration_t ipsec_output_node;
+
+typedef struct {
+ u32 spd_id;
+} ipsec_output_trace_t;
+
+/* packet trace format function */
+static u8 * format_ipsec_output_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ ipsec_output_trace_t * t = va_arg (*args, ipsec_output_trace_t *);
+
+ if (t->spd_id != ~0)
+ {
+ s = format (s, "spd %u ", t->spd_id);
+ }
+ else
+ {
+ s = format (s, "no spd");
+ }
+ return s;
+}
+
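+/* Return the lowest set bit of the buffer's output feature bitmap and
+   clear it, unless it is the final "done" marker. */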
+always_inline intf_output_feat_t
+get_next_intf_output_feature_and_reset_bit(vlib_buffer_t *b)
+{
+ u32 next_feature;
+ count_trailing_zeros(next_feature, vnet_buffer(b)->output_features.bitmap);
+ if (next_feature != INTF_OUTPUT_FEAT_DONE)
+ vnet_buffer(b)->output_features.bitmap &= ~(1 << next_feature);
+ return next_feature;
+}
+
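+/* Linear walk of the outbound SPD: the first policy whose protocol,
+   local/remote address ranges and (for TCP/UDP only) port ranges all
+   match wins. Addresses and ports are passed in host byte order. */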
+always_inline ipsec_policy_t *
+ipsec_output_policy_match(ipsec_spd_t * spd, u8 pr, u32 la, u32 ra, u16 lp, u16 rp)
+{
+ ipsec_policy_t * p;
+ u32 * i;
+
+ vec_foreach(i, spd->ipv4_outbound_policies)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ if (PREDICT_FALSE(p->protocol && (p->protocol != pr)))
+ continue;
+
+ if (la < clib_net_to_host_u32(p->laddr.start.ip4.as_u32))
+ continue;
+
+ if (la > clib_net_to_host_u32(p->laddr.stop.ip4.as_u32))
+ continue;
+
+ if (ra < clib_net_to_host_u32(p->raddr.start.ip4.as_u32))
+ continue;
+
+ if (ra > clib_net_to_host_u32(p->raddr.stop.ip4.as_u32))
+ continue;
+
+ if (PREDICT_FALSE((pr != IP_PROTOCOL_TCP) && (pr != IP_PROTOCOL_UDP)))
+ return p;
+
+ if (lp < p->lport.start)
+ continue;
+
+ if (lp > p->lport.stop)
+ continue;
+
+ if (rp < p->rport.start)
+ continue;
+
+ if (rp > p->rport.stop)
+ continue;
+
+ return p;
+ }
+ return 0;
+}
+
+always_inline uword
+ip6_addr_match_range (ip6_address_t * a, ip6_address_t * la, ip6_address_t * ua)
+{
+ if ((memcmp(a->as_u64, la->as_u64, 2 * sizeof(u64)) >= 0) &&
+ (memcmp(a->as_u64, ua->as_u64, 2 * sizeof(u64)) <= 0))
+ return 1;
+ return 0;
+}
+
+always_inline ipsec_policy_t *
+ipsec_output_ip6_policy_match (ipsec_spd_t * spd,
+ ip6_address_t * sa,
+ ip6_address_t * da,
+ u16 lp,
+ u16 rp,
+ u8 pr)
+{
+ ipsec_policy_t * p;
+ u32 * i;
+
+ vec_foreach(i, spd->ipv6_outbound_policies)
+ {
+ p = pool_elt_at_index(spd->policies, *i);
+ if (PREDICT_FALSE(p->protocol && (p->protocol != pr)))
+ continue;
+
+ if (!ip6_addr_match_range(sa, &p->raddr.start.ip6, &p->raddr.stop.ip6))
+ continue;
+
+ if (!ip6_addr_match_range(da, &p->laddr.start.ip6, &p->laddr.stop.ip6))
+ continue;
+
+ if (PREDICT_FALSE((pr != IP_PROTOCOL_TCP) && (pr != IP_PROTOCOL_UDP)))
+ return p;
+
+ if (lp < p->lport.start)
+ continue;
+
+ if (lp > p->lport.stop)
+ continue;
+
+ if (rp < p->rport.start)
+ continue;
+
+ if (rp > p->rport.stop)
+ continue;
+
+ return p;
+ }
+
+ return 0;
+}
+
+static uword
+ipsec_output_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ ipsec_main_t *im = &ipsec_main;
+ vnet_main_t * vnm = im->vnet_main;
+
+ u32 * from, * to_next = 0;
+ u32 n_left_from, sw_if_index0, last_sw_if_index = (u32) ~0;
+ u32 next_node_index = (u32)~0, last_next_node_index = (u32) ~0;
+ vlib_frame_t *f = 0;
+ u32 spd_index0 = ~0;
+ ipsec_spd_t * spd0 = 0;
+ u64 nc_protect = 0, nc_bypass = 0, nc_discard = 0, nc_nomatch = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ipsec_policy_t * p0;
+ ip4_header_t * ip0;
+ ip6_header_t * ip6_0 = 0;
+ udp_header_t * udp0;
+ u8 is_ipv6 = 0;
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+
+
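+      /* This node runs as an interface output feature, so the buffer
+         still carries the ethernet header; the IP header follows it. */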
+ ip0 = (ip4_header_t *) ((u8 *) vlib_buffer_get_current (b0) +
+ sizeof(ethernet_header_t));
+
+      /* ipv6 is handled below; anything that is neither ipv4 nor ipv6
+         is simply forwarded to the next output feature */
+ if (PREDICT_FALSE((ip0->ip_version_and_header_length & 0xF0 ) != 0x40))
+ {
+ /* ipv6 packets */
+ if (PREDICT_TRUE((ip0->ip_version_and_header_length & 0xF0 ) == 0x60))
+ {
+ is_ipv6 = 1;
+ ip6_0 = (ip6_header_t *) ((u8 *) vlib_buffer_get_current (b0) +
+ sizeof(ethernet_header_t));
+ }
+ else
+ {
+ next_node_index = get_next_output_feature_node_index(vnm, b0);
+ goto dispatch0;
+ }
+ }
+
+      /* look up the SPD only when sw_if_index changes */
+ if (PREDICT_FALSE(last_sw_if_index != sw_if_index0))
+ {
+ uword * p = hash_get (im->spd_index_by_sw_if_index, sw_if_index0);
+ ASSERT(p);
+ spd_index0 = p[0];
+ spd0 = pool_elt_at_index(im->spds, spd_index0);
+ last_sw_if_index = sw_if_index0;
+ }
+
+ if (is_ipv6)
+ {
+ udp0 = ip6_next_header(ip6_0);
+#if 0
+ clib_warning("packet received from %U port %u to %U port %u spd_id %u",
+ format_ip6_address, &ip6_0->src_address,
+ clib_net_to_host_u16(udp0->src_port),
+ format_ip6_address, &ip6_0->dst_address,
+ clib_net_to_host_u16(udp0->dst_port),
+ spd0->id);
+#endif
+
+ p0 = ipsec_output_ip6_policy_match(spd0,
+ &ip6_0->src_address,
+ &ip6_0->dst_address,
+ clib_net_to_host_u16(udp0->src_port),
+ clib_net_to_host_u16(udp0->dst_port),
+ ip6_0->protocol);
+ }
+ else
+ {
+ udp0 = (udp_header_t *) ((u8 *) ip0 + ip4_header_bytes (ip0));
+
+#if 0
+ clib_warning("packet received from %U to %U port %u",
+ format_ip4_address, ip0->src_address.as_u8,
+ format_ip4_address, ip0->dst_address.as_u8,
+ clib_net_to_host_u16(udp0->dst_port));
+ clib_warning("sw_if_index0 %u spd_index0 %u spd_id %u",
+ sw_if_index0, spd_index0, spd0->id);
+#endif
+
+ p0 = ipsec_output_policy_match(spd0, ip0->protocol,
+ clib_net_to_host_u32(ip0->src_address.as_u32),
+ clib_net_to_host_u32(ip0->dst_address.as_u32),
+ clib_net_to_host_u16(udp0->src_port),
+ clib_net_to_host_u16(udp0->dst_port));
+ }
+
+ if (PREDICT_TRUE(p0 != NULL))
+ {
+ if (p0->policy == IPSEC_POLICY_ACTION_PROTECT)
+ {
+ nc_protect++;
+ next_node_index = im->esp_encrypt_node_index;
+ vnet_buffer(b0)->output_features.ipsec_sad_index = p0->sa_index;
+ vlib_buffer_advance(b0, sizeof(ethernet_header_t));
+ p0->counter.packets++;
+ if (is_ipv6)
+ {
+ p0->counter.bytes += clib_net_to_host_u16(ip6_0->payload_length);
+ p0->counter.bytes += sizeof(ip6_header_t);
+ }
+ else
+ {
+ p0->counter.bytes += clib_net_to_host_u16(ip0->length);
+ }
+ }
+ else if (p0->policy == IPSEC_POLICY_ACTION_BYPASS)
+ {
+ nc_bypass++;
+ next_node_index = get_next_output_feature_node_index(vnm, b0);
+ p0->counter.packets++;
+ if (is_ipv6)
+ {
+ p0->counter.bytes += clib_net_to_host_u16(ip6_0->payload_length);
+ p0->counter.bytes += sizeof(ip6_header_t);
+ }
+ else
+ {
+ p0->counter.bytes += clib_net_to_host_u16(ip0->length);
+ }
+ }
+ else
+ {
+ nc_discard++;
+ p0->counter.packets++;
+ if (is_ipv6)
+ {
+ p0->counter.bytes += clib_net_to_host_u16(ip6_0->payload_length);
+ p0->counter.bytes += sizeof(ip6_header_t);
+ }
+ else
+ {
+ p0->counter.bytes += clib_net_to_host_u16(ip0->length);
+ }
+ next_node_index = im->error_drop_node_index;
+ }
+ }
+ else
+ {
+ nc_nomatch++;
+ next_node_index = im->error_drop_node_index;
+ }
+
+dispatch0:
+ from += 1;
+ n_left_from -= 1;
+
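+      /* Packets are batched into per-next-node frames: a new frame is
+         fetched only when the next node changes, after handing off the
+         previous frame. */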
+ if (PREDICT_FALSE((last_next_node_index != next_node_index)))
+ {
+          /* hand off the previous frame, if any */
+ if (f)
+ vlib_put_frame_to_node (vm, last_next_node_index, f);
+
+ last_next_node_index = next_node_index;
+
+ f = vlib_get_frame_to_node(vm, next_node_index);
+ to_next = vlib_frame_vector_args (f);
+ }
+
+ to_next[0] = bi0;
+ to_next+=1;
+ f->n_vectors++;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) {
+ ipsec_output_trace_t *tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ if (spd0)
+ tr->spd_id = spd0->id;
+ }
+ }
+
+ vlib_put_frame_to_node (vm, next_node_index, f);
+ vlib_node_increment_counter (vm, ipsec_output_node.index,
+ IPSEC_OUTPUT_ERROR_POLICY_PROTECT, nc_protect);
+ vlib_node_increment_counter (vm, ipsec_output_node.index,
+ IPSEC_OUTPUT_ERROR_POLICY_BYPASS, nc_bypass);
+ vlib_node_increment_counter (vm, ipsec_output_node.index,
+ IPSEC_OUTPUT_ERROR_POLICY_DISCARD, nc_discard);
+ vlib_node_increment_counter (vm, ipsec_output_node.index,
+ IPSEC_OUTPUT_ERROR_POLICY_NO_MATCH, nc_nomatch);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ipsec_output_node) = {
+ .function = ipsec_output_node_fn,
+ .name = "ipsec-output",
+ .vector_size = sizeof (u32),
+ .format_trace = format_ipsec_output_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(ipsec_output_error_strings),
+ .error_strings = ipsec_output_error_strings,
+
+ .n_next_nodes = IPSEC_OUTPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [IPSEC_OUTPUT_NEXT_##s] = n,
+ foreach_intf_output_feat
+ foreach_ipsec_output_next
+#undef _
+ },
+};
diff --git a/vnet/vnet/l2/feat_bitmap.c b/vnet/vnet/l2/feat_bitmap.c
new file mode 100644
index 00000000000..74917cda3ae
--- /dev/null
+++ b/vnet/vnet/l2/feat_bitmap.c
@@ -0,0 +1,166 @@
+/*
+ * feat_bitmap.c: bitmap for managing feature invocation
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+
+
+// Drop node for feature bitmaps.
+// Used by features that simply drop packets and by features that are
+// not yet implemented. Initial feature dispatch nodes need not set
+// b0->error for a possible drop, because that is done here.
+// The next node is always error-drop.
+
+
+static vlib_node_registration_t feat_bitmap_drop_node;
+
+#define foreach_feat_bitmap_drop_error \
+_(NO_FWD, "L2 feature forwarding disabled") \
+_(NYI, "L2 feature not implemented")
+
+typedef enum {
+#define _(sym,str) FEAT_BITMAP_DROP_ERROR_##sym,
+ foreach_feat_bitmap_drop_error
+#undef _
+ FEAT_BITMAP_DROP_N_ERROR,
+} feat_bitmap_drop_error_t;
+
+static char * feat_bitmap_drop_error_strings[] = {
+#define _(sym,string) string,
+ foreach_feat_bitmap_drop_error
+#undef _
+};
+
+typedef enum {
+ FEAT_BITMAP_DROP_NEXT_DROP,
+ FEAT_BITMAP_DROP_N_NEXT,
+} feat_bitmap_drop_next_t;
+
+typedef struct {
+ u32 feature_bitmap;
+} feat_bitmap_drop_trace_t;
+
+/* packet trace format function */
+static u8 * format_feat_bitmap_drop_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ feat_bitmap_drop_trace_t * t = va_arg (*args, feat_bitmap_drop_trace_t *);
+
+ s = format (s, "feat_bitmap_drop: feature bitmap 0x%08x", t->feature_bitmap);
+ return s;
+}
+
+static uword
+feat_bitmap_drop_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ feat_bitmap_drop_next_t next_index;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ feat_bitmap_drop_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->feature_bitmap = vnet_buffer(b0)->l2.feature_bitmap;
+ }
+
+ if (vnet_buffer(b0)->l2.feature_bitmap == 1) {
+            // Only the last feature bit remains set: this is the
+            // no-forwarding catch-all.
+ b0->error = node->errors[FEAT_BITMAP_DROP_ERROR_NO_FWD];
+ } else {
+ b0->error = node->errors[FEAT_BITMAP_DROP_ERROR_NYI];
+ }
+ next0 = FEAT_BITMAP_DROP_NEXT_DROP;
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+clib_error_t *feat_bitmap_drop_init (vlib_main_t *vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (feat_bitmap_drop_init);
+
+VLIB_REGISTER_NODE (feat_bitmap_drop_node,static) = {
+ .function = feat_bitmap_drop_node_fn,
+ .name = "feature-bitmap-drop",
+ .vector_size = sizeof (u32),
+ .format_trace = format_feat_bitmap_drop_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(feat_bitmap_drop_error_strings),
+ .error_strings = feat_bitmap_drop_error_strings,
+
+ .n_next_nodes = FEAT_BITMAP_DROP_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [FEAT_BITMAP_DROP_NEXT_DROP] = "error-drop",
+ },
+};
+
+
diff --git a/vnet/vnet/l2/feat_bitmap.h b/vnet/vnet/l2/feat_bitmap.h
new file mode 100644
index 00000000000..7dd36a7712e
--- /dev/null
+++ b/vnet/vnet/l2/feat_bitmap.h
@@ -0,0 +1,80 @@
+/*
+ * feat_bitmap.h: bitmap for managing feature invocation
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_l2_feat_bitmap_h
+#define included_vnet_l2_feat_bitmap_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+/*
+ * The feature bitmap is a way of organizing input and output feature graph nodes.
+ * The set of features to be executed are arranged in a bitmap with one bit per
+ * feature and each bit positioned in the same order that the features should be
+ * executed. Features can be dynamically removed from the set by masking off their
+ * corresponding bits. The bitmap is stored in packet context. Each feature clears
+ * its bit and then calls feat_bitmap_get_next_node_index() to go to the next
+ * graph node.
+ */
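+
+/*
+ * Illustrative sketch only (MY_FEATURE_BIT and my_next_nodes are
+ * placeholder names, not definitions from this patch): a feature
+ * node's dispatch typically clears its own bit and branches to the
+ * node owning the next set bit:
+ *
+ *   u32 fb = vnet_buffer (b0)->l2.feature_bitmap & ~MY_FEATURE_BIT;
+ *   vnet_buffer (b0)->l2.feature_bitmap = fb;
+ *   next0 = feat_bitmap_get_next_node_index (my_next_nodes, fb);
+ */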
+
+
+// 32 features in a u32 bitmap
+#define FEAT_MAX 32
+
+// Initialize the feature next-node indexes of a graph node.
+// Should be called by the init function of each feature graph node.
+always_inline
+void feat_bitmap_init_next_nodes (
+ vlib_main_t * vm,
+ u32 node_index, // the current graph node index
+ u32 num_features, // number of entries in feat_names
+ char ** feat_names, // array of feature graph node names
+ u32 * next_nodes) // array of 32 next indexes to init
+{
+ u32 idx;
+
+ ASSERT(num_features <= FEAT_MAX);
+
+ for (idx=0; idx<num_features; idx++) {
+ if (vlib_get_node_by_name(vm, (u8 *) feat_names[idx])) {
+ next_nodes[idx] =
+ vlib_node_add_named_next(vm, node_index, feat_names[idx]);
+ } else { // Node may be in plugin which is not installed, use drop node
+ next_nodes[idx] =
+ vlib_node_add_named_next(vm, node_index, "feature-bitmap-drop");
+ }
+ }
+
+ // All unassigned bits go to the drop node
+ for (; idx<FEAT_MAX; idx++) {
+ next_nodes[idx] = vlib_node_add_named_next(vm, node_index, "feature-bitmap-drop");
+ }
+}
+
+// Return the graph node index for the feature corresponding to the
+// most significant set bit in the bitmap (features execute from the
+// highest bit downward).
+always_inline
+u32 feat_bitmap_get_next_node_index (u32 * next_nodes, u32 bitmap)
+{
+ u32 first_bit;
+
+ count_leading_zeros(first_bit, bitmap);
+ first_bit = uword_bits - 1 - first_bit;
+ return next_nodes[first_bit];
+}
+
+#endif // included_vnet_l2_feat_bitmap_h
diff --git a/vnet/vnet/l2/l2_bd.c b/vnet/vnet/l2/l2_bd.c
new file mode 100644
index 00000000000..24f96d5749c
--- /dev/null
+++ b/vnet/vnet/l2/l2_bd.c
@@ -0,0 +1,695 @@
+/*
+ * l2_bd.c : layer 2 bridge domain
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vlib/cli.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ip/format.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_bd.h>
+#include <vnet/l2/l2_fib.h>
+#include <vnet/l2/l2_vtr.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/vec.h>
+
+bd_main_t bd_main;
+
+// Initialize the bridge domain if not already done.
+// The feature bitmap defaults to all bits set except ARP termination.
+inline void
+bd_validate (l2_bridge_domain_t * bd_config)
+{
+ if (!bd_is_valid (bd_config)) {
+ bd_config->feature_bitmap = ~L2INPUT_FEAT_ARP_TERM;
+ bd_config->bvi_sw_if_index = ~0;
+ bd_config->members = 0;
+ bd_config->mac_by_ip4 = 0;
+// bd_config->mac_by_ip6 = hash_create_mem (0, sizeof(ip6_address_t),
+// sizeof(uword));
+ }
+}
+
+u32 bd_find_or_add_bd_index (bd_main_t * bdm, u32 bd_id)
+{
+ uword * p;
+ u32 rv;
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (p)
+ return (p[0]);
+
+ rv = clib_bitmap_first_clear (bdm->bd_index_bitmap);
+
+ // mark this index busy
+ bdm->bd_index_bitmap = clib_bitmap_set (bdm->bd_index_bitmap, rv, 1);
+
+ hash_set (bdm->bd_index_by_bd_id, bd_id, rv);
+
+ vec_validate (l2input_main.bd_configs, rv);
+ l2input_main.bd_configs[rv].bd_id = bd_id;
+
+ return rv;
+}
+
+int bd_delete_bd_index (bd_main_t * bdm, u32 bd_id)
+{
+ uword * p;
+ u32 bd_index;
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (p == 0)
+ return -1;
+
+ bd_index = p[0];
+
+ // mark this index clear
+ bdm->bd_index_bitmap = clib_bitmap_set (bdm->bd_index_bitmap, bd_index, 0);
+ hash_unset (bdm->bd_index_by_bd_id, bd_id);
+
+ l2input_main.bd_configs[bd_index].bd_id = ~0;
+ l2input_main.bd_configs[bd_index].feature_bitmap = 0;
+
+ return 0;
+}
+
+void
+bd_add_member (l2_bridge_domain_t * bd_config,
+ l2_flood_member_t * member)
+{
+ // Add one element to the vector
+
+ // When flooding, the bvi interface (if present) must be the last member
+ // processed due to how BVI processing can change the packet. To enable
+ // this order, we make the bvi interface the first in the vector and
+ // flooding walks the vector in reverse.
+ if ((member->flags == L2_FLOOD_MEMBER_NORMAL) ||
+ (vec_len(bd_config->members) == 0)) {
+ vec_add1 (bd_config->members, *member);
+
+ } else {
+ // Move 0th element to the end
+ vec_add1 (bd_config->members, bd_config->members[0]);
+ bd_config->members[0] = *member;
+ }
+}
+
+
+#define BD_REMOVE_ERROR_OK 0
+#define BD_REMOVE_ERROR_NOT_FOUND 1
+
+u32
+bd_remove_member (l2_bridge_domain_t * bd_config,
+ u32 sw_if_index)
+{
+ u32 ix;
+
+ // Find and delete the member
+ vec_foreach_index(ix, bd_config->members) {
+ if (vec_elt(bd_config->members, ix).sw_if_index == sw_if_index) {
+ vec_del1 (bd_config->members, ix);
+ return BD_REMOVE_ERROR_OK;
+ }
+ }
+
+ return BD_REMOVE_ERROR_NOT_FOUND;
+}
+
+
+clib_error_t *l2bd_init (vlib_main_t *vm)
+{
+ bd_main_t *bdm = &bd_main;
+ u32 bd_index;
+ bdm->bd_index_by_bd_id = hash_create (0, sizeof(uword));
+ // create a dummy bd with bd_id of 0 and bd_index of 0 with feature set
+ // to packet drop only. Thus, packets received from any L2 interface with
+ // uninitialized bd_index of 0 can be dropped safely.
+ bd_index = bd_find_or_add_bd_index (bdm, 0);
+ ASSERT (bd_index == 0);
+ l2input_main.bd_configs[0].feature_bitmap = L2INPUT_FEAT_DROP;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2bd_init);
+
+
+// Set the learn/forward/flood flags for the bridge domain
+// Return 0 if ok, non-zero if for an error.
+u32
+bd_set_flags (vlib_main_t * vm,
+ u32 bd_index,
+ u32 flags,
+ u32 enable) {
+
+ l2_bridge_domain_t * bd_config;
+ u32 feature_bitmap = 0;
+
+ vec_validate (l2input_main.bd_configs, bd_index);
+ bd_config = vec_elt_at_index(l2input_main.bd_configs, bd_index);
+
+ bd_validate (bd_config);
+
+ if (flags & L2_LEARN) {
+ feature_bitmap |= L2INPUT_FEAT_LEARN;
+ }
+ if (flags & L2_FWD) {
+ feature_bitmap |= L2INPUT_FEAT_FWD;
+ }
+ if (flags & L2_FLOOD) {
+ feature_bitmap |= L2INPUT_FEAT_FLOOD;
+ }
+ if (flags & L2_UU_FLOOD) {
+ feature_bitmap |= L2INPUT_FEAT_UU_FLOOD;
+ }
+ if (flags & L2_ARP_TERM) {
+ feature_bitmap |= L2INPUT_FEAT_ARP_TERM;
+ }
+
+ if (enable) {
+ bd_config->feature_bitmap |= feature_bitmap;
+ } else {
+ bd_config->feature_bitmap &= ~feature_bitmap;
+ }
+
+ return 0;
+}
+
+// set bridge-domain learn enable/disable
+// The CLI format is:
+// set bridge-domain learn <bd_id> [disable]
+static clib_error_t *
+bd_learn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u32 enable;
+ uword * p;
+
+ if (! unformat (input, "%d", &bd_id))
+ {
+ error = clib_error_return (0, "expecting bridge-domain id but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+
+ if (p == 0)
+ return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ bd_index = p[0];
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the bridge domain flag
+ if (bd_set_flags(vm, bd_index, L2_LEARN, enable)) {
+ error = clib_error_return (0, "bridge-domain id %d out of range", bd_index);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_learn_cli, static) = {
+ .path = "set bridge-domain learn",
+ .short_help = "set bridge-domain learn <bridge-domain-id> [disable]",
+ .function = bd_learn,
+};
+
+// set bridge-domain forward enable/disable
+// The CLI format is:
+// set bridge-domain forward <bd_id> [disable]
+static clib_error_t *
+bd_fwd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u32 enable;
+ uword * p;
+
+ if (! unformat (input, "%d", &bd_id))
+ {
+ error = clib_error_return (0, "expecting bridge-domain id but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+
+ if (p == 0)
+ return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ bd_index = p[0];
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the bridge domain flag
+ if (bd_set_flags(vm, bd_index, L2_FWD, enable)) {
+ error = clib_error_return (0, "bridge-domain id %d out of range", bd_index);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_fwd_cli, static) = {
+ .path = "set bridge-domain forward",
+ .short_help = "set bridge-domain forward <bridge-domain-id> [disable]",
+ .function = bd_fwd,
+};
+
+// set bridge-domain flood enable/disable
+// The CLI format is:
+// set bridge-domain flood <bd_id> [disable]
+static clib_error_t *
+bd_flood (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u32 enable;
+ uword * p;
+
+ if (! unformat (input, "%d", &bd_id))
+ {
+ error = clib_error_return (0, "expecting bridge-domain id but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+
+ if (p == 0)
+ return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ bd_index = p[0];
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the bridge domain flag
+ if (bd_set_flags(vm, bd_index, L2_FLOOD, enable)) {
+ error = clib_error_return (0, "bridge-domain id %d out of range", bd_index);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_flood_cli, static) = {
+ .path = "set bridge-domain flood",
+ .short_help = "set bridge-domain flood <bridge-domain-id> [disable]",
+ .function = bd_flood,
+};
+
+// set bridge-domain unknown-unicast flood enable/disable
+// The CLI format is:
+// set bridge-domain uu-flood <bd_id> [disable]
+static clib_error_t *
+bd_uu_flood (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u32 enable;
+ uword * p;
+
+ if (! unformat (input, "%d", &bd_id))
+ {
+ error = clib_error_return (0, "expecting bridge-domain id but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+
+ if (p == 0)
+ return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ bd_index = p[0];
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the bridge domain flag
+ if (bd_set_flags(vm, bd_index, L2_UU_FLOOD, enable)) {
+ error = clib_error_return (0, "bridge-domain id %d out of range", bd_index);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_uu_flood_cli, static) = {
+ .path = "set bridge-domain uu-flood",
+ .short_help = "set bridge-domain uu-flood <bridge-domain-id> [disable]",
+ .function = bd_uu_flood,
+};
+
+// set bridge-domain arp term enable/disable
+// The CLI format is:
+// set bridge-domain arp term <bridge-domain-id> [disable]
+static clib_error_t *
+bd_arp_term (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u32 enable;
+ uword * p;
+
+ if (! unformat (input, "%d", &bd_id)) {
+ error = clib_error_return (0, "expecting bridge-domain id but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (p) bd_index = *p;
+ else return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ enable = 1;
+ if (unformat (input, "disable")) enable = 0;
+
+ // set the bridge domain flag
+ if (bd_set_flags(vm, bd_index, L2_ARP_TERM, enable)) {
+ error = clib_error_return (0, "bridge-domain id %d out of range", bd_index);
+ goto done;
+ }
+
+done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_arp_term_cli, static) = {
+ .path = "set bridge-domain arp term",
+ .short_help = "set bridge-domain arp term <bridge-domain-id> [disable]",
+ .function = bd_arp_term,
+};
+
+
+// The clib hash implementation stores uword entries in the hash table.
+// The hash table mac_by_ip4 is keyed by IP4 address and stores the
+// 6-byte MAC address directly in the hash table entry uword.
+// This only works on a 64-bit processor with an 8-byte uword, which
+// means this code *WILL NOT WORK* on a 32-bit processor with a 4-byte uword.
+u32 bd_add_del_ip_mac(u32 bd_index,
+ u8 *ip_addr,
+ u8 *mac_addr,
+ u8 is_ip6,
+ u8 is_add)
+{
+ l2input_main_t * l2im = &l2input_main;
+ l2_bridge_domain_t * bd_cfg = l2input_bd_config_from_index (l2im, bd_index);
+ u64 new_mac = *(u64 *) mac_addr;
+ u64 * old_mac;
+ u16 * mac16 = (u16 *) &new_mac;
+
+ ASSERT (sizeof(uword) == sizeof(u64)); // make sure uword is 8 bytes
+
+  mac16[3] = 0; // Clear the last 2 unused bytes of the 8-byte MAC address
+ if (is_ip6) {
+ // ip6_address_t ip6_addr = *(ip6_address_t *) ip_addr;
+ return 1; // not yet implemented
+ } else {
+ ip4_address_t ip4_addr = *(ip4_address_t *) ip_addr;
+ old_mac = (u64 *) hash_get (bd_cfg->mac_by_ip4, ip4_addr.as_u32);
+ if (is_add) {
+      if (old_mac && (*old_mac == new_mac)) return 0; // MAC entry already exists
+ hash_set (bd_cfg->mac_by_ip4, ip4_addr.as_u32, new_mac);
+ } else {
+      if (old_mac && (*old_mac == new_mac)) { // MAC entry matches
+ hash_unset (bd_cfg->mac_by_ip4, ip4_addr.as_u32); // clear entry
+ } else {
+ return 1;
+ }
+ }
+ return 0;
+ }
+}
+
+// set bridge-domain arp entry add/delete
+// The CLI format is:
+// set bridge-domain arp entry <bd-id> <ip-addr> <mac-addr> [del]
+static clib_error_t *
+bd_arp_entry (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u8 is_add = 1;
+ u8 is_ip6 = 0;
+ u8 ip_addr[16];
+ u8 mac_addr[6];
+ uword * p;
+
+ if (! unformat (input, "%d", &bd_id)) {
+ error = clib_error_return (0, "expecting bridge-domain id but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+
+ if (p) bd_index = *p;
+ else return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ if (unformat (input, "%U", unformat_ip4_address, ip_addr)) {
+ is_ip6 = 0;
+ } else if (unformat (input, "%U", unformat_ip6_address, ip_addr)) {
+ is_ip6 = 1;
+ } else {
+ error = clib_error_return (0, "expecting IP address but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (!unformat(input, "%U", unformat_ethernet_address, mac_addr)) {
+ error = clib_error_return (0, "expecting MAC address but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (unformat (input, "del")) {
+ is_add = 0;
+ }
+
+  // Add or delete the IP-MAC entry in the bridge domain
+ if (bd_add_del_ip_mac(bd_index, ip_addr, mac_addr, is_ip6, is_add)) {
+ error = clib_error_return (0, "MAC %s for IP %U and MAC %U failed",
+ is_add ? "add" : "del",
+ format_ip4_address, ip_addr,
+ format_ethernet_address, mac_addr);
+ }
+
+done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_arp_entry_cli, static) = {
+ .path = "set bridge-domain arp entry",
+ .short_help = "set bridge-domain arp entry <bd-id> <ip-addr> <mac-addr> [del]",
+ .function = bd_arp_entry,
+};
+
+u8* format_vtr(u8 * s, va_list *args)
+{
+ u32 vtr_op = va_arg (*args, u32);
+ u32 dot1q = va_arg (*args, u32);
+ u32 tag1 = va_arg (*args, u32);
+ u32 tag2 = va_arg (*args, u32);
+ switch (vtr_op) {
+ case L2_VTR_DISABLED:
+ return format (s, "none");
+ case L2_VTR_PUSH_1:
+ return format (s, "push-1 %s %d", dot1q? "dot1q":"dot1ad", tag1);
+ case L2_VTR_PUSH_2:
+ return format (s, "push-2 %s %d %d", dot1q? "dot1q":"dot1ad", tag1, tag2);
+ case L2_VTR_POP_1:
+ return format (s, "pop-1");
+ case L2_VTR_POP_2:
+ return format (s, "pop-2");
+ case L2_VTR_TRANSLATE_1_1:
+ return format (s, "trans-1-1 %s %d", dot1q? "dot1q":"dot1ad", tag1);
+ case L2_VTR_TRANSLATE_1_2:
+ return format (s, "trans-1-2 %s %d %d",dot1q? "dot1q":"dot1ad", tag1, tag2);
+ case L2_VTR_TRANSLATE_2_1:
+ return format (s, "trans-2-1 %s %d", dot1q? "dot1q":"dot1ad", tag1);
+ case L2_VTR_TRANSLATE_2_2:
+ return format (s, "trans-2-2 %s %d %d", dot1q? "dot1q":"dot1ad", tag1, tag2);
+ default:
+ return format (s, "none");
+ }
+}
+
+// show bridge-domain state
+// The CLI format is:
+// show bridge-domain [<bd_index>]
+static clib_error_t *
+bd_show (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u32 bd_index = ~0;
+ l2_bridge_domain_t * bd_config;
+ u32 start, end;
+ u32 printed;
+ u32 detail = 0;
+ u32 intf = 0;
+ u32 arp = 0;
+ u32 bd_id = ~0;
+ uword * p;
+
+ start = 0;
+ end = vec_len(l2input_main.bd_configs);
+
+ if (unformat (input, "%d", &bd_id)) {
+ if (unformat (input, "detail")) detail = 1;
+ else if (unformat (input, "det")) detail = 1;
+ if (unformat (input, "int")) intf = 1;
+ if (unformat (input, "arp")) arp = 1;
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (p) bd_index = *p;
+ else return clib_error_return (0, "No such bridge domain %d", bd_id);
+
+ vec_validate (l2input_main.bd_configs, bd_index);
+ bd_config = vec_elt_at_index(l2input_main.bd_configs, bd_index);
+ if (bd_is_valid (bd_config)) {
+ start = bd_index;
+ end = start + 1;
+ } else {
+ vlib_cli_output (vm, "bridge-domain %d not in use", bd_id);
+ goto done;
+ }
+ }
+
+ // Show all bridge-domains that have been initialized
+
+ printed = 0;
+ for (bd_index=start; bd_index<end; bd_index++) {
+ bd_config = vec_elt_at_index(l2input_main.bd_configs, bd_index);
+ if (bd_is_valid(bd_config)) {
+ if (!printed) {
+ printed = 1;
+ vlib_cli_output (vm, "%=5s %=7s %=10s %=10s %=10s %=10s %=10s %=14s",
+ "ID",
+ "Index",
+ "Learning",
+ "U-Forwrd",
+ "UU-Flood",
+ "Flooding",
+ "ARP-Term",
+ "BVI-Intf");
+ }
+
+ vlib_cli_output (
+ vm, "%=5d %=7d %=10s %=10s %=10s %=10s %=10s %=14U",
+ bd_config->bd_id, bd_index,
+ bd_config->feature_bitmap & L2INPUT_FEAT_LEARN ? "on" : "off",
+ bd_config->feature_bitmap & L2INPUT_FEAT_FWD ? "on" : "off",
+ bd_config->feature_bitmap & L2INPUT_FEAT_UU_FLOOD ? "on" : "off",
+ bd_config->feature_bitmap & L2INPUT_FEAT_FLOOD ? "on" : "off",
+ bd_config->feature_bitmap & L2INPUT_FEAT_ARP_TERM ? "on" : "off",
+ format_vnet_sw_if_index_name_with_NA, vnm, bd_config->bvi_sw_if_index);
+
+ if (detail || intf) {
+ // Show all member interfaces
+
+ l2_flood_member_t * member;
+ u32 header = 0;
+
+ vec_foreach(member, bd_config->members) {
+ u32 vtr_opr, dot1q, tag1, tag2;
+ if (!header) {
+ header = 1;
+ vlib_cli_output (vm, "\n%=30s%=7s%=5s%=5s%=30s",
+ "Interface", "Index", "SHG", "BVI","VLAN-Tag-Rewrite");
+ }
+ l2vtr_get(vm, vnm, member->sw_if_index, &vtr_opr, &dot1q, &tag1, &tag2);
+ vlib_cli_output (vm, "%=30U%=7d%=5d%=5s%=30U",
+ format_vnet_sw_if_index_name, vnm, member->sw_if_index,
+ member->sw_if_index,
+ member->shg,
+ member->flags & L2_FLOOD_MEMBER_BVI ? "*" : "-",
+ format_vtr, vtr_opr, dot1q, tag1, tag2);
+ }
+ }
+
+ if ((detail || arp) &&
+ (bd_config->feature_bitmap & L2INPUT_FEAT_ARP_TERM)) {
+ u32 ip4_addr;
+ u64 mac_addr;
+ vlib_cli_output (vm, "\n IP4 to MAC table for ARP Termination");
+ hash_foreach (ip4_addr, mac_addr, bd_config->mac_by_ip4, ({
+ vlib_cli_output (vm, "%=20U => %=20U",
+ format_ip4_address, &ip4_addr,
+ format_ethernet_address, &mac_addr);
+ }));
+ }
+ }
+ }
+
+ if (!printed) {
+ vlib_cli_output (vm, "no bridge-domains in use");
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (bd_show_cli, static) = {
+ .path = "show bridge-domain",
+ .short_help = "show bridge-domain [bridge-domain-id [detail|int|arp]]",
+ .function = bd_show,
+};
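
Hypothetical invocations of the command registered above (the bridge-domain id is illustrative):

    vpp# show bridge-domain
    vpp# show bridge-domain 100 detail
    vpp# show bridge-domain 100 arp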
diff --git a/vnet/vnet/l2/l2_bd.h b/vnet/vnet/l2/l2_bd.h
new file mode 100644
index 00000000000..9d29a83b22f
--- /dev/null
+++ b/vnet/vnet/l2/l2_bd.h
@@ -0,0 +1,120 @@
+/*
+ * l2_bd.h : layer 2 bridge domain
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_l2bd_h
+#define included_l2bd_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+typedef struct {
+ // hash bd_id -> bd_index
+ uword * bd_index_by_bd_id;
+
+ // Busy bd_index bitmap
+ uword * bd_index_bitmap;
+
+ // convenience
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} bd_main_t;
+
+bd_main_t bd_main;
+
+// Bridge domain member
+
+#define L2_FLOOD_MEMBER_NORMAL 0
+#define L2_FLOOD_MEMBER_BVI 1
+
+typedef struct {
+ u32 sw_if_index; // the output L2 interface
+ u8 flags; // 0=normal, 1=bvi
+ u8 shg; // split horizon group number
+ u16 spare;
+} l2_flood_member_t;
+
+
+// Per-bridge domain configuration
+
+typedef struct {
+ u32 feature_bitmap;
+ // Contains bit enables for flooding, learning, and forwarding.
+ // All other feature bits should always be set.
+
+ // identity of the bridge-domain's BVI interface
+ // set to ~0 if there is no BVI
+ u32 bvi_sw_if_index;
+
+ // output node index for bvi interface before it was changed to l2-input
+ u32 saved_bvi_output_node_index;
+
+ // bridge domain id, not to be confused with bd_index
+ u32 bd_id;
+
+ // Vector of members in the replication group
+ l2_flood_member_t * members;
+
+ // hash ip4/ip6 -> mac for arp termination
+ uword *mac_by_ip4;
+ uword *mac_by_ip6;
+
+} l2_bridge_domain_t;
+
+// Return 1 if bridge domain has been initialized
+always_inline u32
+bd_is_valid (l2_bridge_domain_t * bd_config)
+{
+ return (bd_config->feature_bitmap != 0);
+}
+
+// Init bridge domain if not done already
+inline void
+bd_validate (l2_bridge_domain_t * bd_config);
+
+
+void
+bd_add_member (l2_bridge_domain_t * bd_config,
+ l2_flood_member_t * member);
+
+u32
+bd_remove_member (l2_bridge_domain_t * bd_config,
+ u32 sw_if_index);
+
+
+#define L2_LEARN (1<<0)
+#define L2_FWD (1<<1)
+#define L2_FLOOD (1<<2)
+#define L2_UU_FLOOD (1<<3)
+#define L2_ARP_TERM (1<<4)
+
+u32
+bd_set_flags (vlib_main_t * vm,
+ u32 bd_index,
+ u32 flags,
+ u32 enable);
+
+u32 bd_find_or_add_bd_index (bd_main_t * bdm, u32 bd_id);
+int bd_delete_bd_index (bd_main_t * bdm, u32 bd_id);
+
+u32 bd_add_del_ip_mac(u32 bd_index,
+ u8 *ip_addr,
+ u8 *mac_addr,
+ u8 is_ip6,
+ u8 is_add);
+
+#endif
+
diff --git a/vnet/vnet/l2/l2_bvi.c b/vnet/vnet/l2/l2_bvi.c
new file mode 100644
index 00000000000..828e955617b
--- /dev/null
+++ b/vnet/vnet/l2/l2_bvi.c
@@ -0,0 +1,35 @@
+/*
+ * l2_bvi.c : layer 2 Bridged Virtual Interface
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/l2/l2_fwd.h>
+#include <vnet/l2/l2_flood.h>
+#include <vnet/l2/l2_bvi.h>
+
+
+// Call the L2 nodes that need the ethertype mapping
+void
+l2bvi_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index)
+{
+ l2fwd_register_input_type (vm, type, node_index);
+ l2flood_register_input_type (vm, type, node_index);
+}
+
+
diff --git a/vnet/vnet/l2/l2_bvi.h b/vnet/vnet/l2/l2_bvi.h
new file mode 100644
index 00000000000..ca5673373fb
--- /dev/null
+++ b/vnet/vnet/l2/l2_bvi.h
@@ -0,0 +1,122 @@
+/*
+ * l2_bvi.h : layer 2 Bridged Virtual Interface
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_l2bvi_h
+#define included_l2bvi_h
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/sparse_vec.h>
+
+#include <vnet/l2/l2_input.h>
+
+#define TO_BVI_ERR_OK 0
+#define TO_BVI_ERR_TAGGED 1
+#define TO_BVI_ERR_ETHERTYPE 2
+
+// Send a packet from L2 processing to L3 via the BVI interface.
+// Set next0 to the proper L3 input node.
+// Return an error if the packet isn't what we expect.
+
+static_always_inline u32
+l2_to_bvi (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ vlib_buffer_t * b0,
+ u32 bvi_sw_if_index,
+ next_by_ethertype_t * l3_next,
+ u32 * next0)
+{
+ u8 l2_len;
+ u16 ethertype;
+ u8 * l3h;
+
+ // Save L2 header position which may be changed due to packet replication
+ vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data;
+
+ // Strip L2 header
+ l2_len = vnet_buffer(b0)->l2.l2_len;
+ vlib_buffer_advance (b0, l2_len);
+
+ l3h = vlib_buffer_get_current (b0);
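+ // The ethertype is the last 2 bytes of the L2 header, i.e. the 2 bytes
+ // immediately preceding the L3 header we just advanced to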
+ ethertype = clib_net_to_host_u16(*(u16 *)(l3h - 2));
+
+ // Set the input interface to be the BVI interface
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = bvi_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = ~0;
+
+ // Go to appropriate L3 input node
+ if (ethertype == ETHERNET_TYPE_IP4) {
+ *next0 = l3_next->input_next_ip4;
+ } else if (ethertype == ETHERNET_TYPE_IP6) {
+ *next0 = l3_next->input_next_ip6;
+ } else {
+ // uncommon ethertype, check table
+ u32 i0;
+
+ i0 = sparse_vec_index (l3_next->input_next_by_type, ethertype);
+ *next0 = vec_elt (l3_next->input_next_by_type, i0);
+
+ if (i0 == SPARSE_VEC_INVALID_INDEX) {
+ return TO_BVI_ERR_ETHERTYPE;
+ }
+ }
+
+ // increment BVI RX interface stat
+ vlib_increment_combined_counter
+ (vnet_main->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ vlib_main->cpu_index,
+ vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ 1,
+ vlib_buffer_length_in_chain (vlib_main, b0));
+ return TO_BVI_ERR_OK;
+}
+
+
+// Prepare a packet that was sent to the BVI interface for L2 processing.
+
+static_always_inline void
+bvi_to_l2 (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 cpu_index,
+ vlib_buffer_t * b0,
+ u32 bvi_sw_if_index)
+{
+ // Set the input interface to be the BVI interface
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = bvi_sw_if_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = ~0;
+
+ // Update l2_len in packet which is expected by l2 path,
+ // including l2 tag push/pop code on output
+ vnet_update_l2_len(b0);
+
+ // increment BVI TX interface stat
+ vlib_increment_combined_counter
+ (vnet_main->interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_TX,
+ cpu_index,
+ bvi_sw_if_index,
+ 1,
+ vlib_buffer_length_in_chain (vlib_main, b0));
+}
+
+
+void
+l2bvi_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index);
+#endif
diff --git a/vnet/vnet/l2/l2_classify.c b/vnet/vnet/l2/l2_classify.c
new file mode 100644
index 00000000000..a6c8ebbc1b4
--- /dev/null
+++ b/vnet/vnet/l2/l2_classify.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * l2_classify.c
+ */
+
+#include <vnet/l2/l2_classify.h>
+#include <vnet/api_errno.h>
+
+typedef struct {
+ /* per-pkt trace data */
+ u32 sw_if_index;
+ u32 next_index;
+ u32 table_index;
+ u32 session_offset;
+} l2_classify_trace_t;
+
+typedef struct {
+ vnet_classify_main_t * vcm;
+ l2_classify_main_t * l2cm;
+} l2_classify_runtime_t;
+
+/* packet trace format function */
+static u8 * format_l2_classify_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_classify_trace_t * t = va_arg (*args, l2_classify_trace_t *);
+
+ s = format (s, "l2-classify: sw_if_index %d, table %d, offset %x, next %d",
+ t->sw_if_index, t->table_index, t->session_offset, t->next_index);
+ return s;
+}
+
+l2_classify_main_t l2_classify_main;
+
+vlib_node_registration_t l2_classify_node;
+
+#define foreach_l2_classify_error \
+_(MISS, "Classify misses") \
+_(HIT, "Classify hits") \
+_(CHAIN_HIT, "Classify hits after chain walk") \
+_(DROP, "L2 Classify Drops")
+
+typedef enum {
+#define _(sym,str) L2_CLASSIFY_ERROR_##sym,
+ foreach_l2_classify_error
+#undef _
+ L2_CLASSIFY_N_ERROR,
+} l2_classify_error_t;
+
+static char * l2_classify_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_classify_error
+#undef _
+};
+
+static uword
+l2_classify_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2_classify_next_t next_index;
+ l2_classify_main_t * cm = &l2_classify_main;
+ vnet_classify_main_t * vcm = cm->vnet_classify_main;
+ l2_classify_runtime_t * rt = (l2_classify_runtime_t *)node->runtime_data;
+ u32 feature_bitmap;
+ u32 hits = 0;
+ u32 misses = 0;
+ u32 chain_hits = 0;
+ f64 now;
+
+ now = vlib_time_now(vm);
+
+ n_left_from = frame->n_vectors;
+ from = vlib_frame_vector_args (frame);
+
+ /* First pass: compute hash */
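+ /*
+ * The node runs in two passes over the frame: this first pass computes
+ * the classifier hash for each packet and prefetches the corresponding
+ * hash bucket; the second pass below performs the table lookups,
+ * prefetching entries two packets ahead to hide memory latency.
+ */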
+
+ while (n_left_from > 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ u32 bi0, bi1;
+ ethernet_header_t * h0, * h1;
+ u32 sw_if_index0, sw_if_index1;
+ u16 type0, type1;
+ int type_index0, type_index1;
+ vnet_classify_table_t * t0, * t1;
+ u32 table_index0, table_index1;
+ u64 hash0, hash1;
+
+
+ /* prefetch next iteration */
+ {
+ vlib_buffer_t * p1, * p2;
+
+ p1 = vlib_get_buffer (vm, from[1]);
+ p2 = vlib_get_buffer (vm, from[2]);
+
+ vlib_prefetch_buffer_header (p1, STORE);
+ CLIB_PREFETCH (p1->data, CLIB_CACHE_LINE_BYTES, STORE);
+ vlib_prefetch_buffer_header (p2, STORE);
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ bi1 = from[1];
+ b1 = vlib_get_buffer (vm, bi1);
+ h1 = vlib_buffer_get_current (b1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ vnet_buffer(b0)->l2_classify.table_index = ~0;
+
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ vnet_buffer(b1)->l2_classify.table_index = ~0;
+
+ /* Select classifier table based on ethertype */
+ type0 = clib_net_to_host_u16 (h0->type);
+ type1 = clib_net_to_host_u16 (h1->type);
+
+ type_index0 = (type0 == ETHERNET_TYPE_IP4)
+ ? L2_CLASSIFY_TABLE_IP4 : L2_CLASSIFY_TABLE_OTHER;
+ type_index0 = (type0 == ETHERNET_TYPE_IP6)
+ ? L2_CLASSIFY_TABLE_IP6 : type_index0;
+
+ type_index1 = (type1 == ETHERNET_TYPE_IP4)
+ ? L2_CLASSIFY_TABLE_IP4 : L2_CLASSIFY_TABLE_OTHER;
+ type_index1 = (type1 == ETHERNET_TYPE_IP6)
+ ? L2_CLASSIFY_TABLE_IP6 : type_index1;
+
+ vnet_buffer(b0)->l2_classify.table_index =
+ table_index0 =
+ rt->l2cm->classify_table_index_by_sw_if_index
+ [type_index0][sw_if_index0];
+
+ if (table_index0 != ~0)
+ {
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ vnet_buffer(b0)->l2_classify.hash = hash0 =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+ vnet_classify_prefetch_bucket (t0, hash0);
+ }
+
+ vnet_buffer(b1)->l2_classify.table_index =
+ table_index1 =
+ rt->l2cm->classify_table_index_by_sw_if_index
+ [type_index1][sw_if_index1];
+
+ if (table_index1 != ~0)
+ {
+ t1 = pool_elt_at_index (vcm->tables, table_index1);
+
+ vnet_buffer(b1)->l2_classify.hash = hash1 =
+ vnet_classify_hash_packet (t1, (u8 *) h1);
+ vnet_classify_prefetch_bucket (t1, hash1);
+ }
+
+ from += 2;
+ n_left_from -= 2;
+ }
+
+ while (n_left_from > 0)
+ {
+ vlib_buffer_t * b0;
+ u32 bi0;
+ ethernet_header_t * h0;
+ u32 sw_if_index0;
+ u16 type0;
+ u32 type_index0;
+ vnet_classify_table_t * t0;
+ u32 table_index0;
+ u64 hash0;
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ vnet_buffer(b0)->l2_classify.table_index = ~0;
+
+ /* Select classifier table based on ethertype */
+ type0 = clib_net_to_host_u16 (h0->type);
+
+ type_index0 = (type0 == ETHERNET_TYPE_IP4)
+ ? L2_CLASSIFY_TABLE_IP4 : L2_CLASSIFY_TABLE_OTHER;
+ type_index0 = (type0 == ETHERNET_TYPE_IP6)
+ ? L2_CLASSIFY_TABLE_IP6 : type_index0;
+
+ vnet_buffer(b0)->l2_classify.table_index =
+ table_index0 = rt->l2cm->classify_table_index_by_sw_if_index
+ [type_index0][sw_if_index0];
+
+ if (table_index0 != ~0)
+ {
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ vnet_buffer(b0)->l2_classify.hash = hash0 =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+ vnet_classify_prefetch_bucket (t0, hash0);
+ }
+ from++;
+ n_left_from--;
+ }
+
+ next_index = node->cached_next_index;
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ /* Not enough load/store slots to dual loop... */
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = L2_CLASSIFY_NEXT_ETHERNET_INPUT;
+ ethernet_header_t * h0;
+ u32 table_index0;
+ u64 hash0;
+ vnet_classify_table_t * t0;
+ vnet_classify_entry_t * e0;
+
+ if (PREDICT_TRUE (n_left_from > 2))
+ {
+ vlib_buffer_t * p2 = vlib_get_buffer(vm, from[2]);
+ u64 phash2;
+ u32 table_index2;
+ vnet_classify_table_t * tp2;
+
+ /*
+ * Prefetch table entry two ahead. Buffer / data
+ * were prefetched above...
+ */
+ table_index2 = vnet_buffer(p2)->l2_classify.table_index;
+
+ if (PREDICT_TRUE (table_index2 != ~0))
+ {
+ tp2 = pool_elt_at_index (vcm->tables, table_index2);
+ phash2 = vnet_buffer(p2)->l2_classify.hash;
+ vnet_classify_prefetch_entry (tp2, phash2);
+ }
+ }
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current(b0);
+ table_index0 = vnet_buffer(b0)->l2_classify.table_index;
+ e0 = 0;
+
+ if (PREDICT_TRUE(table_index0 != ~0))
+ {
+ hash0 = vnet_buffer(b0)->l2_classify.hash;
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ e0 = vnet_classify_find_entry (t0, (u8 *) h0,
+ hash0, now);
+ if (e0)
+ {
+ vnet_buffer(b0)->l2_classify.opaque_index
+ = e0->opaque_index;
+ vlib_buffer_advance (b0, e0->advance);
+ next0 = (e0->next_index < L2_CLASSIFY_N_NEXT)?
+ e0->next_index:next0;
+ hits++;
+ }
+ else
+ {
+ while (1)
+ {
+ if (t0->next_table_index != ~0)
+ t0 = pool_elt_at_index (vcm->tables,
+ t0->next_table_index);
+ else
+ {
+ next0 = (t0->miss_next_index < L2_CLASSIFY_N_NEXT)?
+ t0->miss_next_index:next0;
+ misses++;
+ break;
+ }
+
+ hash0 = vnet_classify_hash_packet (t0, (u8 *) h0);
+ e0 = vnet_classify_find_entry (t0, (u8 *) h0, hash0, now);
+ if (e0)
+ {
+ vnet_buffer(b0)->l2_classify.opaque_index
+ = e0->opaque_index;
+ vlib_buffer_advance (b0, e0->advance);
+ next0 = (e0->next_index < L2_CLASSIFY_N_NEXT)?
+ e0->next_index:next0;
+ hits++;
+ chain_hits++;
+ break;
+ }
+ }
+ }
+ }
+
+ if (PREDICT_FALSE(next0 == 0))
+ b0->error = node->errors[L2_CLASSIFY_ERROR_DROP];
+
+ if (PREDICT_FALSE (next0 == ~0))
+ {
+
+ // Remove ourself from the feature bitmap
+ feature_bitmap = vnet_buffer(b0)->l2.feature_bitmap
+ & ~L2INPUT_FEAT_CLASSIFY;
+
+ // save for next feature graph nodes
+ vnet_buffer(b0)->l2.feature_bitmap = feature_bitmap;
+
+ // Determine the next node
+ next0 = feat_bitmap_get_next_node_index(cm->feat_next_node_index,
+ feature_bitmap);
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ l2_classify_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->table_index = table_index0;
+ t->next_index = next0;
+ t->session_offset = e0 ? vnet_classify_get_offset (t0, e0) : 0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, node->node_index,
+ L2_CLASSIFY_ERROR_MISS,
+ misses);
+ vlib_node_increment_counter (vm, node->node_index,
+ L2_CLASSIFY_ERROR_HIT,
+ hits);
+ vlib_node_increment_counter (vm, node->node_index,
+ L2_CLASSIFY_ERROR_CHAIN_HIT,
+ chain_hits);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (l2_classify_node) = {
+ .function = l2_classify_node_fn,
+ .name = "l2-classify",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_classify_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_classify_error_strings),
+ .error_strings = l2_classify_error_strings,
+
+ .runtime_data_bytes = sizeof (l2_classify_runtime_t),
+
+ .n_next_nodes = L2_CLASSIFY_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2_CLASSIFY_NEXT_DROP] = "error-drop",
+ [L2_CLASSIFY_NEXT_ETHERNET_INPUT] = "ethernet-input-not-l2",
+ [L2_CLASSIFY_NEXT_IP4_INPUT] = "ip4-input",
+ [L2_CLASSIFY_NEXT_IP6_INPUT] = "ip6-input",
+ [L2_CLASSIFY_NEXT_LI] = "li-hit",
+ },
+};
+
+clib_error_t *l2_classify_init (vlib_main_t *vm)
+{
+ l2_classify_main_t * cm = &l2_classify_main;
+ l2_classify_runtime_t * rt;
+
+ rt = vlib_node_get_runtime_data (vm, l2_classify_node.index);
+
+ cm->vlib_main = vm;
+ cm->vnet_main = vnet_get_main();
+ cm->vnet_classify_main = &vnet_classify_main;
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2_classify_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ cm->feat_next_node_index);
+ rt->l2cm = cm;
+ rt->vcm = cm->vnet_classify_main;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_classify_init);
+
+
+void vnet_l2_classify_enable_disable (u32 sw_if_index,
+ int enable_disable)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vnet_main_t * vnm = vnet_get_main();
+
+ if (enable_disable)
+ set_int_l2_mode (vm, vnm, MODE_L2_CLASSIFY, sw_if_index,
+ 0, 0, 0, 0);
+ else
+ set_int_l2_mode (vm, vnm, MODE_L3, sw_if_index,
+ 0, 0, 0, 0);
+}
+
+int vnet_l2_classify_set_tables (u32 sw_if_index,
+ u32 ip4_table_index,
+ u32 ip6_table_index,
+ u32 other_table_index)
+{
+ l2_classify_main_t * cm = &l2_classify_main;
+ vnet_classify_main_t * vcm = cm->vnet_classify_main;
+
+ /* Assume that we've validated sw_if_index in the API layer */
+
+ if (ip4_table_index != ~0 &&
+ pool_is_free_index (vcm->tables, ip4_table_index))
+ return VNET_API_ERROR_NO_SUCH_TABLE;
+
+ if (ip6_table_index != ~0 &&
+ pool_is_free_index (vcm->tables, ip6_table_index))
+ return VNET_API_ERROR_NO_SUCH_TABLE2;
+
+ if (other_table_index != ~0 &&
+ pool_is_free_index (vcm->tables, other_table_index))
+ return VNET_API_ERROR_NO_SUCH_TABLE3;
+
+ vec_validate
+ (cm->classify_table_index_by_sw_if_index[L2_CLASSIFY_TABLE_IP4],
+ sw_if_index);
+
+ vec_validate
+ (cm->classify_table_index_by_sw_if_index[L2_CLASSIFY_TABLE_IP6],
+ sw_if_index);
+
+ vec_validate
+ (cm->classify_table_index_by_sw_if_index[L2_CLASSIFY_TABLE_OTHER],
+ sw_if_index);
+
+ cm->classify_table_index_by_sw_if_index[L2_CLASSIFY_TABLE_IP4]
+ [sw_if_index] = ip4_table_index;
+
+ cm->classify_table_index_by_sw_if_index[L2_CLASSIFY_TABLE_IP6]
+ [sw_if_index] = ip6_table_index;
+
+ cm->classify_table_index_by_sw_if_index[L2_CLASSIFY_TABLE_OTHER]
+ [sw_if_index] = other_table_index;
+
+ return 0;
+}
+
+static clib_error_t *
+int_l2_classify_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 sw_if_index = ~0;
+ u32 ip4_table_index = ~0;
+ u32 ip6_table_index = ~0;
+ u32 other_table_index = ~0;
+ int rv;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "intfc %U", unformat_vnet_sw_interface,
+ vnm, &sw_if_index))
+ ;
+ else if (unformat (input, "ip4-table %d", &ip4_table_index))
+ ;
+ else if (unformat (input, "ip6-table %d", &ip6_table_index))
+ ;
+ else if (unformat (input, "other-table %d", &other_table_index))
+ ;
+ else
+ break;
+ }
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "interface must be specified");
+
+
+ if (ip4_table_index == ~0 && ip6_table_index == ~0
+ && other_table_index == ~0)
+ {
+ vlib_cli_output (vm, "L2 classification disabled");
+ vnet_l2_classify_enable_disable (sw_if_index, 0 /* enable */);
+ return 0;
+ }
+
+ rv = vnet_l2_classify_set_tables (sw_if_index, ip4_table_index,
+ ip6_table_index, other_table_index);
+ switch(rv)
+ {
+ case 0:
+ vnet_l2_classify_enable_disable (sw_if_index, 1 /* enable */);
+ break;
+
+ default:
+ return clib_error_return (0, "vnet_l2_classify_set_tables: %d",
+ rv);
+ break;
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (int_l2_classify_cli, static) = {
+ .path = "set interface l2 classify",
+ .short_help =
+ "set interface l2 classify intfc <int> [ip4-table <n>]\n"
+ " [ip6-table <n>] [other-table <n>]",
+ .function = int_l2_classify_command_fn,
+};
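
A hypothetical invocation, assuming classifier table 0 was created beforehand with a suitable mask (the interface name is illustrative); as the command function shows, omitting all table arguments disables L2 classification on the interface:

    vpp# set interface l2 classify intfc GigabitEthernet2/0/0 ip4-table 0
    vpp# set interface l2 classify intfc GigabitEthernet2/0/0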
+
+
diff --git a/vnet/vnet/l2/l2_classify.h b/vnet/vnet/l2/l2_classify.h
new file mode 100644
index 00000000000..55c2fc8b00d
--- /dev/null
+++ b/vnet/vnet/l2/l2_classify.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_vnet_l2_classify_h__
+#define __included_vnet_l2_classify_h__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+
+#include <vnet/classify/vnet_classify.h>
+
+typedef enum {
+ L2_CLASSIFY_NEXT_DROP,
+ L2_CLASSIFY_NEXT_ETHERNET_INPUT,
+ L2_CLASSIFY_NEXT_IP4_INPUT,
+ L2_CLASSIFY_NEXT_IP6_INPUT,
+ L2_CLASSIFY_NEXT_LI,
+ L2_CLASSIFY_N_NEXT,
+} l2_classify_next_t;
+
+typedef enum {
+ L2_CLASSIFY_TABLE_IP4,
+ L2_CLASSIFY_TABLE_IP6,
+ L2_CLASSIFY_TABLE_OTHER,
+ L2_CLASSIFY_N_TABLES,
+} l2_classify_table_id_t;
+
+typedef struct {
+
+ // Next nodes for each feature
+ u32 feat_next_node_index[32];
+
+ /* Per-address-family classifier table vectors */
+ u32 * classify_table_index_by_sw_if_index [L2_CLASSIFY_N_TABLES];
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+ vnet_classify_main_t * vnet_classify_main;
+} l2_classify_main_t;
+
+l2_classify_main_t l2_classify_main;
+
+vlib_node_registration_t l2_classify_node;
+
+void vnet_l2_classify_enable_disable (u32 sw_if_index,
+ int enable_disable);
+
+int vnet_l2_classify_set_tables (u32 sw_if_index, u32 ip4_table_index,
+ u32 ip6_table_index, u32 other_table_index);
+
+#endif /* __included_vnet_l2_classify_h__ */
diff --git a/vnet/vnet/l2/l2_efp_filter.c b/vnet/vnet/l2/l2_efp_filter.c
new file mode 100644
index 00000000000..a8bceca13fe
--- /dev/null
+++ b/vnet/vnet/l2/l2_efp_filter.c
@@ -0,0 +1,572 @@
+/*
+ * l2_efp_filter.c : layer 2 egress EFP Filter processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_output.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/cache.h>
+
+typedef struct {
+
+ // Next nodes for features and output interfaces
+ l2_output_next_nodes_st next_nodes;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2_efp_filter_main_t;
+
+
+typedef struct {
+ /* per-pkt trace data */
+ u8 src[6];
+ u8 dst[6];
+ u8 raw[12]; // raw data (vlans)
+ u32 sw_if_index;
+} l2_efp_filter_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2_efp_filter_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_efp_filter_trace_t * t = va_arg (*args, l2_efp_filter_trace_t *);
+
+ s = format (s, "l2-output-vtr: sw_if_index %d dst %U src %U data "
+ "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src,
+ t->raw[0], t->raw[1], t->raw[2], t->raw[3], t->raw[4], t->raw[5],
+ t->raw[6], t->raw[7], t->raw[8], t->raw[9], t->raw[10], t->raw[11]);
+ return s;
+}
+
+l2_efp_filter_main_t l2_efp_filter_main;
+
+static vlib_node_registration_t l2_efp_filter_node;
+
+#define foreach_l2_efp_filter_error \
+_(L2_EFP_FILTER, "L2 EFP filter packets") \
+_(DROP, "L2 EFP filter post-rewrite drops")
+
+typedef enum {
+#define _(sym,str) L2_EFP_FILTER_ERROR_##sym,
+ foreach_l2_efp_filter_error
+#undef _
+ L2_EFP_FILTER_N_ERROR,
+} l2_efp_filter_error_t;
+
+static char * l2_efp_filter_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_efp_filter_error
+#undef _
+};
+
+typedef enum {
+ L2_EFP_FILTER_NEXT_DROP,
+ L2_EFP_FILTER_N_NEXT,
+} l2_efp_filter_next_t;
+
+
+// Extract fields from the packet that will be used in interface classification
+static_always_inline void
+extract_keys (vnet_main_t * vnet_main,
+ u32 sw_if_index0,
+ vlib_buffer_t * b0,
+ u32 * port_sw_if_index0,
+ u16 * first_ethertype0,
+ u16 * outer_id0,
+ u16 * inner_id0,
+ u32 * match_flags0)
+{
+ ethernet_header_t * e0;
+ ethernet_vlan_header_t * h0;
+ u32 tag_len;
+ u32 tag_num;
+
+ *port_sw_if_index0 = vnet_get_sup_sw_interface (vnet_main, sw_if_index0)->sw_if_index;
+
+ e0 = vlib_buffer_get_current (b0);
+ h0 = (ethernet_vlan_header_t *)(e0+1);
+
+ *first_ethertype0 = clib_net_to_host_u16(e0->type);
+ *outer_id0 = clib_net_to_host_u16 (h0[0].priority_cfi_and_id);
+ *inner_id0 = clib_net_to_host_u16 (h0[1].priority_cfi_and_id);
+
+ tag_len = vnet_buffer(b0)->l2.l2_len - sizeof(ethernet_header_t);
+ tag_num = tag_len / sizeof(ethernet_vlan_header_t);
+ *match_flags0 = eth_create_valid_subint_match_flags (tag_num);
+}
+
+/*
+ * EFP filtering is a basic switch feature which prevents an interface from
+ * transmitting a packet that doesn't match the interface's ingress match
+ * criteria. The check has two parts, one performed before egress vlan tag
+ * rewrite and one after.
+ *
+ * The pre-rewrite check ensures the packet matches what an ingress packet looks
+ * like after going through the interface's ingress tag rewrite operation. Only
+ * pushed tags are compared. So:
+ * - if the ingress vlan tag rewrite pushes no tags (or is not enabled),
+ * any packet passes the filter
+ * - if the ingress vlan tag rewrite pushes one tag,
+ * the packet must have at least one tag, and the outer tag must match the pushed tag
+ * - if the ingress vlan tag rewrite pushes two tags,
+ * the packet must have at least two tags, and the outer two tags must match the pushed tags
+ *
+ * The pre-rewrite check is performed in the l2-output node.
+ *
+ * The post-rewrite check ensures the packet matches what an ingress packet looks
+ * like before going through the interface's ingress tag rewrite operation. It verifies
+ * that such a packet arriving on the wire at this port would be classified as arriving
+ * on an input interface equal to the packet's output interface. This can be done by running
+ * the output packet's vlan tags and output port through the interface classification,
+ * and checking if the resulting interface matches the output interface.
+ *
+ * The post-rewrite check is performed here.
+ */
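
As a concrete illustration of the two checks (the tag values here are illustrative, not from this patch): suppose a subinterface's ingress tag rewrite pushes one dot1q tag 100. The pre-rewrite check in l2-output requires the outgoing packet to carry outer tag 100. After egress tag rewrite, the post-rewrite check here re-runs interface classification on the packet's port and remaining tags; if the result is the packet's own output subinterface, the packet passes, otherwise (say a stray tag 200 was left on the wire format) it is dropped.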
+
+static uword
+l2_efp_filter_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2_efp_filter_next_t next_index;
+ l2_efp_filter_main_t * msm = &l2_efp_filter_main;
+ vlib_node_t *n = vlib_get_node (vm, l2_efp_filter_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ u32 cached_sw_if_index = ~0;
+ u32 cached_next_index = ~0;
+
+ /* invalidate cache to begin with */
+ cached_sw_if_index = ~0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 6 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ u32 feature_bitmap0, feature_bitmap1;
+ u16 first_ethertype0, first_ethertype1;
+ u16 outer_id0, inner_id0, outer_id1, inner_id1;
+ u32 match_flags0, match_flags1;
+ u32 port_sw_if_index0, subint_sw_if_index0, port_sw_if_index1, subint_sw_if_index1;
+ vnet_hw_interface_t * hi0, * hi1;
+ main_intf_t * main_intf0, * main_intf1;
+ vlan_intf_t * vlan_intf0, * vlan_intf1;
+ qinq_intf_t * qinq_intf0, * qinq_intf1;
+ u32 is_l20, is_l21;
+ __attribute__((unused)) u32 matched0, matched1;
+ u8 error0, error1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3, * p4, * p5;
+ __attribute__((unused)) u32 sw_if_index2, sw_if_index3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+ p4 = vlib_get_buffer (vm, from[4]);
+ p5 = vlib_get_buffer (vm, from[5]);
+
+ // Prefetch the buffer header and packet for the N+2 loop iteration
+ vlib_prefetch_buffer_header (p4, LOAD);
+ vlib_prefetch_buffer_header (p5, LOAD);
+
+ CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE);
+
+ // Prefetch the input config for the N+1 loop iteration
+ // This depends on the buffer header above
+ sw_if_index2 = vnet_buffer(p2)->sw_if_index[VLIB_TX];
+ sw_if_index3 = vnet_buffer(p3)->sw_if_index[VLIB_TX];
+ //TODO CLIB_PREFETCH (vec_elt_at_index(l2output_main.configs, sw_if_index2), CLIB_CACHE_LINE_BYTES, LOAD);
+ //TODO CLIB_PREFETCH (vec_elt_at_index(l2output_main.configs, sw_if_index3), CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* TX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_TX];
+
+ // process 2 packets
+ em->counters[node_counter_base_index + L2_EFP_FILTER_ERROR_L2_EFP_FILTER] += 2;
+
+ // Remove ourself from the feature bitmap
+ feature_bitmap0 = vnet_buffer(b0)->l2.feature_bitmap & ~L2OUTPUT_FEAT_EFP_FILTER;
+ feature_bitmap1 = vnet_buffer(b1)->l2.feature_bitmap & ~L2OUTPUT_FEAT_EFP_FILTER;
+
+ // Determine next node
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2_efp_filter_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b0,
+ sw_if_index0,
+ feature_bitmap0,
+ &next0);
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2_efp_filter_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b1,
+ sw_if_index1,
+ feature_bitmap1,
+ &next1);
+
+ // perform the efp filter check on two packets
+
+ extract_keys (msm->vnet_main,
+ sw_if_index0,
+ b0,
+ &port_sw_if_index0,
+ &first_ethertype0,
+ &outer_id0,
+ &inner_id0,
+ &match_flags0);
+
+ extract_keys (msm->vnet_main,
+ sw_if_index1,
+ b1,
+ &port_sw_if_index1,
+ &first_ethertype1,
+ &outer_id1,
+ &inner_id1,
+ &match_flags1);
+
+ eth_vlan_table_lookups (&ethernet_main,
+ msm->vnet_main,
+ port_sw_if_index0,
+ first_ethertype0,
+ outer_id0,
+ inner_id0,
+ &hi0,
+ &main_intf0,
+ &vlan_intf0,
+ &qinq_intf0);
+
+ eth_vlan_table_lookups (&ethernet_main,
+ msm->vnet_main,
+ port_sw_if_index1,
+ first_ethertype1,
+ outer_id1,
+ inner_id1,
+ &hi1,
+ &main_intf1,
+ &vlan_intf1,
+ &qinq_intf1);
+
+ matched0 = eth_identify_subint (hi0,
+ b0,
+ match_flags0,
+ main_intf0,
+ vlan_intf0,
+ qinq_intf0,
+ &subint_sw_if_index0,
+ &error0,
+ &is_l20);
+
+ matched1 = eth_identify_subint (hi1,
+ b1,
+ match_flags1,
+ main_intf1,
+ vlan_intf1,
+ qinq_intf1,
+ &subint_sw_if_index1,
+ &error1,
+ &is_l21);
+
+ if (PREDICT_FALSE (sw_if_index0 != subint_sw_if_index0)) {
+ // Drop packet
+ next0 = L2_EFP_FILTER_NEXT_DROP;
+ b0->error = node->errors[L2_EFP_FILTER_ERROR_DROP];
+ }
+
+ if (PREDICT_FALSE (sw_if_index1 != subint_sw_if_index1)) {
+ // Drop packet
+ next1 = L2_EFP_FILTER_NEXT_DROP;
+ b1->error = node->errors[L2_EFP_FILTER_ERROR_DROP];
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED) {
+ ethernet_header_t * h0 = vlib_buffer_get_current (b0);
+ l2_efp_filter_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ memcpy(t->raw, &h0->type, sizeof(t->raw));
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED) {
+ ethernet_header_t * h1 = vlib_buffer_get_current (b1);
+ l2_efp_filter_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ memcpy(t->raw, &h1->type, sizeof(t->raw));
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ u32 feature_bitmap0;
+ u16 first_ethertype0;
+ u16 outer_id0, inner_id0;
+ u32 match_flags0;
+ u32 port_sw_if_index0, subint_sw_if_index0;
+ vnet_hw_interface_t * hi0;
+ main_intf_t * main_intf0;
+ vlan_intf_t * vlan_intf0;
+ qinq_intf_t * qinq_intf0;
+ u32 is_l20;
+ __attribute__((unused)) u32 matched0;
+ u8 error0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+
+ // process 1 packet
+ em->counters[node_counter_base_index + L2_EFP_FILTER_ERROR_L2_EFP_FILTER] += 1;
+
+ // Remove ourself from the feature bitmap
+ feature_bitmap0 = vnet_buffer(b0)->l2.feature_bitmap & ~L2OUTPUT_FEAT_EFP_FILTER;
+
+ // Determine next node
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2_efp_filter_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b0,
+ sw_if_index0,
+ feature_bitmap0,
+ &next0);
+
+ // perform the efp filter check on one packet
+
+ extract_keys (msm->vnet_main,
+ sw_if_index0,
+ b0,
+ &port_sw_if_index0,
+ &first_ethertype0,
+ &outer_id0,
+ &inner_id0,
+ &match_flags0);
+
+ eth_vlan_table_lookups (&ethernet_main,
+ msm->vnet_main,
+ port_sw_if_index0,
+ first_ethertype0,
+ outer_id0,
+ inner_id0,
+ &hi0,
+ &main_intf0,
+ &vlan_intf0,
+ &qinq_intf0);
+
+ matched0 = eth_identify_subint (hi0,
+ b0,
+ match_flags0,
+ main_intf0,
+ vlan_intf0,
+ qinq_intf0,
+ &subint_sw_if_index0,
+ &error0,
+ &is_l20);
+
+ if (PREDICT_FALSE (sw_if_index0 != subint_sw_if_index0)) {
+ // Drop packet
+ next0 = L2_EFP_FILTER_NEXT_DROP;
+ b0->error = node->errors[L2_EFP_FILTER_ERROR_DROP];
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ ethernet_header_t * h0 = vlib_buffer_get_current (b0);
+ l2_efp_filter_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ memcpy(t->raw, &h0->type, sizeof(t->raw));
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2_efp_filter_node,static) = {
+ .function = l2_efp_filter_node_fn,
+ .name = "l2-efp-filter",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_efp_filter_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_efp_filter_error_strings),
+ .error_strings = l2_efp_filter_error_strings,
+
+ .n_next_nodes = L2_EFP_FILTER_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2_EFP_FILTER_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2_efp_filter_init (vlib_main_t *vm)
+{
+ l2_efp_filter_main_t * mp = &l2_efp_filter_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2_efp_filter_node.index,
+ L2OUTPUT_N_FEAT,
+ l2output_get_feat_names(),
+ mp->next_nodes.feat_next_node_index);
+
+ // Initialize the output node mapping table
+ l2output_init_output_node_vec(&mp->next_nodes.output_node_index_vec);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_efp_filter_init);
+
+
+// Enable/disable the EFP Filter check on the subinterface
+void l2_efp_filter_configure (vnet_main_t * vnet_main,
+ u32 sw_if_index,
+ u32 enable)
+{
+ // set the interface flag
+ l2output_intf_bitmap_enable(sw_if_index, L2OUTPUT_FEAT_EFP_FILTER, enable);
+}
+
+
+// set subinterface egress efp filter enable/disable
+// The CLI format is:
+// set interface l2 efp-filter <interface> [disable]
+static clib_error_t *
+int_l2_efp_filter (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 enable;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // enable/disable the feature
+ l2_efp_filter_configure (vnm, sw_if_index, enable);
+
+ done:
+ return error;
+}
+
+
+VLIB_CLI_COMMAND (int_l2_efp_filter_cli, static) = {
+ .path = "set interface l2 efp-filter",
+ .short_help = "set interface l2 efp-filter <interface> [disable]",
+ .function = int_l2_efp_filter,
+};
+
diff --git a/vnet/vnet/l2/l2_efp_filter.h b/vnet/vnet/l2/l2_efp_filter.h
new file mode 100644
index 00000000000..f8baf092fa8
--- /dev/null
+++ b/vnet/vnet/l2/l2_efp_filter.h
@@ -0,0 +1,28 @@
+/*
+ * l2_efp_filter.h : layer 2 egress EFP Filter processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef included_vnet_l2_efp_filter_h
+#define included_vnet_l2_efp_filter_h
+
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+
+#endif
+
diff --git a/vnet/vnet/l2/l2_fib.c b/vnet/vnet/l2/l2_fib.c
new file mode 100644
index 00000000000..198ffd281bb
--- /dev/null
+++ b/vnet/vnet/l2/l2_fib.c
@@ -0,0 +1,567 @@
+/*
+ * l2_fib.c : layer 2 forwarding table (aka mac table)
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vlib/cli.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/l2/l2_fib.h>
+#include <vnet/l2/l2_learn.h>
+#include <vnet/l2/l2_bd.h>
+
+#include <vppinfra/bihash_template.c>
+
+typedef struct {
+
+ /* hash table */
+ BVT(clib_bihash) mac_table;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2fib_main_t;
+
+l2fib_main_t l2fib_main;
+
+
+// Format sw_if_index. If the value is ~0, use the text "N/A"
+u8 * format_vnet_sw_if_index_name_with_NA (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ u32 sw_if_index = va_arg (*args, u32);
+ if (sw_if_index == ~0)
+ return format (s, "N/A");
+ else
+ return format (s, "%U",
+ format_vnet_sw_interface_name, vnm,
+ vnet_get_sw_interface (vnm, sw_if_index));
+}
+
+void l2fib_table_dump (u32 bd_index, l2fib_entry_key_t **l2fe_key,
+ l2fib_entry_result_t **l2fe_res)
+{
+ l2fib_main_t * msm = &l2fib_main;
+ BVT(clib_bihash) * h = &msm->mac_table;
+ clib_bihash_bucket_t * b;
+ BVT(clib_bihash_value) * v;
+ l2fib_entry_key_t key;
+ l2fib_entry_result_t result;
+ int i, j, k;
+
+ for (i = 0; i < h->nbuckets; i++)
+ {
+ b = &h->buckets[i];
+ if (b->offset == 0)
+ continue;
+ v = BV(clib_bihash_get_value) (h, b->offset);
+ for (j = 0; j < (1<<b->log2_pages); j++)
+ {
+ for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
+ {
+ if (v->kvp[k].key == ~0ULL && v->kvp[k].value == ~0ULL)
+ continue;
+
+ key.raw = v->kvp[k].key;
+ result.raw = v->kvp[k].value;
+
+ if ((bd_index == ~0) || (bd_index == key.fields.bd_index))
+ {
+ vec_add1 (*l2fe_key, key);
+ vec_add1 (*l2fe_res, result);
+ }
+ }
+ v++;
+ }
+ }
+}
+
+// Display the contents of the l2fib
+static clib_error_t *
+show_l2fib (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ l2fib_main_t * msm = &l2fib_main;
+ BVT(clib_bihash) * h = &msm->mac_table;
+ clib_bihash_bucket_t * b;
+ BVT(clib_bihash_value) * v;
+ l2fib_entry_key_t key;
+ l2fib_entry_result_t result;
+ u32 first_entry = 1;
+ u64 total_entries = 0;
+ int i, j, k;
+ u8 verbose = 0;
+ u8 raw = 0;
+ u32 bd_id, bd_index = ~0;
+
+ if (unformat (input, "raw"))
+ raw = 1;
+ else if (unformat (input, "verbose"))
+ verbose = 1;
+ else if (unformat (input, "bd_index %d", &bd_index))
+ verbose = 1;
+ else if (unformat (input, "bd_id %d", &bd_id))
+ {
+ uword *p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (p)
+ {
+ verbose = 1;
+ bd_index = p[0];
+ }
+ else
+ {
+ vlib_cli_output (vm, "no such bridge domain id");
+ return 0;
+ }
+ }
+
+ for (i = 0; i < h->nbuckets; i++)
+ {
+ b = &h->buckets[i];
+ if (b->offset == 0)
+ continue;
+ v = BV(clib_bihash_get_value) (h, b->offset);
+ for (j = 0; j < (1<<b->log2_pages); j++)
+ {
+ for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
+ {
+ if (v->kvp[k].key == ~0ULL && v->kvp[k].value == ~0ULL)
+ continue;
+
+ if (verbose && first_entry)
+ {
+ first_entry=0;
+ vlib_cli_output (vm,
+ "%=19s%=7s%=30s%=7s%=8s%=8s%=5s%=9s%=11s",
+ "Mac Address", "BD Idx", "Interface",
+ "Index", "static", "filter", "bvi",
+ "refresh", "timestamp");
+ }
+
+ key.raw = v->kvp[k].key;
+ result.raw = v->kvp[k].value;
+
+ // bd_index of ~0 (top bit set) means "show all bridge domains"
+ if (verbose
+ && ((bd_index >> 31) || (bd_index == key.fields.bd_index)))
+ {
+ vlib_cli_output (vm,
+ "%=19U%=7d%=30U%=7d%=8d%=8d%=5d%=9d%=11X",
+ format_ethernet_address, key.fields.mac,
+ key.fields.bd_index,
+ format_vnet_sw_if_index_name_with_NA,
+ msm->vnet_main, result.fields.sw_if_index,
+ result.fields.sw_if_index == ~0
+ ? -1 : result.fields.sw_if_index,
+ result.fields.static_mac,
+ result.fields.filter,
+ result.fields.bvi,
+ result.fields.refresh,
+ result.fields.timestamp);
+ }
+ total_entries++;
+ }
+ v++;
+ }
+ }
+
+ if (total_entries == 0)
+ vlib_cli_output (vm, "no l2fib entries");
+ else
+ vlib_cli_output (vm, "%lld l2fib entries", total_entries);
+
+ if (raw)
+ vlib_cli_output (vm, "Raw Hash Table:\n%U\n",
+ BV(format_bihash), h, 1 /* verbose */);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_l2fib_cli, static) = {
+ .path = "show l2fib",
+ .short_help = "show l2fib [verbose | bd_id <nn> | bd_index <nn> | raw]",
+ .function = show_l2fib,
+};
+
+
+// Remove all entries from the l2fib
+void l2fib_clear_table (uint keep_static)
+{
+ l2fib_main_t * mp = &l2fib_main;
+
+ if (keep_static) {
+ // TODO: remove only non-static entries
+ } else {
+ // Remove all entries
+ BV(clib_bihash_free) (&mp->mac_table);
+ BV(clib_bihash_init) (&mp->mac_table, "l2fib mac table",
+ L2FIB_NUM_BUCKETS, L2FIB_MEMORY_SIZE);
+ }
+
+ l2learn_main.global_learn_count = 0;
+}
+
+// Clear all entries in L2FIB
+// TODO: Later we may want a way to remove only the non-static entries
+static clib_error_t *
+clear_l2fib (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2fib_clear_table (0);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (clear_l2fib_cli, static) = {
+ .path = "clear l2fib",
+ .short_help = "Clear l2fib mac forwarding entries",
+ .function = clear_l2fib,
+};
+
+
+// Add an entry to the l2fib.
+// If the entry already exists then overwrite it
+void l2fib_add_entry (u64 mac,
+ u32 bd_index,
+ u32 sw_if_index,
+ u32 static_mac,
+ u32 filter_mac,
+ u32 bvi_mac) {
+ l2fib_entry_key_t key;
+ l2fib_entry_result_t result;
+ __attribute__((unused)) u32 bucket_contents;
+ l2fib_main_t * mp = &l2fib_main;
+ BVT(clib_bihash_kv) kv;
+
+ // set up key
+ key.raw = l2fib_make_key ((u8 *)&mac, bd_index);
+
+ // set up result
+ result.raw = 0; // clear all fields
+ result.fields.sw_if_index = sw_if_index;
+ result.fields.static_mac = static_mac;
+ result.fields.filter = filter_mac;
+ result.fields.bvi = bvi_mac;
+
+ kv.key = key.raw;
+ kv.value = result.raw;
+
+ BV(clib_bihash_add_del) (&mp->mac_table, &kv, 1 /* is_add */);
+
+ // increment counter if dynamically learned mac
+ if (!result.fields.static_mac) {
+ l2learn_main.global_learn_count++;
+ }
+}
+
+// Add an entry to the L2FIB
+// The CLI format is:
+// l2fib add <mac> <bd> <intf> [static] [bvi]
+// l2fib add <mac> <bd> filter
+// Note that filter and bvi entries are always static
+static clib_error_t *
+l2fib_add (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u64 mac;
+ u32 bd_id;
+ u32 bd_index;
+ u32 sw_if_index = ~0;
+ u32 filter_mac = 0;
+ u32 static_mac = 0;
+ u32 bvi_mac = 0;
+ uword * p;
+
+ if (! unformat_user (input, unformat_ethernet_address, &mac))
+ {
+ error = clib_error_return (0, "expected mac address `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (!unformat (input, "%d", &bd_id)) {
+ error = clib_error_return (0, "expected bridge domain ID `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (!p) {
+ error = clib_error_return (0, "bridge domain ID %d invalid", bd_id);
+ goto done;
+ }
+ bd_index = p[0];
+
+ if (unformat (input, "filter")) {
+ filter_mac = 1;
+ static_mac = 1;
+
+ } else {
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index)) {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ if (unformat (input, "static")) {
+ static_mac = 1;
+ } else if (unformat (input, "bvi")) {
+ bvi_mac = 1;
+ static_mac = 1;
+ }
+ }
+
+ l2fib_add_entry(mac, bd_index, sw_if_index, static_mac, filter_mac, bvi_mac);
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (l2fib_add_cli, static) = {
+ .path = "l2fib add",
+ .short_help = "Add l2fib mac forwarding entry <mac> <bd-id> filter | <intf> [static | bvi]",
+ .function = l2fib_add,
+};
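
Hypothetical invocations matching the CLI format above (MAC, bridge-domain id, and interface name are illustrative):

    vpp# l2fib add 00:01:02:03:04:05 200 GigabitEthernet2/0/0 static
    vpp# l2fib add 00:01:02:03:04:06 200 filter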
+
+
+static clib_error_t *
+l2fib_test_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ u64 mac, save_mac;
+ u32 bd_index = 0;
+ u32 sw_if_index = 8;
+ u32 filter_mac = 0;
+ u32 bvi_mac = 0;
+ u32 is_add = 0;
+ u32 is_del = 0;
+ u32 is_check = 0;
+ u32 count = 1;
+ int mac_set = 0;
+ int i;
+
+ while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "mac %U", unformat_ethernet_address, &mac))
+ mac_set = 1;
+ else if (unformat (input, "add"))
+ is_add = 1;
+ else if (unformat (input, "del"))
+ is_del = 1;
+ else if (unformat (input, "check"))
+ is_check = 1;
+ else if (unformat (input, "count %d", &count))
+ ;
+ else
+ break;
+ }
+
+ if (mac_set == 0)
+ return clib_error_return (0, "mac not set");
+
+ if (is_add == 0 && is_del == 0 && is_check == 0)
+ return clib_error_return (0, "noop: pick at least one of (add,del,check)");
+
+ save_mac = mac;
+
+ if (is_add)
+ {
+ for (i = 0; i < count; i++)
+ {
+ u64 tmp;
+ l2fib_add_entry(mac, bd_index, sw_if_index, mac,
+ filter_mac, bvi_mac);
+ tmp = clib_net_to_host_u64(mac);
+ tmp >>= 16;
+ tmp++;
+ tmp <<= 16;
+ mac = clib_host_to_net_u64 (tmp);
+ }
+ }
+
+ if (is_check)
+ {
+ BVT(clib_bihash_kv) kv;
+ l2fib_main_t * mp = &l2fib_main;
+
+ mac = save_mac;
+
+ for (i = 0; i < count; i++)
+ {
+ u64 tmp;
+ kv.key = l2fib_make_key ((u8 *)&mac, bd_index);
+ if (BV(clib_bihash_search) (&mp->mac_table, &kv, &kv))
+ {
+ clib_warning ("key %U AWOL", format_ethernet_address, &mac);
+ break;
+ }
+ tmp = clib_net_to_host_u64(mac);
+ tmp >>= 16;
+ tmp++;
+ tmp <<= 16;
+ mac = clib_host_to_net_u64 (tmp);
+ }
+ }
+
+ if (is_del)
+ {
+ for (i = 0; i < count; i++)
+ {
+ u64 tmp;
+
+ l2fib_del_entry (mac, bd_index);
+
+ tmp = clib_net_to_host_u64(mac);
+ tmp >>= 16;
+ tmp++;
+ tmp <<= 16;
+ mac = clib_host_to_net_u64 (tmp);
+ }
+ }
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (l2fib_test_command, static) = {
+ .path = "test l2fib",
+ .short_help = "test l2fib [del] mac <base-addr> count <nn>",
+ .function = l2fib_test_command_fn,
+};
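+
+// Example sequence (illustrative only): add 100 entries with sequential
+// macs starting at a base address, verify they are all present, then delete:
+//   test l2fib add mac 01:02:03:04:05:06 count 100
+//   test l2fib check mac 01:02:03:04:05:06 count 100
+//   test l2fib del mac 01:02:03:04:05:06 count 100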
+
+
+// Delete an entry from the l2fib.
+// Return 0 if the entry was deleted, or 1 if it was not found
+u32 l2fib_del_entry (u64 mac,
+ u32 bd_index) {
+
+ l2fib_entry_result_t result;
+ l2fib_main_t * mp = &l2fib_main;
+ BVT(clib_bihash_kv) kv;
+
+ // set up key
+ kv.key = l2fib_make_key ((u8 *)&mac, bd_index);
+
+ if (BV(clib_bihash_search) (&mp->mac_table, &kv, &kv))
+ return 1;
+
+ result.raw = kv.value;
+
+ // decrement counter if this was a dynamically learned mac
+ if (!result.fields.static_mac) {
+ if (l2learn_main.global_learn_count > 0) {
+ l2learn_main.global_learn_count--;
+ }
+ }
+
+ // Remove entry from hash table
+ BV(clib_bihash_add_del) (&mp->mac_table, &kv, 0 /* is_add */);
+ return 0;
+}
+
+// Delete an entry from the L2FIB
+// The CLI format is:
+// l2fib del <mac> <bd-id>
+static clib_error_t *
+l2fib_del (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ bd_main_t * bdm = &bd_main;
+ clib_error_t * error = 0;
+ u64 mac;
+ u32 bd_id;
+ u32 bd_index;
+ uword * p;
+
+ if (! unformat_user (input, unformat_ethernet_address, &mac))
+ {
+ error = clib_error_return (0, "expected mac address `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (!unformat (input, "%d", &bd_id)) {
+ error = clib_error_return (0, "expected bridge domain ID `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ p = hash_get (bdm->bd_index_by_bd_id, bd_id);
+ if (!p) {
+ error = clib_error_return (0, "bridge domain ID %d invalid", bd_id);
+ goto done;
+ }
+ bd_index = p[0];
+
+ // Delete the entry
+ if (l2fib_del_entry(mac, bd_index)) {
+ error = clib_error_return (0, "mac entry not found");
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (l2fib_del_cli, static) = {
+ .path = "l2fib del",
+ .short_help = "Delete l2fib mac forwarding entry <mac> <bd-id>",
+ .function = l2fib_del,
+};
+
+
+BVT(clib_bihash) *get_mac_table(void) {
+ l2fib_main_t * mp = &l2fib_main;
+ return &mp->mac_table;
+}
+
+clib_error_t *l2fib_init (vlib_main_t *vm)
+{
+ l2fib_main_t * mp = &l2fib_main;
+ l2fib_entry_key_t test_key;
+ u8 test_mac[6];
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Create the hash table
+ BV(clib_bihash_init) (&mp->mac_table, "l2fib mac table",
+ L2FIB_NUM_BUCKETS, L2FIB_MEMORY_SIZE);
+
+ // verify the key constructor is good, since it is endian-sensitive
+ test_mac[0] = 0x11;
+ test_key.raw = 0;
+ test_key.raw = l2fib_make_key ((u8 *)&test_mac, 0x1234);
+ ASSERT (test_key.fields.mac[0] == 0x11);
+ ASSERT (test_key.fields.bd_index == 0x1234);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2fib_init);
+
diff --git a/vnet/vnet/l2/l2_fib.h b/vnet/vnet/l2/l2_fib.h
new file mode 100644
index 00000000000..1dcc0200f60
--- /dev/null
+++ b/vnet/vnet/l2/l2_fib.h
@@ -0,0 +1,226 @@
+/*
+ * l2_fib.h : layer 2 forwarding table (aka mac table)
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_l2fib_h
+#define included_l2fib_h
+
+#include <vlib/vlib.h>
+#include <vppinfra/bihash_8_8.h>
+
+/*
+ * The size of the hash table
+ */
+#define L2FIB_NUM_BUCKETS (64 * 1024)
+#define L2FIB_MEMORY_SIZE (256<<20)
+
+/*
+ * The L2fib key is the mac address and bridge domain ID
+ */
+typedef struct {
+ union {
+ struct {
+ u16 bd_index;
+ u8 mac[6];
+ } fields;
+ struct {
+ u32 w0;
+ u32 w1;
+ } words;
+ u64 raw;
+ };
+} l2fib_entry_key_t;
+
+/*
+ * The l2fib entry results
+ */
+typedef struct {
+ union {
+ struct {
+ u32 sw_if_index; // output sw_if_index (L3 interface if bvi==1)
+
+ u8 static_mac:1; // static mac, no dataplane learning
+ u8 bvi:1; // mac is for a bridged virtual interface
+ u8 filter:1; // drop packets to/from this mac
+ u8 refresh:1; // refresh flag for aging
+ u8 unused1:4;
+ u8 timestamp; // timestamp for aging
+ u16 unused2;
+ } fields;
+ u64 raw;
+ };
+} l2fib_entry_result_t;
+
+
+// Compute the hash for the given key and return the corresponding bucket index
+always_inline
+u32 l2fib_compute_hash_bucket (l2fib_entry_key_t *key) {
+ u32 result;
+ u32 temp_a;
+ u32 temp_b;
+
+ result = 0xa5a5a5a5; // some seed
+ temp_a = key->words.w0;
+ temp_b = key->words.w1;
+ hash_mix32(temp_a, temp_b, result);
+
+ return result % L2FIB_NUM_BUCKETS;
+}
+
+always_inline
+u64 l2fib_make_key (u8 * mac_address, u16 bd_index) {
+ u64 temp;
+
+ // The mac address in memory is A:B:C:D:E:F
+ // The bd id in register is H:L
+#if CLIB_ARCH_IS_LITTLE_ENDIAN
+ // Create the in-register key as F:E:D:C:B:A:H:L
+ // In memory the key is L:H:A:B:C:D:E:F
+ temp = *((u64 *)(mac_address - 2));
+ temp = (temp & ~0xffff) | (u64)(bd_index);
+#else
+ // Create the in-register key as H:L:A:B:C:D:E:F
+ // In memory the key is H:L:A:B:C:D:E:F
+ temp = *((u64 *)(mac_address)) >> 16;
+ temp = temp | (((u64) bd_index) << 48);
+#endif
+
+ return temp;
+}
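+
+// Worked example (illustrative, little-endian host): for
+// mac = 00:01:02:03:04:05 and bd_index = 0x1234, the in-register key is
+// 0x0504030201001234, i.e. the mac bytes reversed into the upper 48 bits
+// with bd_index in the low 16 bits. Stored back to memory this is
+// 34:12:00:01:02:03:04:05, matching the l2fib_entry_key_t field layout.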
+
+
+
+// Lookup the entry for mac and bd_index in the mac table for 1 packet.
+// Cached_key and cached_result are used as a one-entry cache.
+// The function reads and updates them as needed.
+//
+// mac0 and bd_index0 are the keys. The entry is written to result0.
+// If the entry was not found, result0 is set to ~0.
+//
+// key0 and bucket0 return with the computed key and hash bucket,
+// convenient if the entry needs to be updated afterward.
+// If the cached_result was used, bucket0 is set to ~0.
+
+static_always_inline void
+l2fib_lookup_1 (BVT(clib_bihash) * mac_table,
+ l2fib_entry_key_t * cached_key,
+ l2fib_entry_result_t * cached_result,
+ u8 * mac0,
+ u16 bd_index0,
+ l2fib_entry_key_t * key0,
+ u32 * bucket0,
+ l2fib_entry_result_t *result0)
+{
+ // set up key
+ key0->raw = l2fib_make_key (mac0, bd_index0);
+ *bucket0 = ~0;
+
+ if (key0->raw == cached_key->raw) {
+ // Hit in the one-entry cache
+ result0->raw = cached_result->raw;
+ } else {
+ // Do a regular mac table lookup
+ BVT(clib_bihash_kv) kv;
+
+ kv.key = key0->raw;
+ kv.value = ~0ULL;
+ BV(clib_bihash_search_inline) (mac_table, &kv);
+ result0->raw = kv.value;
+
+ // Update one-entry cache
+ cached_key->raw = key0->raw;
+ cached_result->raw = result0->raw;
+ }
+}
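+
+// Minimal caller sketch (illustrative only): the one-entry cache must be
+// invalidated at the start of each frame in case the mac table was updated.
+// "mac0" and "bd0" are assumed inputs.
+//
+//   l2fib_entry_key_t cached_key, key0;
+//   l2fib_entry_result_t cached_result, result0;
+//   u32 bucket0;
+//   cached_key.raw = ~0;     // force a miss on the first lookup
+//   cached_result.raw = ~0;
+//   l2fib_lookup_1 (get_mac_table(), &cached_key, &cached_result,
+//                   mac0, bd0, &key0, &bucket0, &result0);
+//   if (result0.raw == ~0) { /* miss: flood or drop */ }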
+
+
+// Lookup the entry for mac and bd_index in the mac table for 2 packets.
+// The lookups for the two packets are interleaved.
+//
+// Cached_key and cached_result are used as a one-entry cache.
+// The function reads and updates them as needed.
+//
+// mac0 and bd_index0 are the keys. The entry is written to result0.
+// If the entry was not found, result0 is set to ~0. The same
+// holds for mac1/bd_index1/result1.
+
+static_always_inline void
+l2fib_lookup_2 (BVT(clib_bihash) * mac_table,
+ l2fib_entry_key_t * cached_key,
+ l2fib_entry_result_t * cached_result,
+ u8 * mac0,
+ u8 * mac1,
+ u16 bd_index0,
+ u16 bd_index1,
+ l2fib_entry_key_t * key0,
+ l2fib_entry_key_t * key1,
+ u32 * bucket0,
+ u32 * bucket1,
+ l2fib_entry_result_t *result0,
+ l2fib_entry_result_t *result1)
+{
+ // set up key
+ key0->raw = l2fib_make_key (mac0, bd_index0);
+ key1->raw = l2fib_make_key (mac1, bd_index1);
+
+ if ((key0->raw == cached_key->raw) &&
+ (key1->raw == cached_key->raw)) {
+ // Both hit in the one-entry cache
+ result0->raw = cached_result->raw;
+ result1->raw = cached_result->raw;
+ *bucket0 = ~0;
+ *bucket1 = ~0;
+
+ } else {
+ BVT(clib_bihash_kv) kv0, kv1;
+
+ // Do a regular mac table lookup
+ // Interleave lookups for packet 0 and packet 1
+ kv0.key = key0->raw;
+ kv1.key = key1->raw;
+ kv0.value = ~0ULL;
+ kv1.value = ~0ULL;
+
+ BV(clib_bihash_search_inline) (mac_table, &kv0);
+ BV(clib_bihash_search_inline) (mac_table, &kv1);
+
+ result0->raw = kv0.value;
+ result1->raw = kv1.value;
+
+ // Update one-entry cache
+ cached_key->raw = key1->raw;
+ cached_result->raw = result1->raw;
+ }
+}
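+
+// Note on the cache update above: on a miss the cache keeps the second
+// packet's key/result, presumably because the most recently seen
+// destination is the most likely to recur in subsequent packets.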
+
+
+BVT(clib_bihash) *get_mac_table(void);
+void l2fib_clear_table (uint keep_static);
+void l2fib_add_entry (u64 mac,
+ u32 bd_index,
+ u32 sw_if_index,
+ u32 static_mac,
+ u32 drop_mac,
+ u32 bvi_mac);
+u32 l2fib_del_entry (u64 mac,
+ u32 bd_index);
+
+void l2fib_table_dump (u32 bd_index, l2fib_entry_key_t **l2fe_key,
+ l2fib_entry_result_t **l2fe_res);
+
+u8 * format_vnet_sw_if_index_name_with_NA (u8 * s, va_list * args);
+
+#endif
diff --git a/vnet/vnet/l2/l2_flood.c b/vnet/vnet/l2/l2_flood.c
new file mode 100644
index 00000000000..8a702168715
--- /dev/null
+++ b/vnet/vnet/l2/l2_flood.c
@@ -0,0 +1,520 @@
+/*
+ * l2_flood.c : layer 2 flooding
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_bvi.h>
+#include <vnet/replication.h>
+#include <vnet/l2/l2_fib.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+
+
+/*
+ * Flooding uses the packet replication infrastructure to send a copy of the
+ * packet to each member interface. Logically the replication infrastructure
+ * expects two graph nodes: a prep node that initiates replication and sends the
+ * packet to the first destination, and a recycle node that is passed the packet
+ * after it has been transmitted.
+ *
+ * To decrease the amount of code, l2 flooding implements both functions in
+ * the same graph node. This node can tell whether it is being called as the "prep"
+ * or "recycle" using replication_is_recycled().
+ */
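+
+/*
+ * Sketch of the resulting per-packet control flow (illustrative only,
+ * simplified from l2flood_process below):
+ *
+ *   if (!replication_is_recycled (b0)) {
+ *     // "prep" pass: pick the first eligible member; if more copies are
+ *     // needed, save the member list and counter in a replication context
+ *     ctx = replication_prep (vm, b0, l2flood_node.index, 1);
+ *   } else {
+ *     // "recycle" pass: restore the context, pick the next eligible
+ *     // member, and tell the replicator whether this is the last copy
+ *     ctx = replication_get_ctx (b0);
+ *     replication_recycle (vm, b0, is_last);
+ *   }
+ */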
+
+
+typedef struct {
+
+ // Next nodes for each feature
+ u32 feat_next_node_index[32];
+
+ // next node index for the L3 input node of each ethertype
+ next_by_ethertype_t l3_next;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2flood_main_t;
+
+typedef struct {
+ u8 src[6];
+ u8 dst[6];
+ u32 sw_if_index;
+ u16 bd_index;
+} l2flood_trace_t;
+
+
+/* packet trace format function */
+static u8 * format_l2flood_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2flood_trace_t * t = va_arg (*args, l2flood_trace_t *);
+
+ s = format (s, "l2-flood: sw_if_index %d dst %U src %U bd_index %d",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src,
+ t->bd_index);
+ return s;
+}
+
+l2flood_main_t l2flood_main;
+
+static vlib_node_registration_t l2flood_node;
+
+#define foreach_l2flood_error \
+_(L2FLOOD, "L2 flood packets") \
+_(REPL_FAIL, "L2 replication failures") \
+_(NO_MEMBERS, "L2 replication complete") \
+_(BVI_TAGGED, "BVI packet with vlan tag") \
+_(BVI_ETHERTYPE, "BVI packet with unhandled ethertype")
+
+typedef enum {
+#define _(sym,str) L2FLOOD_ERROR_##sym,
+ foreach_l2flood_error
+#undef _
+ L2FLOOD_N_ERROR,
+} l2flood_error_t;
+
+static char * l2flood_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2flood_error
+#undef _
+};
+
+typedef enum {
+ L2FLOOD_NEXT_L2_OUTPUT,
+ L2FLOOD_NEXT_DROP,
+ L2FLOOD_N_NEXT,
+} l2flood_next_t;
+
+/*
+ * Perform flooding on one packet
+ *
+ * Due to the way BVI processing can modify the packet, the BVI interface
+ * (if present) must be processed last in the replication. The member vector
+ * is arranged so that the BVI interface is always the first element.
+ * Flooding walks the vector in reverse.
+ *
+ * BVI processing causes the packet to go to L3 processing. This strips the
+ * L2 header, which is fine because the replication infrastructure restores
+ * it. However L3 processing can trigger larger changes to the packet. For
+ * example, an ARP request could be turned into an ARP reply, an ICMP request
+ * could be turned into an ICMP reply. If BVI processing is not performed
+ * last, the modified packet would be replicated to the remaining members.
+ */
+
+static_always_inline void
+l2flood_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ l2flood_main_t * msm,
+ u64 * counter_base,
+ vlib_buffer_t * b0,
+ u32 sw_if_index0,
+ l2fib_entry_key_t * key0,
+ u32 * bucket0,
+ l2fib_entry_result_t * result0,
+ u32 * next0)
+{
+ u16 bd_index0;
+ l2_bridge_domain_t *bd_config;
+ l2_flood_member_t * members;
+ i32 current_member; // signed
+ replication_context_t * ctx;
+ u8 in_shg = vnet_buffer(b0)->l2.shg;
+
+ if (!replication_is_recycled(b0)) {
+
+ // Do flood "prep node" processing
+
+ // Get config for the bridge domain interface
+ bd_index0 = vnet_buffer(b0)->l2.bd_index;
+ bd_config = vec_elt_at_index(l2input_main.bd_configs, bd_index0);
+ members = bd_config->members;
+
+ // Find first member that passes the reflection and SHG checks
+ current_member = vec_len(members) - 1;
+ while ((current_member >= 0) &&
+ ((members[current_member].sw_if_index == sw_if_index0) ||
+ (in_shg && members[current_member].shg == in_shg))) {
+ current_member--;
+ }
+
+ if (current_member < 0) {
+ // No members to flood to
+ *next0 = L2FLOOD_NEXT_DROP;
+ b0->error = node->errors[L2FLOOD_ERROR_NO_MEMBERS];
+ return;
+ }
+
+ if ((current_member > 0) &&
+ ((current_member > 1) ||
+ ((members[0].sw_if_index != sw_if_index0) &&
+ (!in_shg || members[0].shg != in_shg)))) {
+ // If more than one member then initiate replication
+ ctx = replication_prep (vm, b0, l2flood_node.index, 1 /* l2_packet */);
+ ctx->feature_replicas = (u64) members;
+ ctx->feature_counter = current_member;
+ }
+
+ } else {
+
+ // Do flood "recycle node" processing
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_REPL_FAIL))
+ {
+ (void)replication_recycle (vm, b0, 1 /* is_last */);
+ *next0 = L2FLOOD_NEXT_DROP;
+ b0->error = node->errors[L2FLOOD_ERROR_REPL_FAIL];
+ return;
+ }
+
+ ctx = replication_get_ctx (b0);
+ replication_clear_recycled (b0);
+
+ members = (l2_flood_member_t *) ctx->feature_replicas;
+ current_member = (i32)ctx->feature_counter - 1;
+
+ // Find next member that passes the reflection and SHG check
+ while ((current_member >= 0) &&
+ ((members[current_member].sw_if_index == sw_if_index0) ||
+ (in_shg && members[current_member].shg == in_shg))) {
+ current_member--;
+ }
+
+ if (current_member < 0) {
+ // No more members to flood to.
+ // Terminate replication and drop packet.
+
+ replication_recycle (vm, b0, 1 /* is_last */);
+
+ *next0 = L2FLOOD_NEXT_DROP;
+ // Ideally we wouldn't bump a counter here, just silently complete
+ b0->error = node->errors[L2FLOOD_ERROR_NO_MEMBERS];
+ return;
+ }
+
+ // Restore packet and context and continue replication
+ ctx->feature_counter = current_member;
+ replication_recycle (vm, b0,
+ ((current_member == 0) || /*is_last */
+ ((current_member == 1) &&
+ ((members[0].sw_if_index == sw_if_index0) ||
+ (in_shg && members[0].shg == in_shg)))));
+ }
+
+ // Forward packet to the current member
+
+ if (PREDICT_TRUE(members[current_member].flags == L2_FLOOD_MEMBER_NORMAL)) {
+ // Do normal L2 forwarding
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = members[current_member].sw_if_index;
+ *next0 = L2FLOOD_NEXT_L2_OUTPUT;
+
+ } else {
+ // Do BVI processing
+ u32 rc;
+ rc = l2_to_bvi (vm,
+ msm->vnet_main,
+ b0,
+ members[current_member].sw_if_index,
+ &msm->l3_next,
+ next0);
+
+ if (PREDICT_FALSE(rc)) {
+ if (rc == TO_BVI_ERR_TAGGED) {
+ b0->error = node->errors[L2FLOOD_ERROR_BVI_TAGGED];
+ *next0 = L2FLOOD_NEXT_DROP;
+ } else if (rc == TO_BVI_ERR_ETHERTYPE) {
+ b0->error = node->errors[L2FLOOD_ERROR_BVI_ETHERTYPE];
+ *next0 = L2FLOOD_NEXT_DROP;
+ }
+ }
+ }
+
+}
+
+
+static uword
+l2flood_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2flood_next_t next_index;
+ l2flood_main_t * msm = &l2flood_main;
+ vlib_node_t *n = vlib_get_node (vm, l2flood_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 6 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ ethernet_header_t * h0, * h1;
+ l2fib_entry_key_t key0, key1;
+ l2fib_entry_result_t result0, result1;
+ u32 bucket0, bucket1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3, * p4, * p5;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+ p4 = vlib_get_buffer (vm, from[4]);
+ p5 = vlib_get_buffer (vm, from[5]);
+
+ // Prefetch the buffer header for the N+2 loop iteration
+ vlib_prefetch_buffer_header (p4, LOAD);
+ vlib_prefetch_buffer_header (p5, LOAD);
+
+ // Prefetch the replication context for the N+1 loop iteration
+ // This depends on the buffer header above
+ replication_prefetch_ctx (p2);
+ replication_prefetch_ctx (p3);
+
+ // Prefetch the packet for the N+1 loop iteration
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* RX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* Process 2 x pkts */
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ /* process 2 pkts */
+ em->counters[node_counter_base_index + L2FLOOD_ERROR_L2FLOOD] += 2;
+
+ l2flood_process (vm, node, msm, &em->counters[node_counter_base_index],
+ b0, sw_if_index0, &key0, &bucket0, &result0, &next0);
+
+ l2flood_process (vm, node, msm, &em->counters[node_counter_base_index],
+ b1, sw_if_index1, &key1, &bucket1, &result1, &next1);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ l2flood_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->bd_index = vnet_buffer(b0)->l2.bd_index;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ l2flood_trace_t *t = vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->bd_index = vnet_buffer(b1)->l2.bd_index;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ ethernet_header_t * h0;
+ l2fib_entry_key_t key0;
+ l2fib_entry_result_t result0;
+ u32 bucket0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ h0 = vlib_buffer_get_current (b0);
+
+ /* process 1 pkt */
+ em->counters[node_counter_base_index + L2FLOOD_ERROR_L2FLOOD] += 1;
+
+ l2flood_process (vm, node, msm, &em->counters[node_counter_base_index],
+ b0, sw_if_index0, &key0, &bucket0, &result0, &next0);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) &&
+ (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ l2flood_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->bd_index = vnet_buffer(b0)->l2.bd_index;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2flood_node,static) = {
+ .function = l2flood_node_fn,
+ .name = "l2-flood",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2flood_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2flood_error_strings),
+ .error_strings = l2flood_error_strings,
+
+ .n_next_nodes = L2FLOOD_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2FLOOD_NEXT_L2_OUTPUT] = "l2-output",
+ [L2FLOOD_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2flood_init (vlib_main_t *vm)
+{
+ l2flood_main_t * mp = &l2flood_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2flood_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ mp->feat_next_node_index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2flood_init);
+
+
+
+// Add the L3 input node for this ethertype to the next nodes structure
+void
+l2flood_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index)
+{
+ l2flood_main_t * mp = &l2flood_main;
+ u32 next_index;
+
+ next_index = vlib_node_add_next (vm,
+ l2flood_node.index,
+ node_index);
+
+ next_by_ethertype_register (&mp->l3_next, type, next_index);
+}
+
+
+// set subinterface flood enable/disable
+// The CLI format is:
+// set interface l2 flood <interface> [disable]
+static clib_error_t *
+int_flood (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 enable;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the interface flag
+ l2input_intf_bitmap_enable(sw_if_index, L2INPUT_FEAT_FLOOD, enable);
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_flood_cli, static) = {
+ .path = "set interface l2 flood",
+ .short_help = "set interface l2 flood <interface> [disable]",
+ .function = int_flood,
+};
diff --git a/vnet/vnet/l2/l2_flood.h b/vnet/vnet/l2/l2_flood.h
new file mode 100644
index 00000000000..3c9273d48d5
--- /dev/null
+++ b/vnet/vnet/l2/l2_flood.h
@@ -0,0 +1,28 @@
+/*
+ * l2_flood.h : layer 2 flooding
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_l2flood_h
+#define included_l2flood_h
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+
+void
+l2flood_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index);
+#endif
diff --git a/vnet/vnet/l2/l2_fwd.c b/vnet/vnet/l2/l2_fwd.c
new file mode 100644
index 00000000000..089d4008ea8
--- /dev/null
+++ b/vnet/vnet/l2/l2_fwd.c
@@ -0,0 +1,446 @@
+/*
+ * l2_fwd.c : layer 2 forwarding using l2fib
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vlib/cli.h>
+
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/l2_bvi.h>
+#include <vnet/l2/l2_fwd.h>
+#include <vnet/l2/l2_fib.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/sparse_vec.h>
+
+
+typedef struct {
+
+ // Hash table
+ BVT(clib_bihash) *mac_table;
+
+ // next node index for the L3 input node of each ethertype
+ next_by_ethertype_t l3_next;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2fwd_main_t;
+
+typedef struct {
+ /* per-pkt trace data */
+ u8 src[6];
+ u8 dst[6];
+ u32 sw_if_index;
+ u16 bd_index;
+} l2fwd_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2fwd_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2fwd_trace_t * t = va_arg (*args, l2fwd_trace_t *);
+
+ s = format (s, "l2-fwd: sw_if_index %d dst %U src %U bd_index %d",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src,
+ t->bd_index);
+ return s;
+}
+
+l2fwd_main_t l2fwd_main;
+
+static vlib_node_registration_t l2fwd_node;
+
+#define foreach_l2fwd_error \
+_(L2FWD, "L2 forward packets") \
+_(FLOOD, "L2 forward misses") \
+_(HIT, "L2 forward hits") \
+_(BVI_TAGGED, "BVI packet with vlan tag") \
+_(BVI_ETHERTYPE, "BVI packet with unhandled ethertype") \
+_(FILTER_DROP, "Filter Mac Drop") \
+_(REFLECT_DROP, "Reflection Drop")
+
+typedef enum {
+#define _(sym,str) L2FWD_ERROR_##sym,
+ foreach_l2fwd_error
+#undef _
+ L2FWD_N_ERROR,
+} l2fwd_error_t;
+
+static char * l2fwd_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2fwd_error
+#undef _
+};
+
+typedef enum {
+ L2FWD_NEXT_L2_OUTPUT,
+ L2FWD_NEXT_FLOOD,
+ L2FWD_NEXT_DROP,
+ L2FWD_N_NEXT,
+} l2fwd_next_t;
+
+// Forward one packet based on the mac table lookup result
+
+static_always_inline void
+l2fwd_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ l2fwd_main_t * msm,
+ vlib_error_main_t * em,
+ vlib_buffer_t * b0,
+ u32 sw_if_index0,
+ l2fib_entry_result_t * result0,
+ u32 * next0)
+{
+ if (PREDICT_FALSE (result0->raw == ~0)) {
+ // Lookup miss: hand the packet to l2-flood for unknown-unicast
+ // flooding, unless flooding is disabled on this interface
+ if (vnet_buffer(b0)->l2.feature_bitmap & L2INPUT_FEAT_UU_FLOOD) {
+ *next0 = L2FWD_NEXT_FLOOD;
+ } else {
+ // Flooding is disabled
+ b0->error = node->errors[L2FWD_ERROR_FLOOD];
+ *next0 = L2FWD_NEXT_DROP;
+ }
+
+ } else {
+
+ // lookup hit, forward packet
+#ifdef COUNTERS
+ em->counters[node_counter_base_index + L2FWD_ERROR_HIT] += 1;
+#endif
+
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = result0->fields.sw_if_index;
+ *next0 = L2FWD_NEXT_L2_OUTPUT;
+
+ // perform reflection check
+ if (PREDICT_FALSE (sw_if_index0 == result0->fields.sw_if_index)) {
+ b0->error = node->errors[L2FWD_ERROR_REFLECT_DROP];
+ *next0 = L2FWD_NEXT_DROP;
+
+ // perform filter check
+ } else if (PREDICT_FALSE (result0->fields.filter)) {
+ b0->error = node->errors[L2FWD_ERROR_FILTER_DROP];
+ *next0 = L2FWD_NEXT_DROP;
+
+ // perform BVI check
+ } else if (PREDICT_FALSE (result0->fields.bvi)) {
+ u32 rc;
+ rc = l2_to_bvi (vm,
+ msm->vnet_main,
+ b0,
+ vnet_buffer(b0)->sw_if_index[VLIB_TX],
+ &msm->l3_next,
+ next0);
+
+ if (PREDICT_FALSE(rc)) {
+ if (rc == TO_BVI_ERR_TAGGED) {
+ b0->error = node->errors[L2FWD_ERROR_BVI_TAGGED];
+ *next0 = L2FWD_NEXT_DROP;
+ } else if (rc == TO_BVI_ERR_ETHERTYPE) {
+ b0->error = node->errors[L2FWD_ERROR_BVI_ETHERTYPE];
+ *next0 = L2FWD_NEXT_DROP;
+ }
+ }
+ }
+ }
+}
+
+
+static uword
+l2fwd_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2fwd_next_t next_index;
+ l2fwd_main_t * msm = &l2fwd_main;
+ vlib_node_t *n = vlib_get_node (vm, l2fwd_node.index);
+ CLIB_UNUSED(u32 node_counter_base_index) = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ l2fib_entry_key_t cached_key;
+ l2fib_entry_result_t cached_result;
+
+ // Clear the one-entry cache in case mac table was updated
+ cached_key.raw = ~0;
+ cached_result.raw = ~0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ ethernet_header_t * h0, * h1;
+ l2fib_entry_key_t key0, key1;
+ l2fib_entry_result_t result0, result1;
+ u32 bucket0, bucket1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* RX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2fwd_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->bd_index = vnet_buffer(b0)->l2.bd_index;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2fwd_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->bd_index = vnet_buffer(b1)->l2.bd_index;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ }
+ }
+
+ /* process 2 pkts */
+#ifdef COUNTERS
+ em->counters[node_counter_base_index + L2FWD_ERROR_L2FWD] += 2;
+#endif
+ l2fib_lookup_2 (msm->mac_table, &cached_key, &cached_result,
+ h0->dst_address,
+ h1->dst_address,
+ vnet_buffer(b0)->l2.bd_index,
+ vnet_buffer(b1)->l2.bd_index,
+ &key0, // not used
+ &key1, // not used
+ &bucket0, // not used
+ &bucket1, // not used
+ &result0,
+ &result1);
+ l2fwd_process (vm, node, msm, em, b0, sw_if_index0, &result0, &next0);
+ l2fwd_process (vm, node, msm, em, b1, sw_if_index1, &result1, &next1);
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ ethernet_header_t * h0;
+ l2fib_entry_key_t key0;
+ l2fib_entry_result_t result0;
+ u32 bucket0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ h0 = vlib_buffer_get_current (b0);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ l2fwd_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->bd_index = vnet_buffer(b0)->l2.bd_index;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+
+ /* process 1 pkt */
+#ifdef COUNTERS
+ em->counters[node_counter_base_index + L2FWD_ERROR_L2FWD] += 1;
+#endif
+ l2fib_lookup_1 (msm->mac_table, &cached_key, &cached_result,
+ h0->dst_address, vnet_buffer(b0)->l2.bd_index,
+ &key0, // not used
+ &bucket0, // not used
+ &result0);
+ l2fwd_process (vm, node, msm, em, b0, sw_if_index0, &result0, &next0);
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (l2fwd_node,static) = {
+ .function = l2fwd_node_fn,
+ .name = "l2-fwd",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2fwd_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2fwd_error_strings),
+ .error_strings = l2fwd_error_strings,
+
+ .n_next_nodes = L2FWD_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2FWD_NEXT_L2_OUTPUT] = "l2-output",
+ [L2FWD_NEXT_FLOOD] = "l2-flood",
+ [L2FWD_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2fwd_init (vlib_main_t *vm)
+{
+ l2fwd_main_t * mp = &l2fwd_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ /* init the hash table ptr */
+ mp->mac_table = get_mac_table();
+
+ // Initialize the next nodes for each ethertype
+ next_by_ethertype_init (&mp->l3_next);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2fwd_init);
+
+
+// Add the L3 input node for this ethertype to the next nodes structure
+void
+l2fwd_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index)
+{
+ l2fwd_main_t * mp = &l2fwd_main;
+ u32 next_index;
+
+ next_index = vlib_node_add_next (vm,
+ l2fwd_node.index,
+ node_index);
+
+ next_by_ethertype_register (&mp->l3_next, type, next_index);
+}
+
+
+// set subinterface forward enable/disable
+// The CLI format is:
+// set interface l2 forward <interface> [disable]
+static clib_error_t *
+int_fwd (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 enable;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the interface flag
+ if (l2input_intf_config(sw_if_index)->xconnect) {
+ l2input_intf_bitmap_enable(sw_if_index, L2INPUT_FEAT_XCONNECT, enable);
+ } else {
+ l2input_intf_bitmap_enable(sw_if_index, L2INPUT_FEAT_FWD, enable);
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_fwd_cli, static) = {
+ .path = "set interface l2 forward",
+ .short_help = "set interface l2 forward <interface> [disable]",
+ .function = int_fwd,
+};
diff --git a/vnet/vnet/l2/l2_fwd.h b/vnet/vnet/l2/l2_fwd.h
new file mode 100644
index 00000000000..f08717dfdf8
--- /dev/null
+++ b/vnet/vnet/l2/l2_fwd.h
@@ -0,0 +1,29 @@
+/*
+ * l2_fwd.c : layer 2 forwarding using l2fib
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_l2fwd_h
+#define included_l2fwd_h
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+
+
+void
+l2fwd_register_input_type (vlib_main_t * vm,
+ ethernet_type_t type,
+ u32 node_index);
+#endif
diff --git a/vnet/vnet/l2/l2_input.c b/vnet/vnet/l2/l2_input.c
new file mode 100644
index 00000000000..34f8a77184f
--- /dev/null
+++ b/vnet/vnet/l2/l2_input.c
@@ -0,0 +1,963 @@
+/*
+ * l2_input.c : layer 2 input packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/l2_output.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_bvi.h>
+#include <vnet/l2/l2_fib.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+
+extern clib_error_t *
+ethernet_arp_hw_interface_link_up_down (vnet_main_t * vnm,
+ u32 hw_if_index,
+ u32 flags);
+
+// Feature graph node names
+static char * l2input_feat_names[] = {
+#define _(sym,name) name,
+ foreach_l2input_feat
+#undef _
+};
+
+char **l2input_get_feat_names(void) {
+ return l2input_feat_names;
+}
+
+
+typedef struct {
+ /* per-pkt trace data */
+ u8 src[6];
+ u8 dst[6];
+ u32 next_index;
+ u32 sw_if_index;
+} l2input_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2input_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2input_trace_t * t = va_arg (*args, l2input_trace_t *);
+
+ s = format (s, "l2-input: sw_if_index %d dst %U src %U",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src);
+ return s;
+}
+
+l2input_main_t l2input_main;
+
+static vlib_node_registration_t l2input_node;
+
+#define foreach_l2input_error \
+_(L2INPUT, "L2 input packets") \
+_(DROP, "L2 input drops")
+
+typedef enum {
+#define _(sym,str) L2INPUT_ERROR_##sym,
+ foreach_l2input_error
+#undef _
+ L2INPUT_N_ERROR,
+} l2input_error_t;
+
+static char * l2input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2input_error
+#undef _
+};
+
+typedef enum { /* */
+ L2INPUT_NEXT_LEARN,
+ L2INPUT_NEXT_FWD,
+ L2INPUT_NEXT_DROP,
+ L2INPUT_N_NEXT,
+} l2input_next_t;
+
+
+static_always_inline void
+classify_and_dispatch (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ u32 cpu_index,
+ l2input_main_t * msm,
+ vlib_buffer_t * b0,
+ u32 *next0)
+{
+ // Load L2 input feature struct
+ // Load bridge domain struct
+ // Parse ethernet header to determine unicast/mcast/broadcast
+ // take L2 input stat
+ // classify packet as IP/UDP/TCP, control, other
+ // mask feature bitmap
+ // go to first node in bitmap
+ // Later: optimize VTM
+ //
+ // For L2XC,
+ // set tx sw-if-handle
+
+ u8 mcast_dmac;
+ __attribute__((unused)) u8 l2bcast;
+ __attribute__((unused)) u8 l2mcast;
+ __attribute__((unused)) u8 l2_stat_kind;
+ u16 ethertype;
+ u8 protocol;
+ l2_input_config_t *config;
+ l2_bridge_domain_t *bd_config;
+ u16 bd_index0;
+ u32 feature_bitmap;
+ u32 feat_mask;
+ ethernet_header_t * h0;
+ u8 * l3h0;
+ u32 sw_if_index0;
+ u8 bvi_flg = 0;
+
+#define get_u32(addr) ( *((u32 *)(addr)) )
+#define get_u16(addr) ( *((u16 *)(addr)) )
+#define STATS_IF_LAYER2_UCAST_INPUT_CNT 0
+#define STATS_IF_LAYER2_MCAST_INPUT_CNT 1
+#define STATS_IF_LAYER2_BCAST_INPUT_CNT 2
+
+ // Check for from-BVI processing
+ // When we come from ethernet-input, TX is ~0
+ if (PREDICT_FALSE (vnet_buffer(b0)->sw_if_index[VLIB_TX] != ~0)) {
+ // Set up for a from-bvi packet
+ bvi_to_l2 (vm,
+ msm->vnet_main,
+ cpu_index,
+ b0,
+ vnet_buffer(b0)->sw_if_index[VLIB_TX]);
+ bvi_flg = 1;
+ }
+
+ // The RX interface can be changed by bvi_to_l2()
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ h0 = vlib_buffer_get_current (b0);
+ l3h0 = (u8 *)h0 + vnet_buffer(b0)->l2.l2_len;
+
+ // Determine L3 packet type. Only need to check the common types.
+ // Used to filter out features that don't apply to common packets.
+ ethertype = clib_net_to_host_u16(get_u16(l3h0 - 2));
+ if (ethertype == ETHERNET_TYPE_IP4) {
+ protocol = ((ip4_header_t *)l3h0)->protocol;
+ if ((protocol == IP_PROTOCOL_UDP) ||
+ (protocol == IP_PROTOCOL_TCP)) {
+ feat_mask = IP_UDP_TCP_FEAT_MASK;
+ } else {
+ feat_mask = IP4_FEAT_MASK;
+ }
+ } else if (ethertype == ETHERNET_TYPE_IP6) {
+ protocol = ((ip6_header_t *)l3h0)->protocol;
+ // Don't bother checking for extension headers for now
+ if ((protocol == IP_PROTOCOL_UDP) ||
+ (protocol == IP_PROTOCOL_TCP)) {
+ feat_mask = IP_UDP_TCP_FEAT_MASK;
+ } else {
+ feat_mask = IP6_FEAT_MASK;
+ }
+ } else if (ethertype == ETHERNET_TYPE_MPLS_UNICAST) {
+ feat_mask = IP6_FEAT_MASK;
+ } else {
+ // allow all features
+ feat_mask = ~0;
+ }
+
+ // determine layer2 kind for stat and mask
+ mcast_dmac = ethernet_address_cast(h0->dst_address);
+ l2bcast = 0;
+ l2mcast = 0;
+ l2_stat_kind = STATS_IF_LAYER2_UCAST_INPUT_CNT;
+ if (PREDICT_FALSE (mcast_dmac)) {
+ u32 *dsthi = (u32 *) &h0->dst_address[0];
+ u32 *dstlo = (u32 *) &h0->dst_address[2];
+
+ // Disable bridge forwarding (flooding will execute instead if not xconnect)
+ feat_mask &= ~(L2INPUT_FEAT_FWD | L2INPUT_FEAT_UU_FLOOD);
+ if (ethertype != ETHERNET_TYPE_ARP) // Disable ARP-term for non-ARP packet
+ feat_mask &= ~(L2INPUT_FEAT_ARP_TERM);
+
+ // dest mac is multicast or broadcast
+ if ((*dstlo == 0xFFFFFFFF) && (*dsthi == 0xFFFFFFFF)) {
+ // dest mac == FF:FF:FF:FF:FF:FF
+ l2_stat_kind = STATS_IF_LAYER2_BCAST_INPUT_CNT;
+ l2bcast=1;
+ } else {
+ l2_stat_kind = STATS_IF_LAYER2_MCAST_INPUT_CNT;
+ l2mcast=1;
+ }
+ }
+ // TODO: take l2 stat
+
+ // Get config for the input interface
+ config = vec_elt_at_index(msm->configs, sw_if_index0);
+
+ // Save split horizon group, use 0 for BVI to make sure not dropped
+ vnet_buffer(b0)->l2.shg = bvi_flg ? 0 : config->shg;
+
+ if (config->xconnect) {
+ // Set the output interface
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = config->output_sw_if_index;
+
+ } else {
+
+ // Do bridge-domain processing
+ bd_index0 = config->bd_index;
+ // save BD ID for next feature graph nodes
+ vnet_buffer(b0)->l2.bd_index = bd_index0;
+
+ // Get config for the bridge domain interface
+ bd_config = vec_elt_at_index(msm->bd_configs, bd_index0);
+
+ // Process bridge domain feature enables.
+ // To perform learning/flooding/forwarding, the corresponding bit
+ // must be enabled in both the input interface config and in the
+ // bridge domain config. In the bd_bitmap, bits for features other
+ // than learning/flooding/forwarding should always be set.
+ feat_mask = feat_mask & bd_config->feature_bitmap;
+ }
+
+ // mask out features from bitmap using packet type and bd config
+ feature_bitmap = config->feature_bitmap & feat_mask;
+
+ // save for next feature graph nodes
+ vnet_buffer(b0)->l2.feature_bitmap = feature_bitmap;
+
+ // Determine the next node
+ *next0 = feat_bitmap_get_next_node_index(msm->feat_next_node_index,
+ feature_bitmap);
+}
+
+
+static uword
+l2input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2input_next_t next_index;
+ l2input_main_t * msm = &l2input_main;
+ vlib_node_t *n = vlib_get_node (vm, l2input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 6 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3, * p4 , * p5;
+ u32 sw_if_index2, sw_if_index3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+ p4 = vlib_get_buffer (vm, from[4]);
+ p5 = vlib_get_buffer (vm, from[5]);
+
+ // Prefetch the buffer header and packet for the N+2 loop iteration
+ vlib_prefetch_buffer_header (p4, LOAD);
+ vlib_prefetch_buffer_header (p5, LOAD);
+
+ CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE);
+
+ // Prefetch the input config for the N+1 loop iteration
+ // This depends on the buffer header above
+ sw_if_index2 = vnet_buffer(p2)->sw_if_index[VLIB_RX];
+ sw_if_index3 = vnet_buffer(p3)->sw_if_index[VLIB_RX];
+ CLIB_PREFETCH (&msm->configs[sw_if_index2], CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (&msm->configs[sw_if_index3], CLIB_CACHE_LINE_BYTES, LOAD);
+
+ // Don't bother prefetching the bridge-domain config (which
+ // depends on the input config above). Only a small number of
+ // bridge domains are expected. Plus the structure is small
+ // and several fit in a cache line.
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) {
+ /* RX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ if (b0->flags & VLIB_BUFFER_IS_TRACED) {
+ ethernet_header_t * h0 = vlib_buffer_get_current (b0);
+ l2input_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED) {
+ ethernet_header_t * h1 = vlib_buffer_get_current (b1);
+ l2input_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ }
+ }
+
+ em->counters[node_counter_base_index + L2INPUT_ERROR_L2INPUT] += 2;
+
+ classify_and_dispatch (vm,
+ node,
+ cpu_index,
+ msm,
+ b0,
+ &next0);
+
+ classify_and_dispatch (vm,
+ node,
+ cpu_index,
+ msm,
+ b1,
+ &next1);
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ ethernet_header_t * h0 = vlib_buffer_get_current (b0);
+ l2input_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+
+ em->counters[node_counter_base_index + L2INPUT_ERROR_L2INPUT] += 1;
+
+ classify_and_dispatch (vm,
+ node,
+ cpu_index,
+ msm,
+ b0,
+ &next0);
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2input_node,static) = {
+ .function = l2input_node_fn,
+ .name = "l2-input",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2input_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2input_error_strings),
+ .error_strings = l2input_error_strings,
+
+ .n_next_nodes = L2INPUT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2INPUT_NEXT_LEARN] = "l2-learn",
+ [L2INPUT_NEXT_FWD] = "l2-fwd",
+ [L2INPUT_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2input_init (vlib_main_t *vm)
+{
+ l2input_main_t * mp = &l2input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Get packets RX'd from L2 interfaces
+ ethernet_register_l2_input (vm, l2input_node.index);
+
+ // Pre-allocate the per-interface config vector with 101 zeroed entries;
+ // it grows on demand in l2input_intf_config()
+ vec_validate(mp->configs, 100);
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2input_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ mp->feat_next_node_index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2input_init);
+
+
+// Get a pointer to the config for the given interface
+l2_input_config_t * l2input_intf_config (u32 sw_if_index)
+{
+ l2input_main_t * mp = &l2input_main;
+
+ vec_validate(mp->configs, sw_if_index);
+ return vec_elt_at_index(mp->configs, sw_if_index);
+}
+
+// Enable (or disable) the feature in the bitmap for the given interface
+u32 l2input_intf_bitmap_enable (u32 sw_if_index,
+ u32 feature_bitmap,
+ u32 enable)
+{
+ l2input_main_t * mp = &l2input_main;
+ l2_input_config_t *config;
+
+ vec_validate(mp->configs, sw_if_index);
+ config = vec_elt_at_index(mp->configs, sw_if_index);
+
+ if (enable) {
+ config->feature_bitmap |= feature_bitmap;
+ } else {
+ config->feature_bitmap &= ~feature_bitmap;
+ }
+
+ return config->feature_bitmap;
+}
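+
+// Example (illustrative only): enable mac learning on an interface and
+// disable unknown-unicast flooding, leaving other feature bits untouched:
+//   l2input_intf_bitmap_enable (sw_if_index, L2INPUT_FEAT_LEARN, 1);
+//   l2input_intf_bitmap_enable (sw_if_index, L2INPUT_FEAT_UU_FLOOD, 0);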
+
+
+
+// Set the subinterface to run in l2 or l3 mode.
+// For L3 mode, just the sw_if_index is specified.
+// For bridged mode, the bd index and bvi flag are also specified.
+// For xconnect mode, the peer sw_if_index is also specified.
+// Return 0 if ok, or non-0 if there was an error.
+
+u32 set_int_l2_mode (vlib_main_t * vm,
+ vnet_main_t * vnet_main,
+ u32 mode,
+ u32 sw_if_index,
+ u32 bd_index, // for bridged interface
+ u32 bvi, // the bridged interface is the BVI
+ u32 shg, // the bridged interface's split horizon group
+ u32 xc_sw_if_index) // peer interface for xconnect
+{
+ l2input_main_t * mp = &l2input_main;
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_hw_interface_t * hi;
+ l2_output_config_t * out_config;
+ l2_input_config_t * config;
+ l2_bridge_domain_t * bd_config;
+ l2_flood_member_t member;
+ u64 mac;
+ i32 l2_if_adjust = 0;
+
+ hi = vnet_get_sup_hw_interface (vnet_main, sw_if_index);
+
+ vec_validate(mp->configs, sw_if_index);
+ config = vec_elt_at_index(mp->configs, sw_if_index);
+
+ if (config->bridge) {
+ // Interface is already in bridge mode. Undo the existing config.
+ bd_config = vec_elt_at_index(mp->bd_configs, config->bd_index);
+
+ // remove interface from flood vector
+ bd_remove_member (bd_config, sw_if_index);
+
+ // undo any BVI-related config
+ if (bd_config->bvi_sw_if_index == sw_if_index) {
+ bd_config->bvi_sw_if_index = ~0;
+ config->bvi = 0;
+
+ // restore output node
+ hi->output_node_index = bd_config->saved_bvi_output_node_index;
+
+ // delete the l2fib entry for the bvi interface
+ mac = *((u64 *)hi->hw_address);
+ l2fib_del_entry (mac, config->bd_index);
+ }
+ l2_if_adjust--;
+ } else if (config->xconnect) {
+ l2_if_adjust--;
+ }
+
+ // Initialize the l2-input configuration for the interface
+ if (mode == MODE_L3) {
+ config->xconnect = 0;
+ config->bridge = 0;
+ config->shg = 0;
+ config->bd_index = 0;
+ config->feature_bitmap = L2INPUT_FEAT_DROP;
+ } else if (mode == MODE_L2_CLASSIFY) {
+ config->xconnect = 1;
+ config->bridge = 0;
+ config->output_sw_if_index = xc_sw_if_index;
+
+ // Make sure last-chance drop is configured
+ config->feature_bitmap |= L2INPUT_FEAT_DROP | L2INPUT_FEAT_CLASSIFY;
+
+ // Make sure bridging features are disabled
+ config->feature_bitmap &=
+ ~(L2INPUT_FEAT_LEARN | L2INPUT_FEAT_FWD | L2INPUT_FEAT_FLOOD);
+ shg = 0; // not used in xconnect
+
+ // Ensure all packets go to ethernet-input
+ ethernet_set_rx_redirect (vnet_main, hi, 1);
+ } else {
+
+ if (mode == MODE_L2_BRIDGE) {
+ /*
+ * Remove a check that the interface must be an Ethernet.
+ * Specifically so we can bridge to L3 tunnel interfaces.
+ * Here's the check:
+ * if (hi->hw_class_index != ethernet_hw_interface_class.index)
+ *
+ */
+ if (!hi)
+ return MODE_ERROR_ETH; // non-ethernet
+
+ config->xconnect = 0;
+ config->bridge = 1;
+ config->bd_index = bd_index;
+
+ // Enable forwarding, flooding, learning and ARP termination by default
+ // (note that ARP term is disabled on BD feature bitmap by default)
+ config->feature_bitmap |= L2INPUT_FEAT_FWD | L2INPUT_FEAT_UU_FLOOD |
+ L2INPUT_FEAT_FLOOD | L2INPUT_FEAT_LEARN | L2INPUT_FEAT_ARP_TERM;
+
+ // Make sure last-chance drop is configured
+ config->feature_bitmap |= L2INPUT_FEAT_DROP;
+
+ // Make sure xconnect is disabled
+ config->feature_bitmap &= ~L2INPUT_FEAT_XCONNECT;
+
+ // Set up bridge domain
+ vec_validate(mp->bd_configs, bd_index);
+ bd_config = vec_elt_at_index(mp->bd_configs, bd_index);
+ bd_validate (bd_config);
+
+ // TODO: think: add l2fib entry even for non-bvi interface?
+
+ // Do BVI interface initializations
+ if (bvi) {
+ // Ensure the BD has no existing BVI interface (TODO: or replace it with this one?)
+ if (bd_config->bvi_sw_if_index != ~0) {
+ return MODE_ERROR_BVI_DEF; // bd already has a bvi interface
+ }
+ bd_config->bvi_sw_if_index = sw_if_index;
+ config->bvi = 1;
+
+ // make BVI outputs go to l2-input
+ bd_config->saved_bvi_output_node_index = hi->output_node_index;
+ hi->output_node_index = l2input_node.index;
+
+ // create the l2fib entry for the bvi interface
+ mac = *((u64 *)hi->hw_address);
+ l2fib_add_entry (mac, bd_index, sw_if_index, 1, 0, 1); // static + bvi
+
+ // Disable learning by default; there is no use for it since the l2fib entry is static.
+ config->feature_bitmap &= ~L2INPUT_FEAT_LEARN;
+
+ // Add BVI to arp_input_next_index_by_hw_if_index table so arp-input
+ // node can send out ARP response via BVI to BD
+ ethernet_arp_hw_interface_link_up_down(vnet_main, hi->hw_if_index, 0);
+
+ }
+
+ // Add interface to bridge-domain flood vector
+ member.sw_if_index = sw_if_index;
+ member.flags = bvi ? L2_FLOOD_MEMBER_BVI : L2_FLOOD_MEMBER_NORMAL;
+ member.shg = shg;
+ bd_add_member (bd_config, &member);
+
+ } else {
+ config->xconnect = 1;
+ config->bridge = 0;
+ config->output_sw_if_index = xc_sw_if_index;
+
+ // Make sure last-chance drop is configured
+ config->feature_bitmap |= L2INPUT_FEAT_DROP;
+
+ // Make sure bridging features are disabled
+ config->feature_bitmap &= ~(L2INPUT_FEAT_LEARN | L2INPUT_FEAT_FWD | L2INPUT_FEAT_FLOOD);
+
+ config->feature_bitmap |= L2INPUT_FEAT_XCONNECT;
+ shg = 0; // not used in xconnect
+ }
+
+ // set up split-horizon group
+ config->shg = shg;
+ out_config = l2output_intf_config (sw_if_index);
+ out_config->shg = shg;
+
+ // Test: remove this when non-IP features can be configured.
+ // Enable a non-IP feature to test IP feature masking
+ // config->feature_bitmap |= L2INPUT_FEAT_CTRL_PKT;
+
+ l2_if_adjust++;
+ }
+
+ // Adjust count of L2 interfaces
+ hi->l2_if_count += l2_if_adjust;
+
+ if (hi->hw_class_index == ethernet_hw_interface_class.index) {
+ if ((hi->l2_if_count == 1) && (l2_if_adjust == 1)) {
+ // Just added first L2 interface on this port
+
+ // Set promiscuous mode on the l2 interface
+ ethernet_set_flags (vnet_main, hi->hw_if_index,
+ ETHERNET_INTERFACE_FLAG_ACCEPT_ALL);
+
+ // Ensure all packets go to ethernet-input
+ ethernet_set_rx_redirect (vnet_main, hi, 1);
+
+ } else if ((hi->l2_if_count == 0) && (l2_if_adjust == -1)) {
+ // Just removed only L2 subinterface on this port
+
+ // Disable promiscuous mode on the l2 interface
+ ethernet_set_flags (vnet_main, hi->hw_if_index, 0);
+
+ // Allow ip packets to go directly to ip4-input etc
+ ethernet_set_rx_redirect (vnet_main, hi, 0);
+ }
+ }
+
+ // Set up the L2/L3 flag in the interface parsing tables
+ ethernet_sw_interface_set_l2_mode(vnm, sw_if_index, (mode!=MODE_L3));
+
+ return 0;
+}
+
+// set subinterface in bridging mode with a bridge-domain ID
+// The CLI format is:
+// set interface l2 bridge <interface> <bd> [bvi] [split-horizon-group]
+static clib_error_t *
+int_l2_bridge (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 bd_index, bd_id;
+ u32 sw_if_index;
+ u32 bvi;
+ u32 rc;
+ u32 shg;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (!unformat (input, "%d", &bd_id)) {
+ error = clib_error_return (0, "expected bridge domain ID `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ bd_index = bd_find_or_add_bd_index (&bd_main, bd_id);
+
+ // optional bvi
+ bvi = unformat (input, "bvi");
+
+ // optional split horizon group
+ shg = 0;
+ (void) unformat (input, "%d", &shg);
+
+ // set the interface mode
+ if ((rc = set_int_l2_mode(vm, vnm, MODE_L2_BRIDGE, sw_if_index, bd_index, bvi, shg, 0))) {
+ if (rc == MODE_ERROR_ETH) {
+ error = clib_error_return (0, "bridged interface must be ethernet",
+ format_unformat_error, input);
+ } else if (rc == MODE_ERROR_BVI_DEF) {
+ error = clib_error_return (0, "bridge-domain already has a bvi interface",
+ format_unformat_error, input);
+ } else {
+ error = clib_error_return (0, "invalid configuration for interface",
+ format_unformat_error, input);
+ }
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_l2_bridge_cli, static) = {
+ .path = "set interface l2 bridge",
+ .short_help = "set interface to L2 bridging mode in <bridge-domain ID> [bvi] [shg]",
+ .function = int_l2_bridge,
+};
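
Matching the parse order in int_l2_bridge() above (interface, bridge-domain ID, optional "bvi" keyword, optional split-horizon group), a typical invocation looks like the following; the interface name is hypothetical:

    set interface l2 bridge GigabitEthernet2/0/0 200 bvi 0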
+
+// set subinterface in xconnect mode with another interface
+// The CLI format is:
+// set interface l2 xconnect <interface> <peer interface>
+static clib_error_t *
+int_l2_xc (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 xc_sw_if_index;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &xc_sw_if_index))
+ {
+ error = clib_error_return (0, "unknown peer interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ // set the interface mode
+ if (set_int_l2_mode(vm, vnm, MODE_L2_XC, sw_if_index, 0, 0, 0, xc_sw_if_index)) {
+ error = clib_error_return (0, "invalid configuration for interface",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_l2_xc_cli, static) = {
+ .path = "set interface l2 xconnect",
+ .short_help = "set interface to L2 cross-connect mode with <peer interface>",
+ .function = int_l2_xc,
+};
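
For example, to cross-connect two ports (hypothetical names):

    set interface l2 xconnect GigabitEthernet2/0/0 GigabitEthernet2/0/1

Note that set_int_l2_mode() only binds the RX side of sw_if_index to the peer, so carrying traffic in both directions requires issuing the command a second time with the interfaces swapped.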
+
+// set subinterface in L3 mode
+// The CLI format is:
+// set interface l3 <interface>
+static clib_error_t *
+int_l3 (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ // set the interface mode
+ if (set_int_l2_mode(vm, vnm, MODE_L3, sw_if_index, 0, 0, 0, 0)) {
+ error = clib_error_return (0, "invalid configuration for interface",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_l3_cli, static) = {
+ .path = "set interface l3",
+ .short_help = "set interface to L3 mode",
+ .function = int_l3,
+};
+
+// The CLI format is:
+// show mode [<if-name1> <if-name2> ...]
+static clib_error_t *
+show_int_mode (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ char * mode;
+ u8 * args;
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * si, * sis = 0;
+ l2input_main_t * mp = &l2input_main;
+ l2_input_config_t * config;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ u32 sw_if_index;
+
+ /* See if user wants to show specific interface */
+ if (unformat (input, "%U", unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ si = pool_elt_at_index (im->sw_interfaces, sw_if_index);
+ vec_add1 (sis, si[0]);
+ }
+ else
+ {
+ error = clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ }
+
+ if (vec_len (sis) == 0) /* Get all interfaces */
+ {
+ /* Gather interfaces. */
+ sis = vec_new (vnet_sw_interface_t, pool_elts (im->sw_interfaces));
+ _vec_len (sis) = 0;
+ pool_foreach (si, im->sw_interfaces, ({ vec_add1 (sis, si[0]); }));
+ }
+
+ vec_foreach (si, sis)
+ {
+ vec_validate(mp->configs, si->sw_if_index);
+ config = vec_elt_at_index(mp->configs, si->sw_if_index);
+ if (config->bridge) {
+ u32 bd_id;
+ mode = "l2 bridge";
+ bd_id = l2input_main.bd_configs[config->bd_index].bd_id;
+
+ args = format (0, "bd_id %d%s%d", bd_id,
+ config->bvi ? " bvi shg " : " shg ", config->shg);
+ } else if (config->xconnect) {
+ mode = "l2 xconnect";
+ args = format (0, "%U",
+ format_vnet_sw_if_index_name,
+ vnm, config->output_sw_if_index);
+ } else {
+ mode = "l3";
+ args = format (0, " ");
+ }
+ vlib_cli_output (vm, "%s %U %v\n",
+ mode,
+ format_vnet_sw_if_index_name,
+ vnm, si->sw_if_index,
+ args);
+ vec_free (args);
+ }
+
+done:
+ vec_free (sis);
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (show_l2_mode, static) = {
+ .path = "show mode",
+ .short_help = "show mode [<if-name1> <if-name2> ...]",
+ .function = show_int_mode,
+};
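
Given the format strings in show_int_mode(), each interface prints as one line of the form "<mode> <interface> <details>". Sample output with hypothetical interface names:

    l2 bridge GigabitEthernet2/0/0 bd_id 200 shg 0
    l2 xconnect GigabitEthernet2/0/1 GigabitEthernet2/0/2
    l3 GigabitEthernet2/0/3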
+
+#define foreach_l2_init_function \
+_(feat_bitmap_drop_init) \
+_(l2fib_init) \
+_(l2_classify_init) \
+_(l2bd_init) \
+_(l2fwd_init) \
+_(l2_inacl_init) \
+_(l2input_init) \
+_(l2_vtr_init) \
+_(l2_invtr_init) \
+_(l2_efp_filter_init) \
+_(l2learn_init) \
+_(l2flood_init) \
+_(l2_outacl_init) \
+_(l2output_init) \
+_(l2_patch_init) \
+_(l2_xcrw_init)
+
+clib_error_t *l2_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+
+#define _(a) do { \
+ if ((error = vlib_call_init_function (vm, a))) return error; } \
+while (0);
+ foreach_l2_init_function;
+#undef _
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_init);
diff --git a/vnet/vnet/l2/l2_input.h b/vnet/vnet/l2/l2_input.h
new file mode 100644
index 00000000000..e650162b593
--- /dev/null
+++ b/vnet/vnet/l2/l2_input.h
@@ -0,0 +1,279 @@
+/*
+ * l2_input.h : layer 2 input packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_l2_input_h
+#define included_vnet_l2_input_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/l2/l2_bd.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip.h>
+
+// Per-subinterface L2 feature configuration
+
+typedef struct {
+
+ union {
+ u16 bd_index; // bridge domain index
+ u32 output_sw_if_index; // for xconnect
+ };
+
+ // Interface mode. If both are 0, this interface is in L3 mode
+ u8 xconnect;
+ u8 bridge;
+
+ // this is the bvi interface for the bridge-domain
+ u8 bvi;
+
+ // config for which input features are configured on this interface
+ u32 feature_bitmap;
+
+ // some of these flags are also in the feature bitmap
+ u8 learn_enable;
+ u8 fwd_enable;
+ u8 flood_enable;
+
+ // split horizon group
+ u8 shg;
+
+} l2_input_config_t;
+
+
+typedef struct {
+
+ // Next nodes for the feature bitmap
+ u32 feat_next_node_index[32];
+
+ /* config vector indexed by sw_if_index */
+ l2_input_config_t *configs;
+
+ /* bridge domain config vector indexed by bd_index */
+ l2_bridge_domain_t *bd_configs;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2input_main_t;
+
+extern l2input_main_t l2input_main;
+
+static_always_inline l2_bridge_domain_t *
+l2input_bd_config_from_index (l2input_main_t * l2im, u32 bd_index)
+{
+ l2_bridge_domain_t * bd_config;
+
+ bd_config = vec_elt_at_index (l2im->bd_configs, bd_index);
+ return bd_is_valid (bd_config) ? bd_config : NULL;
+}
+
+// L2 input features
+
+// Mappings from feature ID to graph node name
+#define foreach_l2input_feat \
+ _(DROP, "feature-bitmap-drop") \
+ _(CLASSIFY, "l2-classify") \
+ _(XCONNECT, "l2-output") \
+ _(IPIW, "feature-bitmap-drop") \
+ _(FLOOD, "l2-flood") \
+ _(ARP_TERM, "arp-term-l2bd") \
+ _(UU_FLOOD, "l2-flood") \
+ _(FWD, "l2-fwd") \
+ _(LEARN, "l2-learn") \
+ _(VTR, "l2-input-vtr") \
+ _(VPATH, "vpath-input-l2") \
+ _(CTRL_PKT, "feature-bitmap-drop") \
+ _(L2PT, "feature-bitmap-drop") \
+ _(IGMP_SNOOP, "feature-bitmap-drop") \
+ _(MLD_SNOOP, "feature-bitmap-drop") \
+ _(DHCP_SNOOP, "feature-bitmap-drop") \
+ _(DAI, "feature-bitmap-drop") \
+ _(IPSG, "feature-bitmap-drop") \
+ _(ACL, "l2-input-acl") \
+ _(QOS, "feature-bitmap-drop") \
+ _(CFM, "feature-bitmap-drop") \
+ _(SPAN, "feature-bitmap-drop")
+
+// Feature bitmap positions
+typedef enum {
+#define _(sym,str) L2INPUT_FEAT_##sym##_BIT,
+ foreach_l2input_feat
+#undef _
+ L2INPUT_N_FEAT,
+} l2input_feat_t;
+
+// Feature bit masks
+typedef enum {
+#define _(sym,str) L2INPUT_FEAT_##sym = (1<<L2INPUT_FEAT_##sym##_BIT),
+ foreach_l2input_feat
+#undef _
+} l2input_feat_masks_t;
+
+// Return an array of strings containing graph node names of each feature
+char **l2input_get_feat_names(void);
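
Every input feature node in this patch follows the same dispatch pattern: clear its own bit from the packet's feature bitmap, store the bitmap back, and derive the next node from the remaining set bits. A minimal sketch, assuming a buffer b0 and a per-node feat_next_node_index table already filled in by feat_bitmap_init_next_nodes() (the real instances are in l2_inacl_node_fn() and l2_invtr_node_fn() below):

    /* Sketch only; mirrors the pattern used by the node functions below. */
    u32 fb = vnet_buffer (b0)->l2.feature_bitmap & ~L2INPUT_FEAT_ACL;
    vnet_buffer (b0)->l2.feature_bitmap = fb;  /* save for later features */
    next0 = feat_bitmap_get_next_node_index (feat_next_node_index, fb);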
+
+
+static_always_inline u8 bd_feature_flood (l2_bridge_domain_t * bd_config)
+{
+ return ((bd_config->feature_bitmap & L2INPUT_FEAT_FLOOD) ==
+ L2INPUT_FEAT_FLOOD);
+}
+
+static_always_inline u8 bd_feature_uu_flood (l2_bridge_domain_t * bd_config)
+{
+ return ((bd_config->feature_bitmap & L2INPUT_FEAT_UU_FLOOD) ==
+ L2INPUT_FEAT_UU_FLOOD);
+}
+
+static_always_inline u8 bd_feature_forward (l2_bridge_domain_t * bd_config)
+{
+ return ((bd_config->feature_bitmap & L2INPUT_FEAT_FWD) ==
+ L2INPUT_FEAT_FWD);
+}
+
+static_always_inline u8 bd_feature_learn (l2_bridge_domain_t * bd_config)
+{
+ return ((bd_config->feature_bitmap & L2INPUT_FEAT_LEARN) ==
+ L2INPUT_FEAT_LEARN);
+}
+
+static_always_inline u8 bd_feature_arp_term (l2_bridge_domain_t * bd_config)
+{
+ return ((bd_config->feature_bitmap & L2INPUT_FEAT_ARP_TERM) ==
+ L2INPUT_FEAT_ARP_TERM);
+}
+
+// Masks for eliminating features that do not apply to a packet
+
+#define IP4_FEAT_MASK ~(L2INPUT_FEAT_CTRL_PKT | \
+ L2INPUT_FEAT_MLD_SNOOP | \
+ L2INPUT_FEAT_L2PT | \
+ L2INPUT_FEAT_CFM | \
+ L2INPUT_FEAT_DAI)
+
+#define IP6_FEAT_MASK ~(L2INPUT_FEAT_CTRL_PKT | \
+ L2INPUT_FEAT_IGMP_SNOOP | \
+ L2INPUT_FEAT_L2PT | \
+ L2INPUT_FEAT_CFM | \
+ L2INPUT_FEAT_DAI)
+
+#define IP_UDP_TCP_FEAT_MASK ~(L2INPUT_FEAT_CTRL_PKT | \
+ L2INPUT_FEAT_L2PT | \
+ L2INPUT_FEAT_IGMP_SNOOP | \
+ L2INPUT_FEAT_MLD_SNOOP | \
+ L2INPUT_FEAT_DHCP_SNOOP | \
+ L2INPUT_FEAT_CFM | \
+ L2INPUT_FEAT_DAI)
+
+#define MPLS_FEAT_MASK ~(L2INPUT_FEAT_CTRL_PKT | \
+ L2INPUT_FEAT_L2PT | \
+ L2INPUT_FEAT_IGMP_SNOOP | \
+ L2INPUT_FEAT_MLD_SNOOP | \
+ L2INPUT_FEAT_DHCP_SNOOP | \
+ L2INPUT_FEAT_CFM | \
+ L2INPUT_FEAT_DAI)
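
These masks are intended to be ANDed into a packet's feature bitmap once its L3 protocol is known, so that features inapplicable to that protocol are skipped. The masking itself is not part of this hunk; a hedged sketch of the intended use:

    /* Hypothetical use; the actual masking lives in the l2input node. */
    if (ethertype == ETHERNET_TYPE_IP4)
      feature_bitmap &= IP4_FEAT_MASK;
    else if (ethertype == ETHERNET_TYPE_IP6)
      feature_bitmap &= IP6_FEAT_MASK;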
+
+
+// Get a pointer to the config for the given interface
+l2_input_config_t * l2input_intf_config (u32 sw_if_index);
+
+// Enable (or disable) the feature in the bitmap for the given interface
+u32 l2input_intf_bitmap_enable (u32 sw_if_index,
+ u32 feature_bitmap,
+ u32 enable);
+
+
+#define MODE_L3 0
+#define MODE_L2_BRIDGE 1
+#define MODE_L2_XC 2
+#define MODE_L2_CLASSIFY 3
+
+#define MODE_ERROR_ETH 1
+#define MODE_ERROR_BVI_DEF 2
+
+u32 set_int_l2_mode (vlib_main_t * vm,
+ vnet_main_t * vnet_main,
+ u32 mode,
+ u32 sw_if_index,
+ u32 bd_index,
+ u32 bvi,
+ u32 shg,
+ u32 xc_sw_if_index);
+
+static inline void
+vnet_update_l2_len (vlib_buffer_t * b)
+{
+ ethernet_header_t * eth;
+ u16 ethertype;
+
+ /* point at current l2 hdr */
+ eth = vlib_buffer_get_current (b);
+
+ /*
+ * l2-output pays no attention to this
+ * but the tag push/pop code on an l2 subif needs it.
+ *
+ * Determine l2 header len, check for up to 2 vlans
+ */
+ vnet_buffer(b)->l2.l2_len = sizeof(ethernet_header_t);
+ ethertype = clib_net_to_host_u16(eth->type);
+ if ((ethertype == ETHERNET_TYPE_VLAN) ||
+ (ethertype == ETHERNET_TYPE_DOT1AD) ||
+ (ethertype == ETHERNET_TYPE_VLAN_9100) ||
+ (ethertype == ETHERNET_TYPE_VLAN_9200)) {
+ ethernet_vlan_header_t * vlan;
+ vnet_buffer(b)->l2.l2_len += sizeof (*vlan);
+ vlan = (void *) (eth+1);
+ ethertype = clib_net_to_host_u16 (vlan->type);
+ if (ethertype == ETHERNET_TYPE_VLAN) {
+ vnet_buffer(b)->l2.l2_len += sizeof (*vlan);
+ }
+ }
+}
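
Concretely, since sizeof(ethernet_header_t) is 14 bytes and each VLAN tag adds 4, the function leaves l2.l2_len at 14 for untagged frames, 18 for single-tagged frames, and 22 for double-tagged (QinQ) frames.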
+
+/*
+ * Compute the flow hash of an ethernet packet: use a 5-tuple hash if the
+ * L3 packet is ip4 or ip6, otherwise hash on smac/dmac/etype.
+ * The vlib buffer current pointer is expected to be at the ethernet header
+ * and vnet l2.l2_len is expected to be set up already.
+ */
+static inline u32 vnet_l2_compute_flow_hash (vlib_buffer_t *b)
+{
+ ethernet_header_t * eh = vlib_buffer_get_current(b);
+ u8 * l3h = (u8 *)eh + vnet_buffer(b)->l2.l2_len;
+ u16 ethertype = clib_net_to_host_u16(*(u16 *)(l3h - 2));
+
+ if (ethertype == ETHERNET_TYPE_IP4)
+ return ip4_compute_flow_hash((ip4_header_t *) l3h, IP_FLOW_HASH_DEFAULT);
+ else if (ethertype == ETHERNET_TYPE_IP6)
+ return ip6_compute_flow_hash((ip6_header_t *) l3h, IP_FLOW_HASH_DEFAULT);
+ else
+ {
+ u32 a, b, c;
+ u32 * ap = (u32 *) &eh->dst_address[2];
+ u32 * bp = (u32 *) &eh->src_address[2];
+ a = * ap;
+ b = * bp;
+ c = ethertype;
+ hash_v3_mix32 (a, b, c);
+ hash_v3_finalize32 (a, b, c);
+ return c;
+ }
+}
+
+#endif
+
diff --git a/vnet/vnet/l2/l2_input_acl.c b/vnet/vnet/l2/l2_input_acl.c
new file mode 100644
index 00000000000..77fa8944e9f
--- /dev/null
+++ b/vnet/vnet/l2/l2_input_acl.c
@@ -0,0 +1,427 @@
+/*
+ * l2_input_acl.c : layer 2 input acl processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+
+#include <vnet/classify/vnet_classify.h>
+#include <vnet/classify/input_acl.h>
+
+typedef struct {
+
+ // Next nodes for each feature
+ u32 feat_next_node_index[32];
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2_inacl_main_t;
+
+typedef struct {
+ u32 sw_if_index;
+ u32 next_index;
+ u32 table_index;
+ u32 offset;
+} l2_inacl_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2_inacl_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_inacl_trace_t * t = va_arg (*args, l2_inacl_trace_t *);
+
+ s = format (s, "INACL: sw_if_index %d, next_index %d, table %d, offset %d",
+ t->sw_if_index, t->next_index, t->table_index, t->offset);
+ return s;
+}
+
+l2_inacl_main_t l2_inacl_main;
+
+static vlib_node_registration_t l2_inacl_node;
+
+#define foreach_l2_inacl_error \
+_(NONE, "valid input ACL packets") \
+_(MISS, "input ACL misses") \
+_(HIT, "input ACL hits") \
+_(CHAIN_HIT, "input ACL hits after chain walk") \
+_(TABLE_MISS, "input ACL table-miss drops") \
+_(SESSION_DENY, "input ACL session deny drops")
+
+
+typedef enum {
+#define _(sym,str) L2_INACL_ERROR_##sym,
+ foreach_l2_inacl_error
+#undef _
+ L2_INACL_N_ERROR,
+} l2_inacl_error_t;
+
+static char * l2_inacl_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_inacl_error
+#undef _
+};
+
+static uword
+l2_inacl_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ acl_next_index_t next_index;
+ l2_inacl_main_t * msm = &l2_inacl_main;
+ input_acl_main_t * am = &input_acl_main;
+ vnet_classify_main_t * vcm = am->vnet_classify_main;
+ input_acl_table_id_t tid = INPUT_ACL_TABLE_L2;
+ f64 now = vlib_time_now (vm);
+ u32 hits = 0;
+ u32 misses = 0;
+ u32 chain_hits = 0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ /* First pass: compute hashes */
+ while (n_left_from > 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ u32 bi0, bi1;
+ u8 * h0, * h1;
+ u32 sw_if_index0, sw_if_index1;
+ u32 table_index0, table_index1;
+ vnet_classify_table_t * t0, * t1;
+
+ /* prefetch next iteration */
+ {
+ vlib_buffer_t * p1, * p2;
+
+ p1 = vlib_get_buffer (vm, from[1]);
+ p2 = vlib_get_buffer (vm, from[2]);
+
+ vlib_prefetch_buffer_header (p1, STORE);
+ CLIB_PREFETCH (p1->data, CLIB_CACHE_LINE_BYTES, STORE);
+ vlib_prefetch_buffer_header (p2, STORE);
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+
+ bi1 = from[1];
+ b1 = vlib_get_buffer (vm, bi1);
+ h1 = b1->data;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ table_index0 = am->classify_table_index_by_sw_if_index[tid][sw_if_index0];
+
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+ table_index1 = am->classify_table_index_by_sw_if_index[tid][sw_if_index1];
+
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ t1 = pool_elt_at_index (vcm->tables, table_index1);
+
+ vnet_buffer(b0)->l2_classify.hash =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+
+ vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash);
+
+ vnet_buffer(b1)->l2_classify.hash =
+ vnet_classify_hash_packet (t1, (u8 *) h1);
+
+ vnet_classify_prefetch_bucket (t1, vnet_buffer(b1)->l2_classify.hash);
+
+ vnet_buffer(b0)->l2_classify.table_index = table_index0;
+
+ vnet_buffer(b1)->l2_classify.table_index = table_index1;
+
+ from += 2;
+ n_left_from -= 2;
+ }
+
+ while (n_left_from > 0)
+ {
+ vlib_buffer_t * b0;
+ u32 bi0;
+ u8 * h0;
+ u32 sw_if_index0;
+ u32 table_index0;
+ vnet_classify_table_t * t0;
+
+ bi0 = from[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ table_index0 = am->classify_table_index_by_sw_if_index[tid][sw_if_index0];
+
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+ vnet_buffer(b0)->l2_classify.hash =
+ vnet_classify_hash_packet (t0, (u8 *) h0);
+
+ vnet_buffer(b0)->l2_classify.table_index = table_index0;
+ vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash);
+
+ from++;
+ n_left_from--;
+ }
+
+ next_index = node->cached_next_index;
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ /* Not enough load/store slots to dual loop... */
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = ACL_NEXT_INDEX_DENY;
+ u32 table_index0;
+ vnet_classify_table_t * t0;
+ vnet_classify_entry_t * e0;
+ u64 hash0;
+ u8 * h0;
+ u8 error0;
+
+ /* Stride 3 seems to work best */
+ if (PREDICT_TRUE (n_left_from > 3))
+ {
+ vlib_buffer_t * p1 = vlib_get_buffer(vm, from[3]);
+ vnet_classify_table_t * tp1;
+ u32 table_index1;
+ u64 phash1;
+
+ table_index1 = vnet_buffer(p1)->l2_classify.table_index;
+
+ if (PREDICT_TRUE (table_index1 != ~0))
+ {
+ tp1 = pool_elt_at_index (vcm->tables, table_index1);
+ phash1 = vnet_buffer(p1)->l2_classify.hash;
+ vnet_classify_prefetch_entry (tp1, phash1);
+ }
+ }
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = b0->data;
+ table_index0 = vnet_buffer(b0)->l2_classify.table_index;
+ e0 = 0;
+ t0 = 0;
+
+ /* Feature bitmap update */
+ vnet_buffer(b0)->l2.feature_bitmap &= ~L2INPUT_FEAT_ACL;
+
+ /* Determine the next node */
+ next0 = feat_bitmap_get_next_node_index(msm->feat_next_node_index,
+ vnet_buffer(b0)->l2.feature_bitmap);
+
+ if (PREDICT_TRUE(table_index0 != ~0))
+ {
+ hash0 = vnet_buffer(b0)->l2_classify.hash;
+ t0 = pool_elt_at_index (vcm->tables, table_index0);
+
+ e0 = vnet_classify_find_entry (t0, (u8 *) h0, hash0,
+ now);
+ if (e0)
+ {
+ vlib_buffer_advance (b0, e0->advance);
+
+ next0 = (e0->next_index < ACL_NEXT_INDEX_N_NEXT)?
+ e0->next_index:next0;
+
+ hits++;
+
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ L2_INACL_ERROR_SESSION_DENY:L2_INACL_ERROR_NONE;
+ b0->error = node->errors[error0];
+ }
+ else
+ {
+ while (1)
+ {
+ if (PREDICT_TRUE(t0->next_table_index != ~0))
+ t0 = pool_elt_at_index (vcm->tables,
+ t0->next_table_index);
+ else
+ {
+ next0 = (t0->miss_next_index < ACL_NEXT_INDEX_N_NEXT)?
+ t0->miss_next_index:next0;
+
+ misses++;
+
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ L2_INACL_ERROR_TABLE_MISS:L2_INACL_ERROR_NONE;
+ b0->error = node->errors[error0];
+ break;
+ }
+
+ hash0 = vnet_classify_hash_packet (t0, (u8 *) h0);
+ e0 = vnet_classify_find_entry
+ (t0, (u8 *) h0, hash0, now);
+ if (e0)
+ {
+ vlib_buffer_advance (b0, e0->advance);
+ next0 = (e0->next_index < ACL_NEXT_INDEX_N_NEXT)?
+ e0->next_index:next0;
+ hits++;
+ chain_hits++;
+
+ error0 = (next0 == ACL_NEXT_INDEX_DENY)?
+ L2_INACL_ERROR_SESSION_DENY:L2_INACL_ERROR_NONE;
+ b0->error = node->errors[error0];
+ break;
+ }
+ }
+ }
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ l2_inacl_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ t->next_index = next0;
+ t->table_index = t0 ? t0 - vcm->tables : ~0;
+ t->offset = e0 ? vnet_classify_get_offset (t0, e0): ~0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, node->node_index,
+ L2_INACL_ERROR_MISS,
+ misses);
+ vlib_node_increment_counter (vm, node->node_index,
+ L2_INACL_ERROR_HIT,
+ hits);
+ vlib_node_increment_counter (vm, node->node_index,
+ L2_INACL_ERROR_CHAIN_HIT,
+ chain_hits);
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (l2_inacl_node,static) = {
+ .function = l2_inacl_node_fn,
+ .name = "l2-input-acl",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_inacl_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_inacl_error_strings),
+ .error_strings = l2_inacl_error_strings,
+
+ .n_next_nodes = ACL_NEXT_INDEX_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [ACL_NEXT_INDEX_DENY] = "error-drop",
+ },
+};
+
+clib_error_t *l2_inacl_init (vlib_main_t *vm)
+{
+ l2_inacl_main_t * mp = &l2_inacl_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2_inacl_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ mp->feat_next_node_index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_inacl_init);
+
+
+// set subinterface inacl enable/disable
+// The CLI format is:
+// set interface acl input <interface> [disable]
+static clib_error_t *
+int_l2_inacl (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 enable;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the interface flag
+ l2input_intf_bitmap_enable(sw_if_index, L2INPUT_FEAT_ACL, enable);
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_l2_inacl_cli, static) = {
+ .path = "set interface acl input",
+ .short_help = "set interface acl input <interface> [disable]",
+ .function = int_l2_inacl,
+};
diff --git a/vnet/vnet/l2/l2_input_vtr.c b/vnet/vnet/l2/l2_input_vtr.c
new file mode 100644
index 00000000000..d07a0287d04
--- /dev/null
+++ b/vnet/vnet/l2/l2_input_vtr.c
@@ -0,0 +1,314 @@
+/*
+ * l2_input_vtr.c : layer 2 input vlan tag rewrite processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_vtr.h>
+#include <vnet/l2/l2_input_vtr.h>
+#include <vnet/l2/l2_output.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/cache.h>
+
+
+typedef struct {
+ /* per-pkt trace data */
+ u8 src[6];
+ u8 dst[6];
+ u8 raw[12]; // raw data (vlans)
+ u32 sw_if_index;
+} l2_invtr_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2_invtr_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_invtr_trace_t * t = va_arg (*args, l2_invtr_trace_t *);
+
+ s = format (s, "l2-input-vtr: sw_if_index %d dst %U src %U data "
+ "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src,
+ t->raw[0], t->raw[1], t->raw[2], t->raw[3], t->raw[4], t->raw[5],
+ t->raw[6], t->raw[7], t->raw[8], t->raw[9], t->raw[10], t->raw[11]);
+ return s;
+}
+
+l2_invtr_main_t l2_invtr_main;
+
+static vlib_node_registration_t l2_invtr_node;
+
+#define foreach_l2_invtr_error \
+_(L2_INVTR, "L2 input VTR packets") \
+_(DROP, "L2 input tag rewrite drops")
+
+typedef enum {
+#define _(sym,str) L2_INVTR_ERROR_##sym,
+ foreach_l2_invtr_error
+#undef _
+ L2_INVTR_N_ERROR,
+} l2_invtr_error_t;
+
+static char * l2_invtr_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_invtr_error
+#undef _
+};
+
+typedef enum {
+ L2_INVTR_NEXT_DROP,
+ L2_INVTR_N_NEXT,
+} l2_invtr_next_t;
+
+
+static uword
+l2_invtr_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2_invtr_next_t next_index;
+ l2_invtr_main_t * msm = &l2_invtr_main;
+ // vlib_node_t *n = vlib_get_node (vm, l2_invtr_node.index);
+ // u32 node_counter_base_index = n->error_heap_index;
+ // vlib_error_main_t * em = &vm->error_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 6 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ u32 feature_bitmap0, feature_bitmap1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3, * p4, * p5;
+ u32 sw_if_index2, sw_if_index3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+ p4 = vlib_get_buffer (vm, from[4]);
+ p5 = vlib_get_buffer (vm, from[5]);
+
+ // Prefetch the buffer header and packet for the N+2 loop iteration
+ vlib_prefetch_buffer_header (p4, LOAD);
+ vlib_prefetch_buffer_header (p5, LOAD);
+
+ CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE);
+
+ // Prefetch the input config for the N+1 loop iteration
+ // This depends on the buffer header above
+ sw_if_index2 = vnet_buffer(p2)->sw_if_index[VLIB_RX];
+ sw_if_index3 = vnet_buffer(p3)->sw_if_index[VLIB_RX];
+ CLIB_PREFETCH (vec_elt_at_index(l2output_main.configs, sw_if_index2), CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (vec_elt_at_index(l2output_main.configs, sw_if_index3), CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* RX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ // process 2 packets
+ // em->counters[node_counter_base_index + L2_INVTR_ERROR_L2_INVTR] += 2;
+
+ // Remove ourself from the feature bitmap
+ feature_bitmap0 = vnet_buffer(b0)->l2.feature_bitmap & ~L2INPUT_FEAT_VTR;
+ feature_bitmap1 = vnet_buffer(b1)->l2.feature_bitmap & ~L2INPUT_FEAT_VTR;
+
+ // save for next feature graph nodes
+ vnet_buffer(b0)->l2.feature_bitmap = feature_bitmap0;
+ vnet_buffer(b1)->l2.feature_bitmap = feature_bitmap1;
+
+ // Determine the next node
+ next0 = feat_bitmap_get_next_node_index(msm->feat_next_node_index,
+ feature_bitmap0);
+ next1 = feat_bitmap_get_next_node_index(msm->feat_next_node_index,
+ feature_bitmap1);
+
+ // perform the tag rewrite on two packets
+ if (l2_vtr_process(b0, &(vec_elt_at_index(l2output_main.configs, sw_if_index0)->input_vtr))) {
+ // Drop packet
+ next0 = L2_INVTR_NEXT_DROP;
+ b0->error = node->errors[L2_INVTR_ERROR_DROP];
+ }
+ if (l2_vtr_process(b1, &(vec_elt_at_index(l2output_main.configs, sw_if_index1)->input_vtr))) {
+ // Drop packet
+ next1 = L2_INVTR_NEXT_DROP;
+ b1->error = node->errors[L2_INVTR_ERROR_DROP];
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED) {
+ l2_invtr_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ ethernet_header_t * h0 = vlib_buffer_get_current (b0);
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ memcpy(t->raw, &h0->type, sizeof(t->raw));
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED) {
+ l2_invtr_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ ethernet_header_t * h1 = vlib_buffer_get_current (b1);
+ t->sw_if_index = sw_if_index1;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ memcpy(t->raw, &h1->type, sizeof(t->raw));
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ u32 feature_bitmap0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ // process 1 packet
+ // em->counters[node_counter_base_index + L2_INVTR_ERROR_L2_INVTR] += 1;
+
+ // Remove ourself from the feature bitmap
+ feature_bitmap0 = vnet_buffer(b0)->l2.feature_bitmap & ~L2INPUT_FEAT_VTR;
+
+ // save for next feature graph nodes
+ vnet_buffer(b0)->l2.feature_bitmap = feature_bitmap0;
+
+ // Determine the next node
+ next0 = feat_bitmap_get_next_node_index(msm->feat_next_node_index,
+ feature_bitmap0);
+
+ // perform the tag rewrite on one packet
+ if (l2_vtr_process(b0, &(vec_elt_at_index(l2output_main.configs, sw_if_index0)->input_vtr))) {
+ // Drop packet
+ next0 = L2_INVTR_NEXT_DROP;
+ b0->error = node->errors[L2_INVTR_ERROR_DROP];
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ l2_invtr_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ ethernet_header_t * h0 = vlib_buffer_get_current (b0);
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ memcpy(t->raw, &h0->type, sizeof(t->raw));
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2_invtr_node,static) = {
+ .function = l2_invtr_node_fn,
+ .name = "l2-input-vtr",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_invtr_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_invtr_error_strings),
+ .error_strings = l2_invtr_error_strings,
+
+ .n_next_nodes = L2_INVTR_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2_INVTR_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2_invtr_init (vlib_main_t *vm)
+{
+ l2_invtr_main_t * mp = &l2_invtr_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2_invtr_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ mp->feat_next_node_index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_invtr_init);
+
diff --git a/vnet/vnet/l2/l2_input_vtr.h b/vnet/vnet/l2/l2_input_vtr.h
new file mode 100644
index 00000000000..57c8e409dea
--- /dev/null
+++ b/vnet/vnet/l2/l2_input_vtr.h
@@ -0,0 +1,43 @@
+/*
+ * l2_input_vtr.h : layer 2 input vlan tag rewrite processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_l2_input_vtr_h
+#define included_vnet_l2_input_vtr_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_vtr.h>
+
+
+typedef struct {
+
+ // The input vtr data is located in l2_output_config_t because
+ // the same config data is used for the egress EFP Filter check.
+
+ // Next nodes for each feature
+ u32 feat_next_node_index[32];
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2_invtr_main_t;
+
+extern l2_invtr_main_t l2_invtr_main;
+
+#endif // included_vnet_l2_input_vtr_h
+
diff --git a/vnet/vnet/l2/l2_learn.c b/vnet/vnet/l2/l2_learn.c
new file mode 100644
index 00000000000..29315bedc98
--- /dev/null
+++ b/vnet/vnet/l2/l2_learn.c
@@ -0,0 +1,504 @@
+/*
+ * l2_learn.c : layer 2 learning using l2fib
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vlib/cli.h>
+
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_fib.h>
+#include <vnet/l2/l2_learn.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+
+/*
+ * Ethernet bridge learning
+ *
+ * Populate the mac table with entries mapping the packet's source mac + bridge
+ * domain ID to the input sw_if_index.
+ *
+ * Note that learning and forwarding are separate graph nodes. This means that
+ * for a set of packets, all learning is performed first, then all packets are
+ * forwarded. The forwarding is done based on the end-state of the mac table,
+ * instead of the state after each packet. Thus the forwarding results could
+ * differ in certain cases (mac move tests), but this is not expected to cause
+ * problems in real-world networks. It is much simpler to separate learning
+ * and forwarding into separate nodes.
+ */
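
The l2fib key and result types used below come from vnet/l2/l2_fib.h, which is not part of this hunk. A sketch of the layout the code relies on, inferred from how key0->raw, the result0->raw == ~0 sentinel, and the result0->fields members are used; the names and exact bit placement here are illustrative, not authoritative:

    /* Illustrative only; see vnet/l2/l2_fib.h for the real definitions. */
    typedef union {
      struct { u8 mac[6]; u16 bd_index; } fields; /* MAC + bridge domain */
      u64 raw;                                    /* 64-bit bihash key */
    } sketch_l2fib_key_t;

    typedef union {
      struct {
        u32 sw_if_index;    /* interface the MAC was learned on */
        u8 static_mac : 1;  /* never overwritten by learning */
        u8 filter : 1;      /* matching packets are dropped */
        u8 bvi : 1;         /* entry points at a BVI interface */
      } fields;
      u64 raw;              /* ~0 serves as the "no entry" sentinel */
    } sketch_l2fib_result_t;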
+
+
+typedef struct {
+ u8 src[6];
+ u8 dst[6];
+ u32 sw_if_index;
+ u16 bd_index;
+} l2learn_trace_t;
+
+
+/* packet trace format function */
+static u8 * format_l2learn_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2learn_trace_t * t = va_arg (*args, l2learn_trace_t *);
+
+ s = format (s, "l2-learn: sw_if_index %d dst %U src %U bd_index %d",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src,
+ t->bd_index);
+ return s;
+}
+
+static vlib_node_registration_t l2learn_node;
+
+#define foreach_l2learn_error \
+_(L2LEARN, "L2 learn packets") \
+_(MISS, "L2 learn misses") \
+_(MAC_MOVE, "L2 mac moves") \
+_(MAC_MOVE_VIOLATE, "L2 mac move violations") \
+_(LIMIT, "L2 not learned due to limit") \
+_(HIT, "L2 learn hits") \
+_(FILTER_DROP, "L2 filter mac drops")
+
+typedef enum {
+#define _(sym,str) L2LEARN_ERROR_##sym,
+ foreach_l2learn_error
+#undef _
+ L2LEARN_N_ERROR,
+} l2learn_error_t;
+
+static char * l2learn_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2learn_error
+#undef _
+};
+
+typedef enum {
+ L2LEARN_NEXT_L2FWD,
+ L2LEARN_NEXT_DROP,
+ L2LEARN_N_NEXT,
+} l2learn_next_t;
+
+
+// Perform learning on one packet based on the mac table lookup result
+
+static_always_inline void
+l2learn_process (vlib_node_runtime_t * node,
+ l2learn_main_t * msm,
+ u64 * counter_base,
+ vlib_buffer_t * b0,
+ u32 sw_if_index0,
+ l2fib_entry_key_t * key0,
+ l2fib_entry_key_t * cached_key,
+ u32 * bucket0,
+ l2fib_entry_result_t * result0,
+ u32 * next0)
+{
+ u32 feature_bitmap;
+
+ // Set up the default next node (typically L2FWD)
+
+ // Remove ourself from the feature bitmap
+ feature_bitmap = vnet_buffer(b0)->l2.feature_bitmap & ~L2INPUT_FEAT_LEARN;
+
+ // Save for next feature graph nodes
+ vnet_buffer(b0)->l2.feature_bitmap = feature_bitmap;
+
+ // Determine the next node
+ *next0 = feat_bitmap_get_next_node_index(msm->feat_next_node_index,
+ feature_bitmap);
+
+ // Check mac table lookup result
+
+ if (PREDICT_TRUE (result0->fields.sw_if_index == sw_if_index0)) {
+ // The entry was in the table and the sw_if_index matched: the normal case
+
+ // TODO: for dataplane learning and aging, do this:
+ // if refresh=0 and not a static mac, set refresh=1
+ counter_base[L2LEARN_ERROR_HIT] += 1;
+
+ } else if (result0->raw == ~0) {
+
+ // The entry was not in table, so add it
+
+ counter_base[L2LEARN_ERROR_MISS] += 1;
+
+ if (msm->global_learn_count == msm->global_learn_limit) {
+ // Global limit reached. Do not learn the mac but forward the packet.
+ // In the future, limits could also be per-interface or bridge-domain.
+ counter_base[L2LEARN_ERROR_LIMIT] += 1;
+ goto done;
+
+ } else {
+ BVT(clib_bihash_kv) kv;
+ // It is ok to learn
+
+ result0->raw = 0; // clear all fields
+ result0->fields.sw_if_index = sw_if_index0;
+ // TODO: set timestamp in entry to clock for dataplane aging
+ kv.key = key0->raw;
+ kv.value = result0->raw;
+
+ BV(clib_bihash_add_del) (msm->mac_table, &kv, 1 /* is_add */);
+
+ cached_key->raw = ~0; // invalidate the cache
+ msm->global_learn_count++;
+ }
+
+ } else {
+
+ // The entry was in the table, but with the wrong sw_if_index mapping (mac move)
+ counter_base[L2LEARN_ERROR_MAC_MOVE] += 1;
+
+ if (result0->fields.static_mac) {
+ // Don't overwrite a static mac
+ // TODO: Check violation policy. For now drop the packet
+ b0->error = node->errors[L2LEARN_ERROR_MAC_MOVE_VIOLATE];
+ *next0 = L2LEARN_NEXT_DROP;
+ } else {
+ // Update the entry
+ // TODO: may want to rate limit mac moves
+ // TODO: check global/bridge domain/interface learn limits
+ BVT(clib_bihash_kv) kv;
+
+ result0->raw = 0; // clear all fields
+ result0->fields.sw_if_index = sw_if_index0;
+
+ kv.key = key0->raw;
+ kv.value = result0->raw;
+
+ cached_key->raw = ~0; // invalidate the cache
+
+ BV(clib_bihash_add_del) (msm->mac_table, &kv, 1 /* is_add */);
+ }
+ }
+
+ if (result0->fields.filter) {
+ // drop packet because lookup matched a filter mac entry
+
+ if (*next0 != L2LEARN_NEXT_DROP) {
+ // if we're not already dropping the packet, do it now
+ b0->error = node->errors[L2LEARN_ERROR_FILTER_DROP];
+ *next0 = L2LEARN_NEXT_DROP;
+ }
+ }
+
+done:
+ return;
+}
+
+
+static uword
+l2learn_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2learn_next_t next_index;
+ l2learn_main_t * msm = &l2learn_main;
+ vlib_node_t *n = vlib_get_node (vm, l2learn_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ l2fib_entry_key_t cached_key;
+ l2fib_entry_result_t cached_result;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ // Clear the one-entry cache in case mac table was updated
+ cached_key.raw = ~0;
+ cached_result.raw = ~0; /* warning be gone */
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ ethernet_header_t * h0, * h1;
+ l2fib_entry_key_t key0, key1;
+ l2fib_entry_result_t result0, result1;
+ u32 bucket0, bucket1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* RX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* Process 2 x pkts */
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2learn_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->bd_index = vnet_buffer(b0)->l2.bd_index;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2learn_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->bd_index = vnet_buffer(b1)->l2.bd_index;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ }
+ }
+
+ /* process 2 pkts */
+ em->counters[node_counter_base_index + L2LEARN_ERROR_L2LEARN] += 2;
+
+ l2fib_lookup_2 (msm->mac_table, &cached_key, &cached_result,
+ h0->src_address,
+ h1->src_address,
+ vnet_buffer(b0)->l2.bd_index,
+ vnet_buffer(b1)->l2.bd_index,
+ &key0,
+ &key1,
+ &bucket0,
+ &bucket1,
+ &result0,
+ &result1);
+
+ l2learn_process (node, msm, &em->counters[node_counter_base_index],
+ b0, sw_if_index0, &key0, &cached_key,
+ &bucket0, &result0, &next0);
+
+ l2learn_process (node, msm, &em->counters[node_counter_base_index],
+ b1, sw_if_index1, &key1, &cached_key,
+ &bucket1, &result1, &next1);
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ ethernet_header_t * h0;
+ l2fib_entry_key_t key0;
+ l2fib_entry_result_t result0;
+ u32 bucket0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ h0 = vlib_buffer_get_current (b0);
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ l2learn_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->bd_index = vnet_buffer(b0)->l2.bd_index;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+
+ /* process 1 pkt */
+ em->counters[node_counter_base_index + L2LEARN_ERROR_L2LEARN] += 1;
+
+ l2fib_lookup_1 (msm->mac_table, &cached_key, &cached_result,
+ h0->src_address, vnet_buffer(b0)->l2.bd_index,
+ &key0,
+ &bucket0,
+ &result0);
+
+ l2learn_process (node, msm, &em->counters[node_counter_base_index],
+ b0, sw_if_index0, &key0, &cached_key,
+ &bucket0, &result0, &next0);
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2learn_node,static) = {
+ .function = l2learn_node_fn,
+ .name = "l2-learn",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2learn_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2learn_error_strings),
+ .error_strings = l2learn_error_strings,
+
+ .n_next_nodes = L2LEARN_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2LEARN_NEXT_DROP] = "error-drop",
+ [L2LEARN_NEXT_L2FWD] = "l2-fwd",
+ },
+};
+
+
+clib_error_t *l2learn_init (vlib_main_t *vm)
+{
+ l2learn_main_t * mp = &l2learn_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2learn_node.index,
+ L2INPUT_N_FEAT,
+ l2input_get_feat_names(),
+ mp->feat_next_node_index);
+
+ /* init the hash table ptr */
+ mp->mac_table = get_mac_table();
+
+ // Set the default limit on dynamically learned macs to 16 entries
+ // per mac table bucket.
+ mp->global_learn_limit = L2FIB_NUM_BUCKETS * 16;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2learn_init);
+
+
+// set subinterface learn enable/disable
+// The CLI format is:
+// set interface l2 learn <interface> [disable]
+static clib_error_t *
+int_learn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 enable;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the interface flag
+ l2input_intf_bitmap_enable(sw_if_index, L2INPUT_FEAT_LEARN, enable);
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_learn_cli, static) = {
+ .path = "set interface l2 learn",
+ .short_help = "set interface l2 learn <interface> [disable]",
+ .function = int_learn,
+};
+
+
+static clib_error_t *
+l2learn_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ l2learn_main_t *mp = &l2learn_main;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "limit %d", &mp->global_learn_limit))
+ ;
+
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (l2learn_config, "l2learn");
+
diff --git a/vnet/vnet/l2/l2_learn.h b/vnet/vnet/l2/l2_learn.h
new file mode 100644
index 00000000000..25674858fc9
--- /dev/null
+++ b/vnet/vnet/l2/l2_learn.h
@@ -0,0 +1,47 @@
+/*
+ * l2_learn.h : layer 2 learning using l2fib
+ *
+ * Copyright (c) 2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_l2learn_h
+#define included_l2learn_h
+
+#include <vlib/vlib.h>
+#include <vnet/ethernet/ethernet.h>
+
+
+typedef struct {
+
+ // Hash table
+ BVT(clib_bihash) *mac_table;
+
+ // number of dynamically learned mac entries
+ u32 global_learn_count;
+
+ // maximum number of dynamically learned mac entries
+ u32 global_learn_limit;
+
+ // Next nodes for each feature
+ u32 feat_next_node_index[32];
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2learn_main_t;
+
+
+l2learn_main_t l2learn_main;
+
+#endif
diff --git a/vnet/vnet/l2/l2_output.c b/vnet/vnet/l2/l2_output.c
new file mode 100644
index 00000000000..72c3d0374e3
--- /dev/null
+++ b/vnet/vnet/l2/l2_output.c
@@ -0,0 +1,541 @@
+/*
+ * l2_output.c : layer 2 output packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vlib/cli.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_output.h>
+
+
+// Feature graph node names
+static char * l2output_feat_names[] = {
+#define _(sym,name) name,
+ foreach_l2output_feat
+#undef _
+};
+
+char **l2output_get_feat_names(void) {
+ return l2output_feat_names;
+}
+
+l2output_main_t l2output_main;
+
+typedef struct {
+ /* per-pkt trace data */
+ u8 src[6];
+ u8 dst[6];
+ u32 sw_if_index;
+} l2output_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2output_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2output_trace_t * t = va_arg (*args, l2output_trace_t *);
+
+ s = format (s, "l2-output: sw_if_index %d dst %U src %U",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src);
+ return s;
+}
+
+
+#define foreach_l2output_error \
+_(L2OUTPUT, "L2 output packets") \
+_(EFP_DROP, "L2 EFP filter pre-rewrite drops") \
+_(VTR_DROP, "L2 output tag rewrite drops") \
+_(SHG_DROP, "L2 split horizon drops") \
+_(DROP, "L2 output drops")
+
+typedef enum {
+#define _(sym,str) L2OUTPUT_ERROR_##sym,
+ foreach_l2output_error
+#undef _
+ L2OUTPUT_N_ERROR,
+} l2output_error_t;
+
+static char * l2output_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2output_error
+#undef _
+};
+
+typedef enum {
+ L2OUTPUT_NEXT_DROP,
+ L2OUTPUT_N_NEXT,
+} l2output_next_t;
+
+// Return 0 if split horizon check passes, otherwise return non-zero
+// Packets should not be transmitted out an interface with the same
+// split-horizon group as the input interface, except if the shg is 0
+// in which case the check always passes.
+static_always_inline u32
+split_horizon_violation (u8 shg1, u8 shg2)
+{
+ if (PREDICT_TRUE (shg1 == 0)) {
+ return 0;
+ } else {
+ return shg1 == shg2;
+ }
+}
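
For example, with shg1 == shg2 == 5 the function returns 1 and the packet is dropped, while shg1 == 0, or shg1 == 5 with shg2 == 3, returns 0 and the packet is forwarded. This is what keeps traffic from looping back among interfaces that share a non-zero split-horizon group.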
+
+
+static uword
+l2output_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2output_next_t next_index;
+ l2output_main_t * msm = &l2output_main;
+ vlib_node_t *n = vlib_get_node (vm, l2output_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ u32 cached_sw_if_index;
+ u32 cached_next_index;
+
+ /* Invalidate cache */
+ cached_sw_if_index = ~0;
+ cached_next_index = ~0; /* warning be gone */
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 6 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ ethernet_header_t * h0, * h1;
+ l2_output_config_t * config0, * config1;
+ u32 feature_bitmap0, feature_bitmap1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3, * p4 , * p5;
+ u32 sw_if_index2, sw_if_index3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+ p4 = vlib_get_buffer (vm, from[4]);
+ p5 = vlib_get_buffer (vm, from[5]);
+
+ // Prefetch the buffer header for the N+2 loop iteration
+ vlib_prefetch_buffer_header (p4, LOAD);
+ vlib_prefetch_buffer_header (p5, LOAD);
+ // Note: no need to prefetch packet data. This node doesn't reference it.
+
+ // Prefetch the input config for the N+1 loop iteration
+ // This depends on the buffer header above
+ sw_if_index2 = vnet_buffer(p2)->sw_if_index[VLIB_TX];
+ sw_if_index3 = vnet_buffer(p3)->sw_if_index[VLIB_TX];
+ CLIB_PREFETCH (&msm->configs[sw_if_index2], CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (&msm->configs[sw_if_index3], CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* TX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_TX];
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2output_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2output_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ }
+ }
+
+ em->counters[node_counter_base_index + L2OUTPUT_ERROR_L2OUTPUT] += 2;
+
+ // Get config for the output interface
+ config0 = vec_elt_at_index(msm->configs, sw_if_index0);
+ config1 = vec_elt_at_index(msm->configs, sw_if_index1);
+
+ // Get features from the config
+ // TODO: mask out any non-applicable features
+ feature_bitmap0 = config0->feature_bitmap;
+ feature_bitmap1 = config1->feature_bitmap;
+
+ // Determine next node
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2output_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b0,
+ sw_if_index0,
+ feature_bitmap0,
+ &next0);
+
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2output_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b1,
+ sw_if_index1,
+ feature_bitmap1,
+ &next1);
+
+ // Perform output vlan tag rewrite and the pre-vtr EFP filter check.
+ // The EFP Filter only needs to be run if there is an output VTR
+ // configured. The flag for the post-vtr EFP Filter node is used
+ // to trigger the pre-vtr check as well.
+
+ if (PREDICT_FALSE (config0->output_vtr.push_and_pop_bytes)) {
+ // Perform pre-vtr EFP filter check if configured
+ u32 failed1 = (feature_bitmap0 & L2OUTPUT_FEAT_EFP_FILTER) &&
+ (l2_efp_filter_process(b0, &(config0->input_vtr)));
+ u32 failed2 = l2_vtr_process(b0, &(config0->output_vtr));
+
+ if (PREDICT_FALSE (failed1 | failed2)) {
+ next0 = L2OUTPUT_NEXT_DROP;
+ if (failed2) {
+ b0->error = node->errors[L2OUTPUT_ERROR_VTR_DROP];
+ }
+ if (failed1) {
+ b0->error = node->errors[L2OUTPUT_ERROR_EFP_DROP];
+ }
+ }
+ }
+
+ if (PREDICT_FALSE (config1->output_vtr.push_and_pop_bytes)) {
+ // Perform pre-vtr EFP filter check if configured
+ u32 failed1 = (feature_bitmap1 & L2OUTPUT_FEAT_EFP_FILTER) &&
+ (l2_efp_filter_process(b1, &(config1->input_vtr)));
+ u32 failed2 = l2_vtr_process(b1, &(config1->output_vtr));
+
+ if (PREDICT_FALSE (failed1 | failed2)) {
+ next1 = L2OUTPUT_NEXT_DROP;
+ if (failed2) {
+ b1->error = node->errors[L2OUTPUT_ERROR_VTR_DROP];
+ }
+ if (failed1) {
+ b1->error = node->errors[L2OUTPUT_ERROR_EFP_DROP];
+ }
+ }
+ }
+
+ // Perform the split horizon check
+ // The check can only fail for non-zero shg's
+ if (PREDICT_FALSE (config0->shg + config1->shg)) {
+ // one of the checks might fail, check both
+ if (split_horizon_violation (config0->shg, vnet_buffer(b0)->l2.shg)) {
+ next0 = L2OUTPUT_NEXT_DROP;
+ b0->error = node->errors[L2OUTPUT_ERROR_SHG_DROP];
+ }
+ if (split_horizon_violation (config1->shg, vnet_buffer(b1)->l2.shg)) {
+ next1 = L2OUTPUT_NEXT_DROP;
+ b1->error = node->errors[L2OUTPUT_ERROR_SHG_DROP];
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ ethernet_header_t * h0;
+ l2_output_config_t *config0;
+ u32 feature_bitmap0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ l2output_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ h0 = vlib_buffer_get_current (b0);
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+
+ em->counters[node_counter_base_index + L2OUTPUT_ERROR_L2OUTPUT] += 1;
+
+ // Get config for the output interface
+ config0 = vec_elt_at_index(msm->configs, sw_if_index0);
+
+ // Get features from the config
+ // TODO: mask out any non-applicable features
+ feature_bitmap0 = config0->feature_bitmap;
+
+ // Determine next node
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2output_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b0,
+ sw_if_index0,
+ feature_bitmap0,
+ &next0);
+
+ // Perform output vlan tag rewrite and the pre-vtr EFP filter check.
+ // The EFP Filter only needs to be run if there is an output VTR
+ // configured. The flag for the post-vtr EFP Filter node is used
+ // to trigger the pre-vtr check as well.
+
+ if (config0->output_vtr.push_and_pop_bytes) {
+ // Perform pre-vtr EFP filter check if configured
+ u32 failed1 = (feature_bitmap0 & L2OUTPUT_FEAT_EFP_FILTER) &&
+ (l2_efp_filter_process(b0, &(config0->input_vtr)));
+ u32 failed2 = l2_vtr_process(b0, &(config0->output_vtr));
+
+ if (PREDICT_FALSE (failed1 | failed2)) {
+ next0 = L2OUTPUT_NEXT_DROP;
+ if (failed2) {
+ b0->error = node->errors[L2OUTPUT_ERROR_VTR_DROP];
+ }
+ if (failed1) {
+ b0->error = node->errors[L2OUTPUT_ERROR_EFP_DROP];
+ }
+ }
+ }
+
+ // Perform the split horizon check
+ if (PREDICT_FALSE (split_horizon_violation (config0->shg, vnet_buffer(b0)->l2.shg))) {
+ next0 = L2OUTPUT_NEXT_DROP;
+ b0->error = node->errors[L2OUTPUT_ERROR_SHG_DROP];
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2output_node) = {
+ .function = l2output_node_fn,
+ .name = "l2-output",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2output_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2output_error_strings),
+ .error_strings = l2output_error_strings,
+
+ .n_next_nodes = L2OUTPUT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2OUTPUT_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2output_init (vlib_main_t *vm)
+{
+ l2output_main_t * mp = &l2output_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Create the config vector.
+ // Until the CLI config is hooked up, just pre-allocate (zeroed)
+ // entries for 100 sw interfaces.
+ vec_validate(mp->configs, 100);
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2output_node.index,
+ L2OUTPUT_N_FEAT,
+ l2output_get_feat_names(),
+ mp->next_nodes.feat_next_node_index);
+
+ // Initialize the output node mapping table
+ l2output_init_output_node_vec(&mp->next_nodes.output_node_index_vec);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2output_init);
+
+typedef struct {
+ u32 node_index;
+ u32 sw_if_index;
+} output_node_mapping_rpc_args_t;
+
+#if DPDK > 0
+static void output_node_rpc_callback (output_node_mapping_rpc_args_t * a);
+
+static void output_node_mapping_send_rpc (u32 node_index, u32 sw_if_index)
+{
+ output_node_mapping_rpc_args_t args;
+
+ args.node_index = node_index;
+ args.sw_if_index = sw_if_index;
+
+ vl_api_rpc_call_main_thread (output_node_rpc_callback,
+ (u8 *) &args, sizeof (args));
+}
+#endif
+
+
+// Create a mapping in the next node mapping table for the given sw_if_index
+u32 l2output_create_output_node_mapping (
+ vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 node_index, // index of current node
+ u32 * output_node_index_vec,
+ u32 sw_if_index) {
+
+ u32 next; // index of next graph node
+ vnet_hw_interface_t *hw0;
+ u32 *node;
+#if DPDK > 0
+ uword cpu_number;
+
+ cpu_number = os_get_cpu_number();
+
+ if (cpu_number)
+ {
+ output_node_mapping_send_rpc (node_index, sw_if_index);
+ return 0;
+ }
+#endif
+
+ hw0 = vnet_get_sup_hw_interface (vnet_main, sw_if_index);
+
+ // dynamically create graph node arc
+ next = vlib_node_add_next (vlib_main,
+ node_index,
+ hw0->output_node_index);
+
+ // Initialize vector with the mapping
+
+ node = vec_elt_at_index(output_node_index_vec, sw_if_index);
+ *node = next;
+
+ return next;
+}
+
+#if DPDK > 0
+void output_node_rpc_callback (output_node_mapping_rpc_args_t *a)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vnet_main_t * vnm = vnet_get_main();
+ l2output_main_t * mp = &l2output_main;
+
+ (void) l2output_create_output_node_mapping
+ (vm, vnm, a->node_index, mp->next_nodes.output_node_index_vec,
+ a->sw_if_index);
+}
+#endif
+
+// Get a pointer to the config for the given interface
+l2_output_config_t * l2output_intf_config (u32 sw_if_index)
+{
+ l2output_main_t * mp = &l2output_main;
+
+ vec_validate(mp->configs, sw_if_index);
+ return vec_elt_at_index(mp->configs, sw_if_index);
+}
+
+// Enable (or disable) the feature in the bitmap for the given interface
+void l2output_intf_bitmap_enable (u32 sw_if_index,
+ u32 feature_bitmap,
+ u32 enable)
+{
+ l2output_main_t * mp = &l2output_main;
+ l2_output_config_t *config;
+
+ vec_validate(mp->configs, sw_if_index);
+ config = vec_elt_at_index(mp->configs, sw_if_index);
+
+ if (enable) {
+ config->feature_bitmap |= feature_bitmap;
+ } else {
+ config->feature_bitmap &= ~feature_bitmap;
+ }
+}
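+
+// Usage sketch (hypothetical caller): an output feature is armed per
+// interface by setting its bit, e.g. to run the EFP filter on packets
+// leaving sw_if_index 3:
+//   l2output_intf_bitmap_enable (3 /* sw_if_index */, L2OUTPUT_FEAT_EFP_FILTER, 1);
+// and disarmed again by calling with enable = 0.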
diff --git a/vnet/vnet/l2/l2_output.h b/vnet/vnet/l2/l2_output.h
new file mode 100644
index 00000000000..0d171b82541
--- /dev/null
+++ b/vnet/vnet/l2/l2_output.h
@@ -0,0 +1,219 @@
+/*
+ * l2_output.h : layer 2 output packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_l2_output_h
+#define included_vnet_l2_output_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_vtr.h>
+
+
+// The L2 output feature configuration, a per-interface struct
+typedef struct {
+
+ u32 feature_bitmap;
+
+ // vlan tag rewrite for ingress and egress
+ // ingress vtr is located here because the same config data is used for
+ // the egress EFP filter check
+ vtr_config_t input_vtr;
+ vtr_config_t output_vtr;
+
+ // some of these flags may get integrated into the feature bitmap
+ u8 fwd_enable;
+ u8 flood_enable;
+
+ // split horizon group
+ u8 shg;
+
+} l2_output_config_t;
+
+
+// The set of next nodes for features and interface output.
+// Each output feature node should include this.
+typedef struct {
+ // vector of output next node index, indexed by sw_if_index.
+ // used when all output features have been executed and the
+ // next nodes are the interface output nodes.
+ u32 * output_node_index_vec;
+
+ // array of next node index for each output feature, indexed
+ // by l2output_feat_t. Used to determine next feature node.
+ u32 feat_next_node_index[32];
+
+} l2_output_next_nodes_st;
+
+
+typedef struct {
+ // Next nodes for features and output interfaces
+ l2_output_next_nodes_st next_nodes;
+
+ /* config vector indexed by sw_if_index */
+ l2_output_config_t *configs;
+
+ /* Convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2output_main_t;
+
+l2output_main_t l2output_main;
+vlib_node_registration_t l2output_node;
+
+// L2 output features
+
+// Mappings from feature ID to graph node name
+#define foreach_l2output_feat \
+ _(SPAN, "feature-bitmap-drop") \
+ _(CFM, "feature-bitmap-drop") \
+ _(QOS, "feature-bitmap-drop") \
+ _(ACL, "l2-output-acl") \
+ _(L2PT, "feature-bitmap-drop") \
+ _(EFP_FILTER, "l2-efp-filter") \
+ _(IPIW, "feature-bitmap-drop") \
+ _(STP_BLOCKED, "feature-bitmap-drop") \
+ _(LINESTATUS_DOWN, "feature-bitmap-drop") \
+ _(XCRW, "l2-xcrw")
+
+// Feature bitmap positions
+typedef enum {
+#define _(sym,str) L2OUTPUT_FEAT_##sym##_BIT,
+ foreach_l2output_feat
+#undef _
+ L2OUTPUT_N_FEAT,
+} l2output_feat_t;
+
+// Feature bit masks
+typedef enum {
+#define _(sym,str) L2OUTPUT_FEAT_##sym = (1<<L2OUTPUT_FEAT_##sym##_BIT),
+ foreach_l2output_feat
+#undef _
+} l2output_feat_masks_t;
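+
+// For example, given the ordering of foreach_l2output_feat above (an
+// illustration only; the list may change), the macros expand to
+//   L2OUTPUT_FEAT_SPAN_BIT = 0  ->  L2OUTPUT_FEAT_SPAN = 1 << 0
+//   L2OUTPUT_FEAT_ACL_BIT  = 3  ->  L2OUTPUT_FEAT_ACL  = 1 << 3
+// so an interface's feature bitmap is the OR of its enabled feature masks.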
+
+// Return an array of strings containing graph node names of each feature
+char **l2output_get_feat_names(void);
+
+
+// The next set of functions is for use by output feature graph nodes.
+// When the last bit has been cleared from the output feature bitmap,
+// the next node is the output graph node for the TX sw_if_index.
+// These functions help the feature nodes get that node index.
+
+// Create a mapping to the output graph node for the given sw_if_index
+u32 l2output_create_output_node_mapping (
+ vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 node_index, // index of current node
+ u32 * output_node_index_vec,
+ u32 sw_if_index);
+
+// Initialize the next node mapping table
+always_inline
+void l2output_init_output_node_vec (u32 **output_node_index_vec) {
+
+ // Size it at 100 sw_if_indexes initially
+ // Uninitialized mappings are set to ~0
+ vec_validate_init_empty(*output_node_index_vec, 100, ~0);
+}
+
+
+// Get a mapping from the output node mapping table,
+// creating the entry if necessary.
+always_inline
+u32 l2output_get_output_node (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 node_index, // index of current node
+ u32 sw_if_index,
+ u32 ** output_node_index_vec) // may be updated
+{
+ u32 next; // index of next graph node
+
+ // Ensure the vector is big enough
+ vec_validate_init_empty(*output_node_index_vec, sw_if_index, ~0);
+
+ // Get the mapping for the sw_if_index
+ next = vec_elt(*output_node_index_vec, sw_if_index);
+
+ if (next == ~0) {
+ // Mapping doesn't exist so create it
+ next = l2output_create_output_node_mapping (vlib_main,
+ vnet_main,
+ node_index,
+ *output_node_index_vec,
+ sw_if_index);
+ }
+
+ return next;
+}
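+
+// Note: mappings are created lazily. The first packet transmitted on a
+// given sw_if_index takes the slow path above (creating the graph arc);
+// later packets resolve through the vector or the caller's one-entry cache.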
+
+
+// Determine the next L2 node based on the output feature bitmap
+always_inline void
+l2_output_dispatch (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ vlib_node_runtime_t * node,
+ u32 node_index,
+ u32 * cached_sw_if_index,
+ u32 * cached_next_index,
+ l2_output_next_nodes_st *next_nodes,
+ vlib_buffer_t * b0,
+ u32 sw_if_index,
+ u32 feature_bitmap,
+ u32 *next0)
+{
+ if (feature_bitmap) {
+ // There are some features to execute
+
+ // Save bitmap for the next feature graph nodes
+ vnet_buffer(b0)->l2.feature_bitmap = feature_bitmap;
+
+ // Determine the next node
+ *next0 = feat_bitmap_get_next_node_index(next_nodes->feat_next_node_index,
+ feature_bitmap);
+ } else {
+ // There are no features. Send packet to TX node for sw_if_index0
+ // This is a little tricky in that the output interface next node indexes
+ // are not precomputed at init time.
+
+ if (sw_if_index == *cached_sw_if_index) {
+ // We hit in the one-entry cache. Use it.
+ *next0 = *cached_next_index;
+ } else {
+ // Look up the output TX node
+ *next0 = l2output_get_output_node(vlib_main,
+ vnet_main,
+ node_index,
+ sw_if_index,
+ &next_nodes->output_node_index_vec);
+
+ // Update the one-entry cache
+ *cached_sw_if_index = sw_if_index;
+ *cached_next_index = *next0;
+ }
+ }
+}
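+
+// Example walk-through (hypothetical): a packet leaving sw_if_index 2 with
+// feature_bitmap == L2OUTPUT_FEAT_EFP_FILTER is first sent to the
+// "l2-efp-filter" node; that node clears its bit and dispatches again, and
+// the second call takes the else branch, resolving (and caching) the
+// interface TX next-node index.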
+
+// Get a pointer to the config for the given interface
+l2_output_config_t * l2output_intf_config (u32 sw_if_index);
+
+// Enable (or disable) the feature in the bitmap for the given interface
+void l2output_intf_bitmap_enable (u32 sw_if_index,
+ u32 feature_bitmap,
+ u32 enable);
+
+#endif
diff --git a/vnet/vnet/l2/l2_output_acl.c b/vnet/vnet/l2/l2_output_acl.c
new file mode 100644
index 00000000000..2f6c1dce41f
--- /dev/null
+++ b/vnet/vnet/l2/l2_output_acl.c
@@ -0,0 +1,335 @@
+/*
+ * l2_output_acl.c : layer 2 output acl processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/ip/ip_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/ip6_packet.h>
+#include <vlib/cli.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_output.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/cache.h>
+
+
+typedef struct {
+ // Next nodes for features and output interfaces
+ l2_output_next_nodes_st next_nodes;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2_outacl_main_t;
+
+
+
+typedef struct {
+ /* per-pkt trace data */
+ u8 src[6];
+ u8 dst[6];
+ u32 next_index;
+ u32 sw_if_index;
+} l2_outacl_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2_outacl_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_outacl_trace_t * t = va_arg (*args, l2_outacl_trace_t *);
+
+ s = format (s, "l2-output-acl: sw_if_index %d dst %U src %U",
+ t->sw_if_index,
+ format_ethernet_address, t->dst,
+ format_ethernet_address, t->src);
+ return s;
+}
+
+l2_outacl_main_t l2_outacl_main;
+
+static vlib_node_registration_t l2_outacl_node;
+
+#define foreach_l2_outacl_error \
+_(L2_OUTACL, "L2 output ACL packets") \
+_(DROP, "L2 output drops")
+
+typedef enum {
+#define _(sym,str) L2_OUTACL_ERROR_##sym,
+ foreach_l2_outacl_error
+#undef _
+ L2_OUTACL_N_ERROR,
+} l2_outacl_error_t;
+
+static char * l2_outacl_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_outacl_error
+#undef _
+};
+
+typedef enum {
+ L2_OUTACL_NEXT_DROP,
+ L2_OUTACL_N_NEXT,
+} l2_outacl_next_t;
+
+
+
+static uword
+l2_outacl_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2_outacl_next_t next_index;
+ l2_outacl_main_t * msm = &l2_outacl_main;
+ vlib_node_t *n = vlib_get_node (vm, l2_outacl_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ u32 cached_sw_if_index = (u32)~0;
+ u32 cached_next_index = (u32)~0;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors; /* number of packets to process */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ /* get space to enqueue frame to graph node "next_index" */
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ // Dual-loop skeleton, disabled (0 &&) until the core ACL code is added
+ while (0 && n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ ethernet_header_t * h0, * h1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ /* bi is "buffer index", b is pointer to the buffer */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* h0/h1 are referenced by the trace code below */
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ /* TX interface handles */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_TX];
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_outacl_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_outacl_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ memcpy(t->src, h1->src_address, 6);
+ memcpy(t->dst, h1->dst_address, 6);
+ }
+ }
+
+ em->counters[node_counter_base_index + L2_OUTACL_ERROR_L2_OUTACL] += 2;
+
+ /* add core loop code here; note next0/next1 must be computed before
+    the trace code above runs, or the trace will record garbage */
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ /* if next0==next1==next_index then nothing special needs to be done */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ ethernet_header_t * h0;
+ u32 feature_bitmap0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ l2_outacl_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ memcpy(t->src, h0->src_address, 6);
+ memcpy(t->dst, h0->dst_address, 6);
+ }
+
+ em->counters[node_counter_base_index + L2_OUTACL_ERROR_L2_OUTACL] += 1;
+
+ // L2_OUTACL code
+ // Stub for now: strip our feature bit and pass the packet to the next
+ // feature node
+
+
+ // Remove ourselves from the feature bitmap
+ feature_bitmap0 = vnet_buffer(b0)->l2.feature_bitmap & ~L2OUTPUT_FEAT_ACL;
+
+ // Determine next node
+ l2_output_dispatch (msm->vlib_main,
+ msm->vnet_main,
+ node,
+ l2_outacl_node.index,
+ &cached_sw_if_index,
+ &cached_next_index,
+ &msm->next_nodes,
+ b0,
+ sw_if_index0,
+ feature_bitmap0,
+ &next0);
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (l2_outacl_node,static) = {
+ .function = l2_outacl_node_fn,
+ .name = "l2-output-acl",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_outacl_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_outacl_error_strings),
+ .error_strings = l2_outacl_error_strings,
+
+ .n_next_nodes = L2_OUTACL_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2_OUTACL_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2_outacl_init (vlib_main_t *vm)
+{
+ l2_outacl_main_t * mp = &l2_outacl_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ // Initialize the feature next-node indexes
+ feat_bitmap_init_next_nodes(vm,
+ l2_outacl_node.index,
+ L2OUTPUT_N_FEAT,
+ l2output_get_feat_names(),
+ mp->next_nodes.feat_next_node_index);
+
+ // Initialize the output node mapping table
+ l2output_init_output_node_vec(&mp->next_nodes.output_node_index_vec);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_outacl_init);
+
+// set subinterface outacl enable/disable
+// The CLI format is:
+// set interface acl output <interface> [disable]
+static clib_error_t *
+int_l2_outacl (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 enable;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ enable = 1;
+ if (unformat (input, "disable")) {
+ enable = 0;
+ }
+
+ // set the interface flag
+ l2output_intf_bitmap_enable(sw_if_index, L2OUTPUT_FEAT_ACL, enable);
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_l2_outacl_cli, static) = {
+ .path = "set interface acl output",
+ .short_help = "set interface acl output <interface> [disable]",
+ .function = int_l2_outacl,
+};
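+
+// Example session (interface name hypothetical):
+//   vpp# set interface acl output GigabitEthernet2/0/0
+//   vpp# set interface acl output GigabitEthernet2/0/0 disable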
diff --git a/vnet/vnet/l2/l2_patch.c b/vnet/vnet/l2/l2_patch.c
new file mode 100644
index 00000000000..63be409d3b8
--- /dev/null
+++ b/vnet/vnet/l2/l2_patch.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/error.h>
+
+typedef struct {
+ u32 cached_next_index;
+ u32 cached_rx_sw_if_index;
+
+ /* vector of dispositions, indexed by rx_sw_if_index */
+ u32 *tx_next_by_rx_sw_if_index;
+ u32 *tx_sw_if_index_by_rx_sw_if_index;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2_patch_main_t;
+
+typedef struct {
+ u32 rx_sw_if_index;
+ u32 tx_sw_if_index;
+} l2_patch_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2_patch_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_patch_trace_t * t = va_arg (*args, l2_patch_trace_t *);
+
+ s = format (s, "L2_PATCH: rx %d tx %d", t->rx_sw_if_index,
+ t->tx_sw_if_index);
+ return s;
+}
+
+l2_patch_main_t l2_patch_main;
+
+static vlib_node_registration_t l2_patch_node;
+
+#define foreach_l2_patch_error \
+_(PATCHED, "L2 patch packets") \
+_(DROPPED, "L2 patch misconfigured drops")
+
+typedef enum {
+#define _(sym,str) L2_PATCH_ERROR_##sym,
+ foreach_l2_patch_error
+#undef _
+ L2_PATCH_N_ERROR,
+} l2_patch_error_t;
+
+static char * l2_patch_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_patch_error
+#undef _
+};
+
+typedef enum {
+ L2_PATCH_NEXT_DROP,
+ L2_PATCH_N_NEXT,
+} l2_patch_next_t;
+
+static uword
+l2_patch_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2_patch_next_t next_index;
+ l2_patch_main_t * l2pm = &l2_patch_main;
+ vlib_node_t *n = vlib_get_node (vm, l2_patch_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ /* This node never touches packet data, so no data prefetch is needed */
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ ASSERT(l2pm->tx_next_by_rx_sw_if_index[sw_if_index0] != ~0);
+ ASSERT(l2pm->tx_sw_if_index_by_rx_sw_if_index[sw_if_index0] != ~0);
+ ASSERT(l2pm->tx_next_by_rx_sw_if_index[sw_if_index1] != ~0);
+ ASSERT(l2pm->tx_sw_if_index_by_rx_sw_if_index[sw_if_index1] != ~0);
+
+ if (PREDICT_TRUE (sw_if_index0 == l2pm->cached_rx_sw_if_index))
+ next0 = l2pm->cached_next_index;
+ else
+ {
+ next0 = l2pm->tx_next_by_rx_sw_if_index[sw_if_index0];
+ l2pm->cached_rx_sw_if_index = sw_if_index0;
+ l2pm->cached_next_index = next0;
+ }
+
+ if (PREDICT_TRUE (sw_if_index1 == l2pm->cached_rx_sw_if_index))
+ next1 = l2pm->cached_next_index;
+ else
+ next1 = l2pm->tx_next_by_rx_sw_if_index [sw_if_index1];
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_patch_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->rx_sw_if_index = sw_if_index0;
+ t->tx_sw_if_index =
+ l2pm->tx_sw_if_index_by_rx_sw_if_index [sw_if_index0];
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_patch_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->rx_sw_if_index = sw_if_index1;
+ t->tx_sw_if_index =
+ l2pm->tx_sw_if_index_by_rx_sw_if_index [sw_if_index1];
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ ASSERT(l2pm->tx_next_by_rx_sw_if_index[sw_if_index0] != ~0);
+ ASSERT(l2pm->tx_sw_if_index_by_rx_sw_if_index[sw_if_index0] != ~0);
+
+ if (PREDICT_TRUE (sw_if_index0 == l2pm->cached_rx_sw_if_index))
+ next0 = l2pm->cached_next_index;
+ else
+ {
+ next0 = l2pm->tx_next_by_rx_sw_if_index [sw_if_index0];
+ l2pm->cached_rx_sw_if_index = sw_if_index0;
+ l2pm->cached_next_index = next0;
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_patch_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->rx_sw_if_index = sw_if_index0;
+ t->tx_sw_if_index =
+ l2pm->tx_sw_if_index_by_rx_sw_if_index [sw_if_index0];
+ }
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ em->counters[node_counter_base_index + L2_PATCH_ERROR_PATCHED] +=
+ frame->n_vectors;
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (l2_patch_node, static) = {
+ .function = l2_patch_node_fn,
+ .name = "l2_patch",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_patch_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_patch_error_strings),
+ .error_strings = l2_patch_error_strings,
+
+ .n_next_nodes = L2_PATCH_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2_PATCH_NEXT_DROP] = "error-drop",
+ },
+};
+
+int vnet_l2_patch_add_del (u32 rx_sw_if_index, u32 tx_sw_if_index, int is_add)
+{
+ l2_patch_main_t * l2pm = &l2_patch_main;
+ vnet_hw_interface_t * rxhi, *txhi;
+ u32 tx_next_index;
+
+ /*
+ * We assume that the API msg handler has used 2x VALIDATE_SW_IF_INDEX
+ * macros...
+ */
+
+ rxhi = vnet_get_sup_hw_interface (l2pm->vnet_main, rx_sw_if_index);
+
+ /* Make sure caller didn't pass a vlan subif, etc. */
+ if (rxhi->sw_if_index != rx_sw_if_index)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ txhi = vnet_get_sup_hw_interface (l2pm->vnet_main, tx_sw_if_index);
+ if (txhi->sw_if_index != tx_sw_if_index)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX_2;
+
+ if (is_add)
+ {
+ tx_next_index = vlib_node_add_next (l2pm->vlib_main,
+ l2_patch_node.index,
+ txhi->output_node_index);
+
+ vec_validate_init_empty (l2pm->tx_next_by_rx_sw_if_index,
+ rx_sw_if_index, ~0);
+
+ l2pm->tx_next_by_rx_sw_if_index[rx_sw_if_index] = tx_next_index;
+ vec_validate_init_empty (l2pm->tx_sw_if_index_by_rx_sw_if_index,
+ rx_sw_if_index, ~0);
+ l2pm->tx_sw_if_index_by_rx_sw_if_index[rx_sw_if_index]
+ = txhi->sw_if_index;
+
+ ethernet_set_flags (l2pm->vnet_main, rxhi->hw_if_index,
+ ETHERNET_INTERFACE_FLAG_ACCEPT_ALL);
+
+ vnet_hw_interface_rx_redirect_to_node (l2pm->vnet_main,
+ rxhi->hw_if_index,
+ l2_patch_node.index);
+ }
+ else
+ {
+ ethernet_set_flags (l2pm->vnet_main, rxhi->hw_if_index,
+ 0 /* disable promiscuous mode */);
+
+ vnet_hw_interface_rx_redirect_to_node (l2pm->vnet_main,
+ rxhi->hw_if_index,
+ ~0 /* disable */);
+ if (vec_len (l2pm->tx_next_by_rx_sw_if_index) > rx_sw_if_index)
+ {
+ l2pm->tx_next_by_rx_sw_if_index[rx_sw_if_index] = ~0;
+ l2pm->tx_sw_if_index_by_rx_sw_if_index[rx_sw_if_index] = ~0;
+ }
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+test_patch_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2_patch_main_t * l2pm = &l2_patch_main;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 rx_sw_if_index, tx_sw_if_index;
+ int rv;
+ int rx_set = 0;
+ int tx_set = 0;
+ int is_add = 1;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "rx %U", unformat_vnet_sw_interface,
+ l2pm->vnet_main, &rx_sw_if_index))
+ rx_set = 1;
+ else if (unformat (line_input, "tx %U", unformat_vnet_sw_interface,
+ l2pm->vnet_main, &tx_sw_if_index))
+ tx_set = 1;
+ else if (unformat (line_input, "del"))
+ is_add = 0;
+ else break;
+ }
+
+ if (rx_set == 0)
+ return clib_error_return (0, "rx interface not set");
+
+ if (tx_set == 0)
+ return clib_error_return (0, "tx interface not set");
+
+ rv = vnet_l2_patch_add_del (rx_sw_if_index, tx_sw_if_index, is_add);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_SW_IF_INDEX:
+ return clib_error_return (0, "rx interface not a physical port");
+
+ case VNET_API_ERROR_INVALID_SW_IF_INDEX_2:
+ return clib_error_return (0, "tx interface not a physical port");
+
+ default:
+ return clib_error_return
+ (0, "WARNING: vnet_l2_patch_add_del returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_patch_command, static) = {
+ .path = "test l2patch",
+ .short_help =
+ "test l2patch rx <intfc> tx <intfc> [del]",
+ .function = test_patch_command_fn,
+};
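+
+// Example session (interface names hypothetical):
+//   vpp# test l2patch rx GigabitEthernet2/0/0 tx GigabitEthernet2/0/1
+//   vpp# show l2patch
+//           GigabitEthernet2/0/0 -> GigabitEthernet2/0/1
+//   vpp# test l2patch rx GigabitEthernet2/0/0 tx GigabitEthernet2/0/1 del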
+
+// Display the contents of the l2patch table.
+static clib_error_t *
+show_l2patch (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2_patch_main_t * l2pm = &l2_patch_main;
+ u32 rx_sw_if_index;
+ u32 no_entries = 1;
+
+ ASSERT(vec_len(l2pm->tx_next_by_rx_sw_if_index) ==
+ vec_len(l2pm->tx_sw_if_index_by_rx_sw_if_index));
+
+ for (rx_sw_if_index = 0;
+ rx_sw_if_index < vec_len (l2pm->tx_sw_if_index_by_rx_sw_if_index);
+ rx_sw_if_index++)
+ {
+ u32 tx_sw_if_index =
+ l2pm->tx_sw_if_index_by_rx_sw_if_index[rx_sw_if_index];
+ if (tx_sw_if_index != ~0)
+ {
+ no_entries = 0;
+ vlib_cli_output (vm, "%26U -> %U",
+ format_vnet_sw_if_index_name,
+ l2pm->vnet_main, rx_sw_if_index,
+ format_vnet_sw_if_index_name,
+ l2pm->vnet_main,tx_sw_if_index);
+ }
+ }
+
+ if (no_entries)
+ vlib_cli_output (vm, "no l2patch entries");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_l2patch_cli, static) = {
+ .path = "show l2patch",
+ .short_help = "Show l2 interface cross-connect entries",
+ .function = show_l2patch,
+};
+
+clib_error_t *l2_patch_init (vlib_main_t *vm)
+{
+ l2_patch_main_t * mp = &l2_patch_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_patch_init);
diff --git a/vnet/vnet/l2/l2_vtr.c b/vnet/vnet/l2/l2_vtr.c
new file mode 100644
index 00000000000..a7499041009
--- /dev/null
+++ b/vnet/vnet/l2/l2_vtr.c
@@ -0,0 +1,448 @@
+/*
+ * l2_vtr.c : layer 2 vlan tag rewrite configuration
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ethernet/packet.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/l2_output.h>
+#include <vnet/l2/feat_bitmap.h>
+#include <vnet/l2/l2_vtr.h>
+#include <vnet/l2/l2_input_vtr.h>
+
+#include <vppinfra/error.h>
+#include <vlib/cli.h>
+
+
+// Just a placeholder. Also ensures the file is not eliminated by the linker.
+clib_error_t *l2_vtr_init (vlib_main_t *vm)
+{
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(l2_vtr_init);
+
+
+// Configure vlan tag rewrite on the given interface.
+// Return 0 on success, a VNET_API_ERROR_* code otherwise.
+u32 l2vtr_configure (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 sw_if_index,
+ u32 vtr_op,
+ u32 push_dot1q, // ethertype of first pushed tag is dot1q/dot1ad
+ u32 vtr_tag1, // first pushed tag
+ u32 vtr_tag2) // second pushed tag
+{
+ vnet_hw_interface_t * hi;
+ vnet_sw_interface_t * si;
+ u32 hw_no_tags;
+ u32 error = 0;
+ vtr_config_t * in_config;
+ vtr_config_t * out_config;
+ u32 enable;
+ u32 push_inner_et;
+ u32 push_outer_et;
+ u32 cfg_tags;
+
+ hi = vnet_get_sup_hw_interface (vnet_main, sw_if_index);
+ if (!hi || (hi->hw_class_index != ethernet_hw_interface_class.index)) {
+ error = VNET_API_ERROR_INVALID_INTERFACE; // non-ethernet interface
+ goto done;
+ }
+
+ // Init the config for this interface
+ vec_validate (l2output_main.configs, sw_if_index);
+ in_config = &(vec_elt_at_index(l2output_main.configs, sw_if_index)->input_vtr);
+ out_config = &(vec_elt_at_index(l2output_main.configs, sw_if_index)->output_vtr);
+ in_config->raw_tags = 0;
+ out_config->raw_tags = 0;
+
+ // Get the configured tags for the interface
+ si = vnet_get_sw_interface (vnet_main, sw_if_index);
+ hw_no_tags = (si->type == VNET_SW_INTERFACE_TYPE_HARDWARE);
+
+ // Construct the input tag-rewrite config
+
+ push_outer_et = clib_net_to_host_u16 (push_dot1q ? ETHERNET_TYPE_VLAN : ETHERNET_TYPE_DOT1AD);
+ push_inner_et = clib_net_to_host_u16 (ETHERNET_TYPE_VLAN);
+ vtr_tag1 = clib_net_to_host_u16 (vtr_tag1);
+ vtr_tag2 = clib_net_to_host_u16 (vtr_tag2);
+
+ // Determine the number of vlan tags with explicitly configured values
+ cfg_tags = 0;
+ if (hw_no_tags || si->sub.eth.flags.no_tags) {
+ cfg_tags = 0;
+ } else if (si->sub.eth.flags.one_tag) {
+ cfg_tags = 1;
+ if (si->sub.eth.flags.outer_vlan_id_any) {
+ cfg_tags = 0;
+ }
+ } else if (si->sub.eth.flags.two_tags) {
+ cfg_tags = 2;
+ if (si->sub.eth.flags.inner_vlan_id_any) {
+ cfg_tags = 1;
+ }
+ if (si->sub.eth.flags.outer_vlan_id_any) {
+ cfg_tags = 0;
+ }
+ }
+
+ switch (vtr_op) {
+ case L2_VTR_DISABLED:
+ in_config->push_and_pop_bytes = 0;
+ break;
+
+ case L2_VTR_POP_1:
+ if (cfg_tags < 1) {
+ error = VNET_API_ERROR_INVALID_VLAN_TAG_COUNT; // Need at least one tag
+ goto done;
+ }
+ in_config->pop_bytes = 4;
+ in_config->push_bytes = 0;
+ break;
+
+ case L2_VTR_POP_2:
+ if (cfg_tags < 2) {
+ error = VNET_API_ERROR_INVALID_VLAN_TAG_COUNT; // Need two tags
+ goto done;
+ }
+ in_config->pop_bytes = 8;
+ in_config->push_bytes = 0;
+ break;
+
+ case L2_VTR_PUSH_1:
+ in_config->pop_bytes = 0;
+ in_config->push_bytes = 4;
+ in_config->tags[1].priority_cfi_and_id = vtr_tag1;
+ in_config->tags[1].type = push_outer_et;
+ break;
+
+ case L2_VTR_PUSH_2:
+ in_config->pop_bytes = 0;
+ in_config->push_bytes = 8;
+ in_config->tags[0].priority_cfi_and_id = vtr_tag1;
+ in_config->tags[0].type = push_outer_et;
+ in_config->tags[1].priority_cfi_and_id = vtr_tag2;
+ in_config->tags[1].type = push_inner_et;
+ break;
+
+ case L2_VTR_TRANSLATE_1_1:
+ if (cfg_tags < 1) {
+ error = VNET_API_ERROR_INVALID_VLAN_TAG_COUNT; // Need at least one tag
+ goto done;
+ }
+ in_config->pop_bytes = 4;
+ in_config->push_bytes = 4;
+ in_config->tags[1].priority_cfi_and_id = vtr_tag1;
+ in_config->tags[1].type = push_outer_et;
+ break;
+
+ case L2_VTR_TRANSLATE_1_2:
+ if (cfg_tags < 1) {
+ error = VNET_API_ERROR_INVALID_VLAN_TAG_COUNT; // Need at least one tag
+ goto done;
+ }
+ in_config->pop_bytes = 4;
+ in_config->push_bytes = 8;
+ in_config->tags[0].priority_cfi_and_id = vtr_tag1;
+ in_config->tags[0].type = push_outer_et;
+ in_config->tags[1].priority_cfi_and_id = vtr_tag2;
+ in_config->tags[1].type = push_inner_et;
+ break;
+
+ case L2_VTR_TRANSLATE_2_1:
+ if (cfg_tags < 2) {
+ error = VNET_API_ERROR_INVALID_VLAN_TAG_COUNT; // Need two tags
+ goto done;
+ }
+ in_config->pop_bytes = 8;
+ in_config->push_bytes = 4;
+ in_config->tags[1].priority_cfi_and_id = vtr_tag1;
+ in_config->tags[1].type = push_outer_et;
+ break;
+
+ case L2_VTR_TRANSLATE_2_2:
+ if (cfg_tags < 2) {
+ error = VNET_API_ERROR_INVALID_VLAN_TAG_COUNT; // Need two tags
+ goto done;
+ }
+ in_config->pop_bytes = 8;
+ in_config->push_bytes = 8;
+ in_config->tags[0].priority_cfi_and_id = vtr_tag1;
+ in_config->tags[0].type = push_outer_et;
+ in_config->tags[1].priority_cfi_and_id = vtr_tag2;
+ in_config->tags[1].type = push_inner_et;
+ break;
+ }
+
+ // Construct the output tag-rewrite config
+
+ // The push/pop values are always reversed
+ out_config->push_bytes = in_config->pop_bytes;
+ out_config->pop_bytes = in_config->push_bytes;
+
+ // Any pushed tags are derived from the subinterface config
+ push_outer_et = clib_net_to_host_u16 (si->sub.eth.flags.dot1ad ? ETHERNET_TYPE_DOT1AD : ETHERNET_TYPE_VLAN);
+ push_inner_et = clib_net_to_host_u16 (ETHERNET_TYPE_VLAN);
+ vtr_tag1 = clib_net_to_host_u16 (si->sub.eth.outer_vlan_id);
+ vtr_tag2 = clib_net_to_host_u16 (si->sub.eth.inner_vlan_id);
+
+ if (out_config->push_bytes == 4) {
+ out_config->tags[1].priority_cfi_and_id = vtr_tag1;
+ out_config->tags[1].type = push_outer_et;
+ } else if (out_config->push_bytes == 8) {
+ out_config->tags[0].priority_cfi_and_id = vtr_tag1;
+ out_config->tags[0].type = push_outer_et;
+ out_config->tags[1].priority_cfi_and_id = vtr_tag2;
+ out_config->tags[1].type = push_inner_et;
+ }
+
+ // set the interface enable flags
+ enable = (vtr_op != L2_VTR_DISABLED);
+ l2input_intf_bitmap_enable (sw_if_index, L2INPUT_FEAT_VTR, enable);
+ // output vtr enable is checked explicitly in l2_output
+
+ done:
+ return error;
+}
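+
+// Usage sketch (hypothetical): on a dot1q subinterface matching outer
+// vlan 100, rewrite to vlan 200 on input (the output direction is derived
+// automatically, pushing 100 back):
+//   l2vtr_configure (vm, vnm, sub_sw_if_index, L2_VTR_TRANSLATE_1_1,
+//                    1 /* push_dot1q */, 200 /* vtr_tag1 */, 0 /* vtr_tag2 */);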
+
+// Get the vlan tag rewrite config of the given interface.
+// Return 0 on success, a VNET_API_ERROR_* code otherwise.
+u32 l2vtr_get (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 sw_if_index,
+ u32 *vtr_op,
+ u32 *push_dot1q, // ethertype of first pushed tag is dot1q/dot1ad
+ u32 *vtr_tag1, // first pushed tag
+ u32 *vtr_tag2) // second pushed tag
+{
+ vnet_hw_interface_t * hi;
+ u32 error = 0;
+ vtr_config_t * in_config;
+
+ if (!vtr_op || !push_dot1q || !vtr_tag1 || !vtr_tag2) {
+ clib_warning ("invalid arguments");
+ error = VNET_API_ERROR_INVALID_ARGUMENT;
+ goto done;
+ }
+
+ *vtr_op = L2_VTR_DISABLED;
+ *vtr_tag1 = 0;
+ *vtr_tag2 = 0;
+ *push_dot1q = 0;
+
+ hi = vnet_get_sup_hw_interface (vnet_main, sw_if_index);
+ if (!hi || (hi->hw_class_index != ethernet_hw_interface_class.index)) {
+ // non-ethernet interface
+ goto done;
+ }
+
+ if (sw_if_index >= vec_len(l2output_main.configs)) {
+ // no specific config (return disabled)
+ goto done;
+ }
+
+ // Get the config for this interface
+ in_config = &(vec_elt_at_index(l2output_main.configs, sw_if_index)->input_vtr);
+
+ // DISABLED
+ if (in_config->push_and_pop_bytes == 0) {
+ goto done;
+ }
+
+ // find out vtr_op
+ switch (in_config->pop_bytes) {
+ case 0:
+ switch (in_config->push_bytes) {
+ case 0:
+ // DISABLED
+ goto done;
+ case 4:
+ *vtr_op = L2_VTR_PUSH_1;
+ *vtr_tag1 = clib_host_to_net_u16 (in_config->tags[1].priority_cfi_and_id);
+ *push_dot1q = (ETHERNET_TYPE_VLAN == clib_host_to_net_u16 (in_config->tags[1].type));
+ break;
+ case 8:
+ *vtr_op = L2_VTR_PUSH_2;
+ *vtr_tag1 = clib_host_to_net_u16 (in_config->tags[0].priority_cfi_and_id);
+ *vtr_tag2 = clib_host_to_net_u16 (in_config->tags[1].priority_cfi_and_id);
+ *push_dot1q = (ETHERNET_TYPE_VLAN == clib_host_to_net_u16 (in_config->tags[0].type));
+ break;
+ default:
+ clib_warning ("invalid push_bytes count: %d", in_config->push_bytes);
+ error = VNET_API_ERROR_UNEXPECTED_INTF_STATE;
+ goto done;
+ }
+ break;
+
+ case 4:
+ switch (in_config->push_bytes) {
+ case 0:
+ *vtr_op = L2_VTR_POP_1;
+ break;
+ case 4:
+ *vtr_op = L2_VTR_TRANSLATE_1_1;
+ *vtr_tag1 = clib_host_to_net_u16 (in_config->tags[1].priority_cfi_and_id);
+ *push_dot1q = (ETHERNET_TYPE_VLAN == clib_host_to_net_u16 (in_config->tags[1].type));
+ break;
+ case 8:
+ *vtr_op = L2_VTR_TRANSLATE_1_2;
+ *vtr_tag1 = clib_host_to_net_u16 (in_config->tags[0].priority_cfi_and_id);
+ *vtr_tag2 = clib_host_to_net_u16 (in_config->tags[1].priority_cfi_and_id);
+ *push_dot1q = (ETHERNET_TYPE_VLAN == clib_host_to_net_u16 (in_config->tags[0].type));
+ break;
+ default:
+ clib_warning ("invalid push_bytes count: %d", in_config->push_bytes);
+ error = VNET_API_ERROR_UNEXPECTED_INTF_STATE;
+ goto done;
+ }
+ break;
+
+ case 8:
+ switch (in_config->push_bytes) {
+ case 0:
+ *vtr_op = L2_VTR_POP_2;
+ break;
+ case 4:
+ *vtr_op = L2_VTR_TRANSLATE_2_1;
+ *vtr_tag1 = clib_host_to_net_u16 (in_config->tags[1].priority_cfi_and_id);
+ *push_dot1q = (ETHERNET_TYPE_VLAN == clib_host_to_net_u16 (in_config->tags[1].type));
+ break;
+ case 8:
+ *vtr_op = L2_VTR_TRANSLATE_2_2;
+ *vtr_tag1 = clib_host_to_net_u16 (in_config->tags[0].priority_cfi_and_id);
+ *vtr_tag2 = clib_host_to_net_u16 (in_config->tags[1].priority_cfi_and_id);
+ *push_dot1q = (ETHERNET_TYPE_VLAN == clib_host_to_net_u16 (in_config->tags[0].type));
+ break;
+ default:
+ clib_warning ("invalid push_bytes count: %d", in_config->push_bytes);
+ error = VNET_API_ERROR_UNEXPECTED_INTF_STATE;
+ goto done;
+ }
+ break;
+
+ default:
+ clib_warning ("invalid pop_bytes count: %d", in_config->pop_bytes);
+ error = VNET_API_ERROR_UNEXPECTED_INTF_STATE;
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+// set subinterface vtr enable/disable
+// The CLI format is:
+// set interface l2 tag-rewrite <interface> [disable | pop 1 | pop 2 | push {dot1q|dot1ad} <tag> [<tag>]]
+// "push" can also be replaced by "translate-{1|2}-{1|2}"
+static clib_error_t *
+int_l2_vtr (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error = 0;
+ u32 sw_if_index;
+ u32 vtr_op;
+ u32 push_dot1q = 0;
+ u32 tag1 = 0, tag2 = 0;
+
+ if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ error = clib_error_return (0, "unknown interface `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ vtr_op = L2_VTR_DISABLED;
+
+ if (unformat (input, "disable")) {
+ vtr_op = L2_VTR_DISABLED;
+ } else if (unformat (input, "pop 1")) {
+ vtr_op = L2_VTR_POP_1;
+ } else if (unformat (input, "pop 2")) {
+ vtr_op = L2_VTR_POP_2;
+
+ } else if (unformat (input, "push dot1q %d %d", &tag1, &tag2)) {
+ vtr_op = L2_VTR_PUSH_2;
+ push_dot1q = 1;
+ } else if (unformat (input, "push dot1ad %d %d", &tag1, &tag2)) {
+ vtr_op = L2_VTR_PUSH_2;
+
+ } else if (unformat (input, "push dot1q %d", &tag1)) {
+ vtr_op = L2_VTR_PUSH_1;
+ push_dot1q = 1;
+ } else if (unformat (input, "push dot1ad %d", &tag1)) {
+ vtr_op = L2_VTR_PUSH_1;
+
+ } else if (unformat (input, "translate 1-1 dot1q %d", &tag1)) {
+ vtr_op = L2_VTR_TRANSLATE_1_1;
+ push_dot1q = 1;
+ } else if (unformat (input, "translate 1-1 dot1ad %d", &tag1)) {
+ vtr_op = L2_VTR_TRANSLATE_1_1;
+
+ } else if (unformat (input, "translate 2-1 dot1q %d", &tag1)) {
+ vtr_op = L2_VTR_TRANSLATE_2_1;
+ push_dot1q = 1;
+ } else if (unformat (input, "translate 2-1 dot1ad %d", &tag1)) {
+ vtr_op = L2_VTR_TRANSLATE_2_1;
+
+ } else if (unformat (input, "translate 2-2 dot1q %d %d", &tag1, &tag2)) {
+ vtr_op = L2_VTR_TRANSLATE_2_2;
+ push_dot1q = 1;
+ } else if (unformat (input, "translate 2-2 dot1ad %d %d", &tag1, &tag2)) {
+ vtr_op = L2_VTR_TRANSLATE_2_2;
+
+ } else if (unformat (input, "translate 1-2 dot1q %d %d", &tag1, &tag2)) {
+ vtr_op = L2_VTR_TRANSLATE_1_2;
+ push_dot1q = 1;
+ } else if (unformat (input, "translate 1-2 dot1ad %d %d", &tag1, &tag2)) {
+ vtr_op = L2_VTR_TRANSLATE_1_2;
+
+ } else {
+ error = clib_error_return (0, "expecting [disable | pop 1 | pop 2 | push {dot1q|dot1ah} <tag> [<tag>]\n"
+ " | translate {1|2}-{1|2} {dot1q|dot1ah} <tag> [<tag>]] but got `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+
+ if (l2vtr_configure (vm,
+ vnm,
+ sw_if_index,
+ vtr_op,
+ push_dot1q,
+ tag1,
+ tag2)) {
+ error = clib_error_return (0, "vlan tag rewrite is not compatible with interface");
+ goto done;
+ }
+
+ done:
+ return error;
+}
+
+VLIB_CLI_COMMAND (int_l2_vtr_cli, static) = {
+ .path = "set interface l2 tag-rewrite",
+ .short_help = "set interface l2 tag-rewrite <interface> [disable | pop {1|2} | push {dot1q|dot1ad} <tag> <tag>]",
+ .function = int_l2_vtr,
+};
+
diff --git a/vnet/vnet/l2/l2_vtr.h b/vnet/vnet/l2/l2_vtr.h
new file mode 100644
index 00000000000..aef6c6d255e
--- /dev/null
+++ b/vnet/vnet/l2/l2_vtr.h
@@ -0,0 +1,167 @@
+/*
+ * l2_vtr.h : layer 2 vlan tag rewrite processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vnet_l2_vtr_h
+#define included_vnet_l2_vtr_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/ethernet/packet.h>
+
+// VTR config options for API and CLI support
+typedef enum {
+ L2_VTR_DISABLED,
+ L2_VTR_PUSH_1,
+ L2_VTR_PUSH_2,
+ L2_VTR_POP_1,
+ L2_VTR_POP_2,
+ L2_VTR_TRANSLATE_1_1,
+ L2_VTR_TRANSLATE_1_2,
+ L2_VTR_TRANSLATE_2_1,
+ L2_VTR_TRANSLATE_2_2
+} l2_vtr_op_t;
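+
+// For example, the CLI "set interface l2 tag-rewrite <intfc> pop 1" maps to
+// L2_VTR_POP_1, and "... push dot1q 100 200" maps to L2_VTR_PUSH_2 with
+// vtr_tag1 = 100 and vtr_tag2 = 200 (tag values illustrative).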
+
+// Per-interface vlan tag rewrite configuration
+// There will be one instance of this struct for each sw_if_index
+// for both input vtr and output vtr
+typedef struct {
+ union {
+ // Up to two vlan tags to push.
+ // If there is only one vlan tag to push, it is in tags[1].
+ ethernet_vlan_header_tv_t tags[2];
+ u64 raw_tags;
+ };
+
+ union {
+ struct {
+ u8 push_bytes; // number of bytes to push for up to 2 vlans (0,4,8)
+ u8 pop_bytes; // number of bytes to pop for up to 2 vlans (0,4,8)
+ };
+ u16 push_and_pop_bytes; // if 0 then the feature is disabled
+ };
+} vtr_config_t;
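+
+// For example (sketch), a "push one dot1q tag 100" rewrite would be
+// represented as (stored values are in network byte order):
+//   config.push_bytes = 4;  config.pop_bytes = 0;
+//   config.tags[1].type = clib_net_to_host_u16 (ETHERNET_TYPE_VLAN);
+//   config.tags[1].priority_cfi_and_id = clib_net_to_host_u16 (100);
+// push_and_pop_bytes is then non-zero, i.e. the feature is enabled.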
+
+
+// Perform the configured tag rewrite on the packet.
+// Return 0 if ok, 1 if packet should be dropped (e.g. tried to pop too many tags)
+always_inline u32
+l2_vtr_process (vlib_buffer_t * b0,
+ vtr_config_t * config)
+{
+ u64 temp_8;
+ u32 temp_4;
+ u8 * eth;
+
+ eth = vlib_buffer_get_current (b0);
+
+ // copy the 12B dmac and smac to a temporary location
+ temp_8 = *((u64 *)eth);
+ temp_4 = *((u32 *)(eth+8));
+
+ // adjust for popped tags
+ eth += config->pop_bytes;
+
+ // if not enough tags to pop then drop packet
+ if (PREDICT_FALSE ((vnet_buffer(b0)->l2.l2_len - 12) < config->pop_bytes)) {
+ return 1;
+ }
+
+ // copy the 2 new tags to the start of the packet
+ *((u64 *)(eth + 12 - 8)) = config->raw_tags;
+
+ // TODO: set cos bits
+
+ // adjust for pushed tags:
+ eth -= config->push_bytes;
+
+ // copy the 12B dmac and smac back to the packet
+ *((u64 *)eth) = temp_8;
+ *((u32 *)(eth+8)) = temp_4;
+
+ // Update l2_len
+ vnet_buffer(b0)->l2.l2_len += (word)config->push_bytes - (word)config->pop_bytes;
+
+ // Update packet len
+ vlib_buffer_advance(b0, (word)config->pop_bytes - (word)config->push_bytes);
+
+ return 0;
+}
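+
+// Worked example (translate 1-1, so pop_bytes == 4 and push_bytes == 4):
+// the 12B dmac/smac are saved, eth is advanced 4 bytes and then moved back
+// 4 bytes, the new tag from raw_tags lands exactly where the old tag was,
+// and on balance l2_len and the buffer's current-data offset are unchanged.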
+
+
+// Perform the egress pre-vlan tag rewrite EFP Filter check. The post-vlan tag rewrite
+// check is a separate graph node.
+//
+// This check ensures that a packet being output to an interface (before output vtr
+// is performed) has vlan tags that match those on a packet received from that
+// interface (after vtr has been performed).
+// This means verifying that any tags pushed by input vtr are present on the packet.
+//
+// Return 0 if ok, 1 if packet should be dropped.
+// This function should be passed the input vtr config for the interface.
+always_inline u8
+l2_efp_filter_process (vlib_buffer_t * b0,
+ vtr_config_t * in_config)
+{
+ u8 * eth;
+ u64 packet_tags;
+ u64 tag_mask;
+
+ eth = vlib_buffer_get_current (b0);
+
+ // If there are 2 tags pushed, they must match config->tags[0] and config->tags[1].
+ // If there is one tag pushed, it must match config->tags[1].
+ // If there are 0 tags pushed, the check passes.
+
+ // mask for two vlan id and ethertypes, no cos bits
+ tag_mask = clib_net_to_host_u64(0xFFFF0FFFFFFF0FFF);
+ // mask for one vlan id and ethertype, no cos bits
+ tag_mask = (in_config->push_bytes == 4) ? clib_net_to_host_u64(0xFFFF0FFF) : tag_mask;
+ // mask for always match
+ tag_mask = (in_config->push_bytes == 0) ? 0 : tag_mask;
+
+ // Read 8B from the packet, getting the proper set of vlan tags
+ // For 0 push bytes, the address doesn't matter since the mask clears the data to 0.
+ packet_tags = *((u64 *)(eth + 4 + in_config->push_bytes));
+
+ // Check if the packet tags match the configured tags
+ return (packet_tags & tag_mask) != in_config->raw_tags;
+}
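+
+// For example (sketch): with one tag pushed on input (push_bytes == 4),
+// the 8B load at eth + 8 covers the last 4 smac bytes plus the single tag;
+// the mask zeroes the smac bytes and the tag's priority/cfi bits, so only
+// the tag's ethertype and vlan id are compared against tags[1] in raw_tags.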
+
+
+// Configure vlan tag rewrite on the given interface.
+// Return 0 on success, a VNET_API_ERROR_* code otherwise.
+u32 l2vtr_configure(vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 sw_if_index,
+ u32 vtr_op,
+ u32 push_dot1q,
+ u32 vtr_tag1,
+ u32 vtr_tag2);
+
+// Get the vlan tag rewrite config of the given interface.
+// Return 0 on success, a VNET_API_ERROR_* code otherwise.
+u32 l2vtr_get (vlib_main_t * vlib_main,
+ vnet_main_t * vnet_main,
+ u32 sw_if_index,
+ u32 *vtr_op,
+ u32 *push_dot1q,
+ u32 *vtr_tag1,
+ u32 *vtr_tag2);
+
+#endif // included_vnet_l2_vtr_h
+
diff --git a/vnet/vnet/l2/l2_xcrw.c b/vnet/vnet/l2/l2_xcrw.c
new file mode 100644
index 00000000000..f5fe3ca14e4
--- /dev/null
+++ b/vnet/vnet/l2/l2_xcrw.c
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/l2/l2_xcrw.h>
+
+/*
+ * General L2 / L3 cross-connect, used to set up
+ * "L2 interface <--> your-favorite-tunnel-encap" tunnels.
+ *
+ * We set up a typical L2 cross-connect or (future) bridge
+ * to hook L2 interface(s) up to the L3 stack in arbitrary ways.
+ *
+ * Each l2_xcrw adjacency specifies 3 things:
+ *
+ * 1. The next graph node (presumably in the L3 stack) to
+ * process the (L2 -> L3) packet
+ *
+ * 2. A new value for vnet_buffer(b)->sw_if_index[VLIB_TX]
+ * (i.e. a lookup FIB index),
+ *
+ * 3. A rewrite string to apply.
+ *
+ * Example: to cross-connect an L2 interface or (future) bridge
+ * to an mpls-o-gre tunnel, set up the L2 rewrite string as shown in
+ * mpls_gre_rewrite, and use "mpls-post-rewrite" to fix the
+ * GRE IP header checksum and length fields.
+ */
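+
+/*
+ * Hypothetical CLI usage (interface name and rewrite bytes are
+ * illustrative only; see the "set interface l2 xcrw" command below):
+ *
+ *   set int l2 xcrw GigabitEthernet2/0/0 next mpls-post-rewrite
+ *       tx-fib-id 0 rw <hex-bytes>
+ */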
+
+typedef struct {
+ u32 next_index;
+ u32 tx_fib_index;
+} l2_xcrw_trace_t;
+
+/* packet trace format function */
+static u8 * format_l2_xcrw_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2_xcrw_trace_t * t = va_arg (*args, l2_xcrw_trace_t *);
+
+ s = format (s, "L2_XCRW: next index %d tx_fib_index %d",
+ t->next_index, t->tx_fib_index);
+ return s;
+}
+
+l2_xcrw_main_t l2_xcrw_main;
+
+static vlib_node_registration_t l2_xcrw_node;
+
+static char * l2_xcrw_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2_xcrw_error
+#undef _
+};
+
+static uword
+l2_xcrw_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ l2_xcrw_next_t next_index;
+ l2_xcrw_main_t * xcm = &l2_xcrw_main;
+ vlib_node_t *n = vlib_get_node (vm, l2_xcrw_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ l2_xcrw_adjacency_t * adj0, * adj1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ adj0 = vec_elt_at_index (xcm->adj_by_sw_if_index, sw_if_index0);
+ adj1 = vec_elt_at_index (xcm->adj_by_sw_if_index, sw_if_index1);
+
+ next0 = adj0->rewrite_header.next_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] =
+ adj0->rewrite_header.sw_if_index;
+
+ next1 = adj1->rewrite_header.next_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] =
+ adj1->rewrite_header.sw_if_index;
+
+ if (PREDICT_TRUE(next0 > 0))
+ {
+ u8 * h0 = vlib_buffer_get_current (b0);
+ vnet_rewrite_one_header (adj0[0], h0,
+ adj0->rewrite_header.data_bytes);
+ vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
+ em->counters[node_counter_base_index + L2_XCRW_ERROR_FWD]++;
+ }
+
+ if (PREDICT_TRUE(next1 > 0))
+ {
+ u8 * h1 = vlib_buffer_get_current (b1);
+ vnet_rewrite_one_header (adj1[0], h1,
+ adj1->rewrite_header.data_bytes);
+ vlib_buffer_advance (b1, -adj1->rewrite_header.data_bytes);
+ em->counters[node_counter_base_index + L2_XCRW_ERROR_FWD]++;
+ }
+
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+              if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_xcrw_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ t->tx_fib_index = adj0->rewrite_header.sw_if_index;
+ }
+              if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ l2_xcrw_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->next_index = next1;
+ t->tx_fib_index = adj1->rewrite_header.sw_if_index;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ l2_xcrw_adjacency_t * adj0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+
+ adj0 = vec_elt_at_index (xcm->adj_by_sw_if_index, sw_if_index0);
+
+ next0 = adj0->rewrite_header.next_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] =
+ adj0->rewrite_header.sw_if_index;
+
+ if (PREDICT_TRUE(next0 > 0))
+ {
+ u8 *h0 = vlib_buffer_get_current (b0);
+ vnet_rewrite_one_header (adj0[0], h0,
+ adj0->rewrite_header.data_bytes);
+ vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes);
+ em->counters[node_counter_base_index + L2_XCRW_ERROR_FWD]++;
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ l2_xcrw_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ t->tx_fib_index = adj0->rewrite_header.sw_if_index;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (l2_xcrw_node, static) = {
+ .function = l2_xcrw_node_fn,
+ .name = "l2-xcrw",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2_xcrw_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2_xcrw_error_strings),
+ .error_strings = l2_xcrw_error_strings,
+
+ .n_next_nodes = L2_XCRW_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2_XCRW_NEXT_DROP] = "error-drop",
+ },
+};
+
+clib_error_t *l2_xcrw_init (vlib_main_t *vm)
+{
+ l2_xcrw_main_t * mp = &l2_xcrw_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = &vnet_main;
+ mp->tunnel_index_by_l2_sw_if_index = hash_create (0, sizeof(uword));
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (l2_xcrw_init);
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+static u8 * format_xcrw_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "xcrw%d", dev_instance);
+}
+
+VNET_DEVICE_CLASS (xcrw_device_class,static) = {
+ .name = "Xcrw",
+ .format_device_name = format_xcrw_name,
+ .tx_function = dummy_interface_tx,
+};
+
+/* Create a sham tunnel interface and return its sw_if_index */
+static u32
+create_xcrw_interface (vlib_main_t * vm)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ static u32 instance;
+ u8 address[6];
+ u32 hw_if_index;
+ vnet_hw_interface_t * hi;
+ u32 sw_if_index;
+
+ /* mac address doesn't really matter */
+ memset (address, 0, sizeof (address));
+ address[2] = 0x12;
+
+  /* can return an error iff phy != 0 */
+ (void) ethernet_register_interface
+ (vnm,
+ xcrw_device_class.index,
+ instance++,
+ address,
+ &hw_if_index,
+ /* flag change */ 0);
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ sw_if_index = hi->sw_if_index;
+ vnet_sw_interface_set_flags (vnm, sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+ /* Output to the sham tunnel invokes the encap node */
+ hi->output_node_index = l2_xcrw_node.index;
+
+ return sw_if_index;
+}
+
+int vnet_configure_l2_xcrw (vlib_main_t * vm, vnet_main_t *vnm,
+ u32 l2_sw_if_index, u32 tx_fib_index,
+ u8 * rewrite, u32 next_node_index, int is_add)
+{
+ l2_xcrw_main_t * xcm = &l2_xcrw_main;
+ l2_xcrw_adjacency_t * a;
+ l2_xcrw_tunnel_t * t;
+ uword * p;
+
+ if (is_add)
+ {
+
+ pool_get (xcm->tunnels, t);
+
+ /* No interface allocated? Do it. Otherwise, set admin up */
+ if (t->tunnel_sw_if_index == 0)
+ t->tunnel_sw_if_index = create_xcrw_interface (vm);
+ else
+ vnet_sw_interface_set_flags (vnm, t->tunnel_sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+ t->l2_sw_if_index = l2_sw_if_index;
+
+ vec_validate (xcm->adj_by_sw_if_index, t->l2_sw_if_index);
+
+ a = vec_elt_at_index (xcm->adj_by_sw_if_index, t->l2_sw_if_index);
+ memset (a, 0, sizeof (*a));
+
+ a->rewrite_header.sw_if_index = tx_fib_index;
+
+ /*
+ * Add or find a dynamic disposition for the successor node,
+ * e.g. so we can ship pkts to mpls_post_rewrite...
+ */
+ a->rewrite_header.next_index =
+ vlib_node_add_next (vm, l2_xcrw_node.index, next_node_index);
+
+ if (vec_len (rewrite))
+ vnet_rewrite_set_data (a[0], rewrite, vec_len(rewrite));
+
+ set_int_l2_mode (vm, vnm, MODE_L2_XC, t->l2_sw_if_index, 0, 0, 0,
+ t->tunnel_sw_if_index);
+ hash_set (xcm->tunnel_index_by_l2_sw_if_index,
+ t->l2_sw_if_index, t - xcm->tunnels);
+ return 0;
+ }
+ else
+ {
+ p = hash_get (xcm->tunnel_index_by_l2_sw_if_index, l2_sw_if_index);
+ if (p == 0)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ t = pool_elt_at_index (xcm->tunnels, p[0]);
+
+ a = vec_elt_at_index (xcm->adj_by_sw_if_index, t->l2_sw_if_index);
+ /* Reset adj to drop traffic */
+ memset (a, 0, sizeof (*a));
+
+ set_int_l2_mode (vm, vnm, MODE_L3, t->l2_sw_if_index, 0, 0, 0, 0);
+
+ vnet_sw_interface_set_flags (vnm, t->tunnel_sw_if_index, 0 /* down */);
+
+ hash_unset (xcm->tunnel_index_by_l2_sw_if_index, l2_sw_if_index);
+ pool_put (xcm->tunnels, t);
+ }
+ return 0;
+}
+
+
+static clib_error_t *
+set_l2_xcrw_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ int is_add = 1;
+ int is_ipv6 = 0; /* for fib id -> fib index mapping */
+ u32 tx_fib_id = ~0;
+ u32 tx_fib_index = ~0;
+ u32 next_node_index = ~0;
+ u32 l2_sw_if_index;
+ u8 * rw = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ int rv;
+
+
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ if (! unformat (line_input, "%U",
+ unformat_vnet_sw_interface, vnm, &l2_sw_if_index))
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, line_input);
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "next %U",
+ unformat_vlib_node, vm, &next_node_index))
+ ;
+ else if (unformat (line_input, "tx-fib-id %d", &tx_fib_id))
+ ;
+ else if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "ipv6"))
+ is_ipv6 = 1;
+      else if (unformat (line_input, "rw %U",
+                         unformat_hex_string, &rw))
+        ;
+ else
+ break;
+ }
+
+ if (next_node_index == ~0)
+ return clib_error_return (0, "next node not specified");
+
+ if (tx_fib_id != ~0)
+ {
+ uword * p;
+
+ if (is_ipv6)
+ p = hash_get (ip6_main.fib_index_by_table_id, tx_fib_id);
+ else
+ p = hash_get (ip4_main.fib_index_by_table_id, tx_fib_id);
+
+ if (p == 0)
+ return clib_error_return (0, "nonexistent tx_fib_id %d",
+ tx_fib_id);
+
+ tx_fib_index = p[0];
+ }
+
+ rv = vnet_configure_l2_xcrw (vm, vnm, l2_sw_if_index, tx_fib_index,
+ rw, next_node_index, is_add);
+
+ switch (rv)
+ {
+
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_SW_IF_INDEX:
+ return clib_error_return (0, "%U not cross-connected",
+ format_vnet_sw_if_index_name,
+ vnm, l2_sw_if_index);
+ default:
+ return clib_error_return (0, "vnet_configure_l2_xcrw returned %d",
+ rv);
+ }
+
+ vec_free (rw);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_l2_xcrw_command, static) = {
+ .path = "set interface l2 xcrw",
+ .short_help =
+ "set int l2 xcrw <interface> next <node-name>\n"
+ " [del] [tx-fib-id <id>] [ipv6] rw <hex-bytes>",
+ .function = set_l2_xcrw_command_fn,
+};
+
+static u8 * format_l2xcrw (u8 * s, va_list * args)
+{
+ vnet_main_t * vnm = va_arg (*args, vnet_main_t *);
+ l2_xcrw_tunnel_t * t = va_arg (*args, l2_xcrw_tunnel_t *);
+ l2_xcrw_main_t * xcm = &l2_xcrw_main;
+ vlib_main_t * vm = vlib_get_main ();
+ l2_xcrw_adjacency_t * a;
+ u8 * rewrite_string;
+
+ if (t == 0)
+ {
+ s = format (s, "%-25s%s", "L2 interface", "Tunnel Details");
+ return s;
+ }
+
+ s = format (s, "%-25U %U ",
+ format_vnet_sw_if_index_name, vnm, t->l2_sw_if_index,
+ format_vnet_sw_if_index_name, vnm, t->tunnel_sw_if_index);
+
+ a = vec_elt_at_index (xcm->adj_by_sw_if_index, t->l2_sw_if_index);
+
+ s = format (s, "next %U ",
+ format_vlib_next_node_name, vm, l2_xcrw_node.index,
+ a->rewrite_header.next_index);
+
+ if (a->rewrite_header.sw_if_index != ~0)
+ s = format (s, "tx fib index %d ", a->rewrite_header.sw_if_index);
+
+ if (a->rewrite_header.data_bytes)
+ {
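+      /* rewrite data is stored right-justified at the end of the adjacency */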
+ rewrite_string = (u8 *)(a + 1);
+ rewrite_string -= a->rewrite_header.data_bytes;
+ s = format (s, "rewrite data: %U ",
+ format_hex_bytes, rewrite_string,
+ a->rewrite_header.data_bytes);
+ }
+
+ s = format (s, "\n");
+
+ return s;
+}
+
+
+static clib_error_t *
+show_l2xcrw_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ l2_xcrw_main_t * xcm = &l2_xcrw_main;
+ l2_xcrw_tunnel_t * t;
+
+ if (pool_elts (xcm->tunnels) == 0)
+ {
+ vlib_cli_output (vm, "No L2 / L3 rewrite cross-connects configured");
+ return 0;
+ }
+
+ vlib_cli_output (vm, "%U", format_l2xcrw, 0, 0);
+
+ pool_foreach (t, xcm->tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_l2xcrw, vnm, t);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_l2xcrw_command, static) = {
+ .path = "show l2xcrw",
+ .short_help = "Display L2/L3 rewrite cross-connects",
+ .function = show_l2xcrw_command_fn,
+};
diff --git a/vnet/vnet/l2/l2_xcrw.h b/vnet/vnet/l2/l2_xcrw.h
new file mode 100644
index 00000000000..d32d1e8df5c
--- /dev/null
+++ b/vnet/vnet/l2/l2_xcrw.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_l2_xcrw_h__
+#define __included_l2_xcrw_h__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/l2_output.h>
+#include <vnet/api_errno.h>
+#include <vnet/ethernet/ethernet.h>
+
+typedef struct {
+ /*
+ * Let: rewrite_header.sw_if_index = tx_fib_index or ~0.
+ * rewrite_header.next_index = L2_XCRW_NEXT_XXX
+ */
+ vnet_declare_rewrite (VLIB_BUFFER_PRE_DATA_SIZE);
+} l2_xcrw_adjacency_t;
+
+typedef struct {
+ /* L2 interface */
+ u32 l2_sw_if_index;
+
+ /* Tunnel interface */
+ u32 tunnel_sw_if_index; /* This field remains set in freed pool elts */
+
+} l2_xcrw_tunnel_t;
+
+typedef struct {
+ u32 cached_next_index;
+
+ /* Vector of cross-connect rewrites */
+ l2_xcrw_adjacency_t * adj_by_sw_if_index;
+
+ /* Pool of xcrw tunnels */
+ l2_xcrw_tunnel_t * tunnels;
+
+ /* Tunnel index by tunnel sw_if_index */
+ uword * tunnel_index_by_l2_sw_if_index;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} l2_xcrw_main_t;
+
+typedef enum {
+ L2_XCRW_NEXT_DROP,
+ L2_XCRW_N_NEXT,
+} l2_xcrw_next_t;
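+
+/*
+ * Next index 0 is the drop disposition; l2-xcrw only applies the rewrite
+ * when an adjacency's next_index is non-zero (see PREDICT_TRUE(next0 > 0)
+ * in l2_xcrw.c).
+ */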
+
+#define foreach_l2_xcrw_error \
+_(DROP, "Packets dropped") \
+_(FWD, "Packets forwarded")
+
+typedef enum {
+#define _(sym,str) L2_XCRW_ERROR_##sym,
+ foreach_l2_xcrw_error
+#undef _
+ L2_XCRW_N_ERROR,
+} l2_xcrw_error_t;
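+
+/*
+ * The _() x-macro above expands foreach_l2_xcrw_error into
+ * L2_XCRW_ERROR_DROP and L2_XCRW_ERROR_FWD; l2_xcrw.c reuses the same
+ * list to build l2_xcrw_error_strings[].
+ */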
+
+#endif /* __included_l2_xcrw_h__ */
diff --git a/vnet/vnet/l2tp/decap.c b/vnet/vnet/l2tp/decap.c
new file mode 100644
index 00000000000..1a2bc4890d0
--- /dev/null
+++ b/vnet/vnet/l2tp/decap.c
@@ -0,0 +1,253 @@
+/*
+ * decap.c : L2TPv3 tunnel decapsulation
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/l2tp/l2tp.h>
+
+/* Statistics (not really errors) */
+#define foreach_l2t_decap_error \
+_(USER_TO_NETWORK, "L2TP user (ip6) to L2 network pkts") \
+_(SESSION_ID_MISMATCH, "l2tpv3 local session id mismatches") \
+_(COOKIE_MISMATCH, "l2tpv3 local cookie mismatches")
+
+static char * l2t_decap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2t_decap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) L2T_DECAP_ERROR_##sym,
+ foreach_l2t_decap_error
+#undef _
+ L2T_DECAP_N_ERROR,
+} l2t_decap_error_t;
+
+typedef enum {
+ L2T_DECAP_NEXT_DROP,
+ L2T_DECAP_NEXT_L2_INPUT,
+ L2T_DECAP_N_NEXT,
+ /* Pseudo next index */
+ L2T_DECAP_NEXT_NO_INTERCEPT = L2T_DECAP_N_NEXT,
+} l2t_decap_next_t;
+
+#define NSTAGES 3
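+
+/*
+ * This node uses the generic software pipeline in <vnet/pipeline.h>:
+ * with NSTAGES 3, stage0 issues prefetches, stage1 performs the session
+ * lookup, and last_stage chooses the next node, so each buffer's data is
+ * prefetched two packets ahead of when it is actually read.
+ */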
+
+static inline void stage0 (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ u32 buffer_index)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+ vlib_prefetch_buffer_header (b, STORE);
+ /* l2tpv3 header is a long way away, need 2 cache lines */
+ CLIB_PREFETCH (b->data, 2*CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+static inline void stage1 (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ u32 bi)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ l2t_main_t *lm = &l2t_main;
+ ip6_header_t * ip6 = vlib_buffer_get_current (b);
+ u32 session_index;
+ uword *p = 0;
+ l2tpv3_header_t * l2t;
+
+  /* Not L2TPv3 (protocol 0x73, decimal 115)? Use the normal path. */
+ if (PREDICT_FALSE(ip6->protocol != IP_PROTOCOL_L2TP)) {
+ vnet_buffer(b)->l2t.next_index = L2T_DECAP_NEXT_NO_INTERCEPT;
+ return;
+ }
+
+ /* Make up your minds, people... */
+ switch (lm->lookup_type) {
+ case L2T_LOOKUP_SRC_ADDRESS:
+ p = hash_get_mem (lm->session_by_src_address, &ip6->src_address);
+ break;
+ case L2T_LOOKUP_DST_ADDRESS:
+ p = hash_get_mem (lm->session_by_dst_address, &ip6->dst_address);
+ break;
+ case L2T_LOOKUP_SESSION_ID:
+ l2t = (l2tpv3_header_t*)(ip6+1);
+ p = hash_get (lm->session_by_session_id, l2t->session_id);
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (PREDICT_FALSE(p == 0)) {
+ vnet_buffer(b)->l2t.next_index = L2T_DECAP_NEXT_NO_INTERCEPT;
+ return;
+ } else {
+ session_index = p[0];
+ }
+
+ /* Remember mapping index, prefetch the mini counter */
+ vnet_buffer(b)->l2t.next_index = L2T_DECAP_NEXT_L2_INPUT;
+ vnet_buffer(b)->l2t.session_index = session_index;
+
+ /* $$$$$ prefetch counter */
+}
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ l2t_main_t *lm = &l2t_main;
+ ip6_header_t * ip6 = vlib_buffer_get_current (b);
+ vlib_node_t *n = vlib_get_node (vm, l2t_decap_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ l2tpv3_header_t * l2tp;
+ u32 counter_index;
+ l2t_session_t * session;
+ u32 session_index;
+ u32 next_index;
+
+ /* Other-than-output pkt? We're done... */
+ if (vnet_buffer(b)->l2t.next_index != L2T_DECAP_NEXT_L2_INPUT) {
+ next_index = vnet_buffer(b)->l2t.next_index;
+ goto done;
+ }
+
+ em->counters[node_counter_base_index + L2T_DECAP_ERROR_USER_TO_NETWORK] += 1;
+
+ session_index = vnet_buffer(b)->l2t.session_index;
+
+ counter_index =
+ session_index_to_counter_index (session_index,
+ SESSION_COUNTER_USER_TO_NETWORK);
+
+ /* per-mapping byte stats include the ethernet header */
+ vlib_increment_combined_counter (&lm->counter_main,
+ os_get_cpu_number(),
+ counter_index,
+ 1 /* packet_increment */,
+ vlib_buffer_length_in_chain (vm, b) +
+ sizeof (ethernet_header_t));
+
+ session = pool_elt_at_index (lm->sessions, session_index);
+
+ l2tp = vlib_buffer_get_current (b) + sizeof (*ip6);
+
+ if (PREDICT_FALSE(l2tp->session_id != session->local_session_id)) {
+    // The lookup key matched but the session id did not; assume the packet is not for us.
+ em->counters[node_counter_base_index + L2T_DECAP_ERROR_SESSION_ID_MISMATCH] += 1;
+ next_index = L2T_DECAP_NEXT_NO_INTERCEPT;
+ goto done;
+ }
+
+ if (PREDICT_FALSE (l2tp->cookie != session->local_cookie[0])) {
+ if (l2tp->cookie != session->local_cookie[1]) {
+ // Key and session ID matched, but cookie doesn't. Drop this packet.
+ b->error = node->errors[L2T_DECAP_ERROR_COOKIE_MISMATCH];
+ next_index = L2T_DECAP_NEXT_DROP;
+ goto done;
+ }
+ }
+
+ vnet_buffer(b)->sw_if_index[VLIB_RX] = session->sw_if_index;
+
+ /* strip the ip6 and L2TP header */
+ vlib_buffer_advance (b, sizeof (*ip6) + session->l2tp_hdr_size);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b);
+
+ if (PREDICT_FALSE(b->flags & VLIB_BUFFER_IS_TRACED)) {
+ l2t_trace_t *t = vlib_add_trace (vm, node, b, sizeof (*t));
+ t->is_user_to_network = 1;
+ t->our_address.as_u64[0] =
+ ip6->dst_address.as_u64[0];
+ t->our_address.as_u64[1] =
+ ip6->dst_address.as_u64[1];
+ t->client_address.as_u64[0] =
+ ip6->src_address.as_u64[0];
+ t->client_address.as_u64[1] =
+ ip6->src_address.as_u64[1];
+ t->session_index = session_index;
+ }
+
+ return L2T_DECAP_NEXT_L2_INPUT;
+
+ done:
+ if (next_index == L2T_DECAP_NEXT_NO_INTERCEPT) {
+ // Go to next node on the ip6 configuration chain
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_config_main_t * cm = &lm->rx_config_mains[VNET_UNICAST];
+ ip6_l2tpv3_config_t * c0;
+
+ vnet_get_config_data (&cm->config_main,
+ &vnet_buffer (b)->ip.current_config_index,
+ &next_index,
+ sizeof (c0[0]));
+ }
+
+ if (PREDICT_FALSE(b->flags & VLIB_BUFFER_IS_TRACED)) {
+ l2t_trace_t *t = vlib_add_trace (vm, node, b, sizeof (*t));
+ t->is_user_to_network = 1;
+ t->our_address.as_u64[0] =
+ ip6->dst_address.as_u64[0];
+ t->our_address.as_u64[1] =
+ ip6->dst_address.as_u64[1];
+ t->client_address.as_u64[0] =
+ ip6->src_address.as_u64[0];
+ t->client_address.as_u64[1] =
+ ip6->src_address.as_u64[1];
+ t->session_index = ~0;
+ }
+ return next_index;
+}
+
+#include <vnet/pipeline.h>
+
+static uword l2t_decap_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+VLIB_REGISTER_NODE (l2t_decap_node) = {
+ .function = l2t_decap_node_fn,
+ .name = "l2tp-decap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2t_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(l2t_decap_error_strings),
+ .error_strings = l2t_decap_error_strings,
+
+ .n_next_nodes = L2T_DECAP_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [L2T_DECAP_NEXT_L2_INPUT] = "l2-input",
+ [L2T_DECAP_NEXT_DROP] = "error-drop",
+ },
+};
+
+void l2tp_decap_init (void)
+{
+ ip6_register_protocol (IP_PROTOCOL_L2TP, l2t_decap_node.index);
+}
diff --git a/vnet/vnet/l2tp/encap.c b/vnet/vnet/l2tp/encap.c
new file mode 100644
index 00000000000..8f26ab007dc
--- /dev/null
+++ b/vnet/vnet/l2tp/encap.c
@@ -0,0 +1,217 @@
+/*
+ * encap.c : L2TPv3 tunnel encapsulation
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/l2tp/l2tp.h>
+
+/* Statistics (not really errors) */
+#define foreach_l2t_encap_error \
+_(NETWORK_TO_USER, "L2TP L2 network to user (ip6) pkts") \
+_(LOOKUP_FAIL_TO_L3, "L2TP L2 session lookup failed pkts")
+
+static char * l2t_encap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_l2t_encap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) L2T_ENCAP_ERROR_##sym,
+ foreach_l2t_encap_error
+#undef _
+ L2T_ENCAP_N_ERROR,
+} l2t_encap_error_t;
+
+
+typedef enum {
+ L2T_ENCAP_NEXT_DROP,
+ L2T_ENCAP_NEXT_IP6_LOOKUP,
+ L2T_ENCAP_N_NEXT,
+} l2t_encap_next_t;
+
+typedef struct {
+ u32 cached_session_index;
+ u32 cached_sw_if_index;
+ vnet_main_t * vnet_main;
+} l2tp_encap_runtime_t;
+
+vlib_node_registration_t l2t_encap_node;
+
+#define NSTAGES 3
+
+static inline void stage0 (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ u32 buffer_index)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+ vlib_prefetch_buffer_header (b, STORE);
+ CLIB_PREFETCH (b->data, 2*CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+static inline void stage1 (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ u32 bi)
+{
+ l2tp_encap_runtime_t * rt = (void *) node->runtime_data;
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ vnet_hw_interface_t * hi;
+
+ u32 sw_if_index = vnet_buffer(b)->sw_if_index[VLIB_TX];
+ u32 session_index = rt->cached_session_index;
+
+ if (PREDICT_FALSE(rt->cached_sw_if_index != sw_if_index))
+ {
+ hi = vnet_get_sup_hw_interface (rt->vnet_main, sw_if_index);
+ session_index = rt->cached_session_index = hi->dev_instance;
+ rt->cached_sw_if_index = sw_if_index;
+ }
+
+ /* Remember mapping index, prefetch the mini counter */
+ vnet_buffer(b)->l2t.next_index = L2T_ENCAP_NEXT_IP6_LOOKUP;
+ vnet_buffer(b)->l2t.session_index = session_index;
+
+ /* $$$$ prefetch counter... */
+}
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+ l2t_main_t *lm = &l2t_main;
+ vlib_node_t *n = vlib_get_node (vm, l2t_encap_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ l2tpv3_header_t * l2tp;
+ u32 session_index;
+ u32 counter_index;
+ l2t_session_t *s;
+ ip6_header_t *ip6;
+ u16 payload_length;
+
+ /* Other-than-output pkt? We're done... */
+ if (vnet_buffer(b)->l2t.next_index != L2T_ENCAP_NEXT_IP6_LOOKUP)
+ return vnet_buffer(b)->l2t.next_index;
+
+ /* clear so it is not interpreted as fib_index */
+ vnet_buffer(b)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ em->counters[node_counter_base_index + L2T_ENCAP_ERROR_NETWORK_TO_USER] += 1;
+
+ session_index = vnet_buffer(b)->l2t.session_index;
+
+ counter_index =
+ session_index_to_counter_index (session_index,
+ SESSION_COUNTER_NETWORK_TO_USER);
+
+ /* per-mapping byte stats include the ethernet header */
+ vlib_increment_combined_counter (&lm->counter_main,
+ os_get_cpu_number(),
+ counter_index,
+ 1 /* packet_increment */,
+ vlib_buffer_length_in_chain (vm, b));
+
+ s = pool_elt_at_index (lm->sessions, session_index);
+
+ /* Paint on an l2tpv3 hdr */
+ vlib_buffer_advance (b, -(s->l2tp_hdr_size));
+ l2tp = vlib_buffer_get_current (b);
+
+ l2tp->session_id = s->remote_session_id;
+ l2tp->cookie = s->remote_cookie;
+ if (PREDICT_FALSE (s->l2_sublayer_present)) {
+ l2tp->l2_specific_sublayer = 0;
+ }
+
+ /* Paint on an ip6 header */
+ vlib_buffer_advance (b, -(sizeof (*ip6)));
+ ip6 = vlib_buffer_get_current (b);
+
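+  /* ip version 6, traffic class 0, flow label 0 */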
+ ip6->ip_version_traffic_class_and_flow_label =
+ clib_host_to_net_u32 (0x6<<28);
+
+ /* calculate ip6 payload length */
+ payload_length = vlib_buffer_length_in_chain (vm, b);
+ payload_length -= sizeof (*ip6);
+
+ ip6->payload_length = clib_host_to_net_u16 (payload_length);
+ ip6->protocol = IP_PROTOCOL_L2TP;
+ ip6->hop_limit = 0xff;
+ ip6->src_address.as_u64[0] = s->our_address.as_u64[0];
+ ip6->src_address.as_u64[1] = s->our_address.as_u64[1];
+ ip6->dst_address.as_u64[0] = s->client_address.as_u64[0];
+ ip6->dst_address.as_u64[1] = s->client_address.as_u64[1];
+
+ if (PREDICT_FALSE(b->flags & VLIB_BUFFER_IS_TRACED)) {
+ l2t_trace_t *t = vlib_add_trace (vm, node, b, sizeof (*t));
+ t->is_user_to_network = 0;
+ t->our_address.as_u64[0] =
+ ip6->src_address.as_u64[0];
+ t->our_address.as_u64[1] =
+ ip6->src_address.as_u64[1];
+ t->client_address.as_u64[0] =
+ ip6->dst_address.as_u64[0];
+ t->client_address.as_u64[1] =
+ ip6->dst_address.as_u64[1];
+ t->session_index = session_index;
+ }
+
+ return L2T_ENCAP_NEXT_IP6_LOOKUP;
+}
+
+#include <vnet/pipeline.h>
+
+uword l2t_encap_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (l2t_encap_node) = {
+ .function = l2t_encap_node_fn,
+ .name = "l2tp-encap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_l2t_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .runtime_data_bytes = sizeof (l2tp_encap_runtime_t),
+
+ .n_errors = ARRAY_LEN(l2t_encap_error_strings),
+ .error_strings = l2t_encap_error_strings,
+
+ .n_next_nodes = L2T_ENCAP_N_NEXT,
+
+ // add dispositions here
+ .next_nodes = {
+ [L2T_ENCAP_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [L2T_ENCAP_NEXT_DROP] = "error-drop",
+ },
+};
+
+void l2tp_encap_init (vlib_main_t * vm)
+{
+ l2tp_encap_runtime_t * rt;
+
+ rt = vlib_node_get_runtime_data (vm, l2t_encap_node.index);
+ rt->vnet_main = vnet_get_main();
+ rt->cached_sw_if_index = (u32) ~0;
+ rt->cached_session_index = (u32) ~0;
+}
diff --git a/vnet/vnet/l2tp/l2tp.c b/vnet/vnet/l2tp/l2tp.c
new file mode 100644
index 00000000000..da28487dd7f
--- /dev/null
+++ b/vnet/vnet/l2tp/l2tp.c
@@ -0,0 +1,696 @@
+/*
+ * l2tp.c : L2TPv3 tunnel support
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/l2tp/l2tp.h>
+
+l2t_main_t l2t_main;
+
+/* packet trace format function */
+u8 * format_l2t_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ l2t_trace_t * t = va_arg (*args, l2t_trace_t *);
+
+ if (t->is_user_to_network)
+ s = format (s, "L2T: %U (client) -> %U (our) session %d",
+ format_ip6_address, &t->client_address,
+ format_ip6_address, &t->our_address,
+ t->session_index);
+ else
+ s = format (s, "L2T: %U (our) -> %U (client) session %d)",
+ format_ip6_address, &t->our_address,
+ format_ip6_address, &t->client_address,
+ t->session_index);
+ return s;
+}
+
+u8 * format_l2t_session (u8 * s, va_list * args)
+{
+ l2t_session_t * session = va_arg (*args, l2t_session_t *);
+ l2t_main_t * lm = &l2t_main;
+ u32 counter_index;
+ vlib_counter_t v;
+
+ s = format (s, "[%d] %U (our) %U (client) %U (sw_if_index %d)\n",
+ session - lm->sessions,
+ format_ip6_address, &session->our_address,
+ format_ip6_address, &session->client_address,
+ format_vnet_sw_interface_name, lm->vnet_main,
+ vnet_get_sw_interface (lm->vnet_main, session->sw_if_index),
+ session->sw_if_index);
+
+ s = format (s, " local cookies %016llx %016llx remote cookie %016llx\n",
+ clib_net_to_host_u64 (session->local_cookie[0]),
+ clib_net_to_host_u64 (session->local_cookie[1]),
+ clib_net_to_host_u64 (session->remote_cookie));
+
+ s = format (s, " local session-id %d remote session-id %d\n",
+ clib_net_to_host_u32 (session->local_session_id),
+ clib_net_to_host_u32 (session->remote_session_id));
+
+ s = format (s, " l2 specific sublayer %s\n",
+              session->l2_sublayer_present ? "present" : "absent");
+
+ counter_index =
+ session_index_to_counter_index (session - lm->sessions,
+ SESSION_COUNTER_USER_TO_NETWORK);
+
+ vlib_get_combined_counter (&lm->counter_main, counter_index, &v);
+ if (v.packets != 0)
+ s = format (s, " user-to-net: %llu pkts %llu bytes\n",
+ v.packets, v.bytes);
+
+ vlib_get_combined_counter (&lm->counter_main, counter_index+1, &v);
+
+ if (v.packets != 0)
+ s = format (s, " net-to-user: %llu pkts %llu bytes\n",
+ v.packets, v.bytes);
+ return s;
+}
+
+static clib_error_t *
+show_l2tp_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2t_session_t *session;
+ l2t_main_t *lm = &l2t_main;
+ char * keystr = 0;
+ int verbose = 0;
+
+ if (unformat (input, "verbose") || unformat (input, "v"))
+ verbose = 1;
+
+ if (pool_elts (lm->sessions) == 0)
+ vlib_cli_output (vm, "No l2tp sessions...");
+ else
+ vlib_cli_output (vm, "%u l2tp sessions...", pool_elts (lm->sessions));
+
+ if (verbose)
+ {
+ switch (lm->lookup_type)
+ {
+ case L2T_LOOKUP_SRC_ADDRESS:
+ keystr = "src address";
+ break;
+
+ case L2T_LOOKUP_DST_ADDRESS:
+ keystr = "dst address";
+ break;
+
+ case L2T_LOOKUP_SESSION_ID:
+ keystr = "session id";
+ break;
+
+ default:
+ keystr = "BOGUS!";
+ break;
+ }
+
+ vlib_cli_output (vm, "L2tp session lookup on %s", keystr);
+
+ pool_foreach (session, lm->sessions,
+ ({
+ vlib_cli_output (vm, "%U", format_l2t_session, session);
+ }));
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_session_detail_command, static) = {
+ .path = "show l2tpv3",
+ .short_help = "show l2tpv3 [verbose]",
+ .function = show_l2tp_command_fn,
+};
+
+static clib_error_t *
+test_counters_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2t_session_t *session;
+ l2t_main_t *lm = &l2t_main;
+ u32 session_index;
+ u32 counter_index;
+ u32 nincr=0;
+ u32 cpu_index = os_get_cpu_number();
+
+ pool_foreach (session, lm->sessions,
+ ({
+ session_index = session - lm->sessions;
+ counter_index =
+ session_index_to_counter_index (session_index,
+ SESSION_COUNTER_USER_TO_NETWORK);
+ vlib_increment_combined_counter (&lm->counter_main,
+ cpu_index,
+ counter_index,
+ 1/*pkt*/, 1111 /*bytes*/);
+ vlib_increment_combined_counter (&lm->counter_main,
+ cpu_index,
+ counter_index+1,
+ 1/*pkt*/, 2222 /*bytes*/);
+ nincr++;
+
+ }));
+ vlib_cli_output (vm, "Incremented %d active counters\n", nincr);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_counters_command, static) = {
+ .path = "test counters",
+ .short_help = "increment all active counters",
+ .function = test_counters_command_fn,
+};
+
+static clib_error_t *
+clear_counters_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2t_session_t *session;
+ l2t_main_t *lm = &l2t_main;
+ u32 session_index;
+ u32 counter_index;
+ u32 nincr=0;
+
+ pool_foreach (session, lm->sessions,
+ ({
+ session_index = session - lm->sessions;
+ counter_index =
+ session_index_to_counter_index (session_index,
+ SESSION_COUNTER_USER_TO_NETWORK);
+ vlib_zero_combined_counter (&lm->counter_main, counter_index);
+ vlib_zero_combined_counter (&lm->counter_main, counter_index+1);
+ nincr++;
+
+ }));
+ vlib_cli_output (vm, "Cleared %d active counters\n", nincr);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (clear_counters_command, static) = {
+ .path = "clear counters",
+ .short_help = "clear all active counters",
+ .function = clear_counters_command_fn,
+};
+
+static u8 * format_l2tpv3_name (u8 * s, va_list * args)
+{
+ l2t_main_t *lm = &l2t_main;
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+
+ if (i < vec_len (lm->dev_inst_by_real))
+ show_dev_instance = lm->dev_inst_by_real[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ return format (s, "l2tpv3_tunnel%d", i);
+}
+
+static int l2tpv3_name_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ l2t_main_t *lm = &l2t_main;
+
+ vec_validate_init_empty (lm->dev_inst_by_real, hi->dev_instance, ~0);
+
+ lm->dev_inst_by_real [hi->dev_instance] = new_dev_instance;
+
+ return 0;
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (l2tpv3_device_class,static) = {
+ .name = "L2TPv3",
+ .format_device_name = format_l2tpv3_name,
+ .name_renumber = l2tpv3_name_renumber,
+ .tx_function = dummy_interface_tx,
+};
+
+static uword dummy_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ /*
+ * Conundrum: packets from tun/tap destined for the tunnel
+ * actually have this rewrite applied. Transit packets do not.
+ * To make the two cases equivalent, don't generate a
+ * rewrite here, build the entire header in the fast path.
+ */
+ return 0;
+}
+
+static u8 * format_l2tp_header_with_length (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ s = format (s, "unimplemented dev %u", dev_instance);
+ return s;
+}
+
+VNET_HW_INTERFACE_CLASS (l2tpv3_hw_class) = {
+ .name = "L2TPV3",
+ .format_header = format_l2tp_header_with_length,
+ .set_rewrite = dummy_set_rewrite,
+};
+
+int create_l2tpv3_ipv6_tunnel (l2t_main_t * lm,
+ ip6_address_t * client_address,
+ ip6_address_t * our_address,
+ u32 local_session_id,
+ u32 remote_session_id,
+ u64 local_cookie,
+ u64 remote_cookie,
+ int l2_sublayer_present,
+ u32 * sw_if_index)
+{
+ l2t_session_t *s = 0;
+ vnet_main_t * vnm = lm->vnet_main;
+ vnet_hw_interface_t * hi;
+ uword * p = (uword *) ~0;
+ u32 hw_if_index;
+ l2tpv3_header_t l2tp_hdr;
+ ip6_address_t * dst_address_copy, * src_address_copy;
+ u32 counter_index;
+
+ remote_session_id = clib_host_to_net_u32 (remote_session_id);
+ local_session_id = clib_host_to_net_u32 (local_session_id);
+
+ switch (lm->lookup_type) {
+ case L2T_LOOKUP_SRC_ADDRESS:
+ p = hash_get_mem (lm->session_by_src_address, client_address);
+ break;
+
+ case L2T_LOOKUP_DST_ADDRESS:
+ p = hash_get_mem (lm->session_by_dst_address, our_address);
+ break;
+
+ case L2T_LOOKUP_SESSION_ID:
+ p = hash_get (lm->session_by_session_id, local_session_id);
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ /* adding a session: session must not already exist */
+ if (p)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ pool_get (lm->sessions, s);
+ memset (s, 0, sizeof (*s));
+ memcpy (&s->our_address, our_address, sizeof (s->our_address));
+ memcpy (&s->client_address, client_address, sizeof (s->client_address));
+ s->local_cookie[0] = clib_host_to_net_u64 (local_cookie);
+ s->remote_cookie = clib_host_to_net_u64 (remote_cookie);
+ s->local_session_id = local_session_id;
+ s->remote_session_id = remote_session_id;
+ s->l2_sublayer_present = l2_sublayer_present;
+ /* precompute l2tp header size */
+ s->l2tp_hdr_size = l2_sublayer_present ?
+ sizeof (l2tpv3_header_t) :
+ sizeof (l2tpv3_header_t) - sizeof(l2tp_hdr.l2_specific_sublayer);
+
+ /* Setup hash table entries */
+ switch (lm->lookup_type) {
+ case L2T_LOOKUP_SRC_ADDRESS:
+ src_address_copy = clib_mem_alloc (sizeof (*src_address_copy));
+ memcpy (src_address_copy, client_address, sizeof (*src_address_copy));
+ hash_set_mem (lm->session_by_src_address, src_address_copy,
+ s - lm->sessions);
+ break;
+ case L2T_LOOKUP_DST_ADDRESS:
+ dst_address_copy = clib_mem_alloc (sizeof (*dst_address_copy));
+ memcpy (dst_address_copy, our_address, sizeof (*dst_address_copy));
+ hash_set_mem (lm->session_by_dst_address, dst_address_copy,
+ s - lm->sessions);
+ break;
+ case L2T_LOOKUP_SESSION_ID:
+ hash_set (lm->session_by_session_id, local_session_id,
+ s - lm->sessions);
+ break;
+
+ default:
+ ASSERT(0);
+ }
+
+ /* validate counters */
+ counter_index =
+ session_index_to_counter_index (s - lm->sessions,
+ SESSION_COUNTER_USER_TO_NETWORK);
+ vlib_validate_combined_counter (&lm->counter_main, counter_index);
+ vlib_validate_combined_counter (&lm->counter_main, counter_index+1);
+
+ if (vec_len (lm->free_l2tpv3_tunnel_hw_if_indices) > 0)
+ {
+ hw_if_index = lm->free_l2tpv3_tunnel_hw_if_indices
+ [vec_len (lm->free_l2tpv3_tunnel_hw_if_indices)-1];
+ _vec_len (lm->free_l2tpv3_tunnel_hw_if_indices) -= 1;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = s - lm->sessions;
+ hi->hw_instance = hi->dev_instance;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, l2tpv3_device_class.index, s - lm->sessions,
+ l2tpv3_hw_class.index, s - lm->sessions);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->output_node_index = l2t_encap_node.index;
+ /* $$$$ initialize custom dispositions, if needed */
+ }
+
+ s->hw_if_index = hw_if_index;
+ s->sw_if_index = hi->sw_if_index;
+
+ if (sw_if_index)
+ *sw_if_index = hi->sw_if_index;
+
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+ return 0;
+}
+
+static clib_error_t *
+create_l2tpv3_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_address_t client_address, our_address;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ l2t_main_t *lm = &l2t_main;
+ u64 local_cookie = (u64)~0, remote_cookie = (u64)~0;
+ u32 local_session_id = 1, remote_session_id = 1;
+ int our_address_set = 0, client_address_set = 0;
+ int l2_sublayer_present = 0;
+ int rv;
+ u32 sw_if_index;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "client %U",
+ unformat_ip6_address, &client_address))
+ client_address_set = 1;
+ else if (unformat (line_input, "our %U",
+ unformat_ip6_address, &our_address))
+ our_address_set = 1;
+ else if (unformat (line_input, "local-cookie %llx", &local_cookie))
+ ;
+ else if (unformat (line_input, "remote-cookie %llx", &remote_cookie))
+ ;
+ else if (unformat (line_input, "local-session-id %d",
+ &local_session_id))
+ ;
+ else if (unformat (line_input, "remote-session-id %d",
+ &remote_session_id))
+ ;
+ else if (unformat (line_input, "l2-sublayer-present"))
+ l2_sublayer_present = 1;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (our_address_set == 0)
+ return clib_error_return (0, "our address not specified");
+ if (client_address_set == 0)
+ return clib_error_return (0, "client address not specified");
+
+ rv = create_l2tpv3_ipv6_tunnel (lm, &client_address, &our_address,
+ local_session_id, remote_session_id,
+ local_cookie, remote_cookie,
+ l2_sublayer_present,
+ &sw_if_index);
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_VALUE:
+ return clib_error_return (0, "session already exists...");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "session does not exist...");
+
+ default:
+ return clib_error_return (0, "l2tp_session_add_del returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_l2tpv3_tunnel_command, static) = {
+ .path = "create l2tpv3 tunnel",
+ .short_help =
+    "create l2tpv3 tunnel client <ip6> our <ip6> local-cookie <hex> remote-cookie <hex> local-session-id <dec> remote-session-id <dec> [l2-sublayer-present]",
+ .function = create_l2tpv3_tunnel_command_fn,
+};
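+
+/*
+ * Example (addresses and cookies are illustrative only):
+ *
+ *   create l2tpv3 tunnel client 2001:db8::1 our 2001:db8::2
+ *       local-cookie 1234 remote-cookie 5678
+ *       local-session-id 1 remote-session-id 1
+ */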
+
+int l2tpv3_set_tunnel_cookies (l2t_main_t * lm,
+ u32 sw_if_index,
+ u64 new_local_cookie,
+ u64 new_remote_cookie)
+{
+ l2t_session_t *s;
+ vnet_hw_interface_t * hi;
+ vnet_main_t * vnm = vnet_get_main();
+ hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ if (pool_is_free_index (lm->sessions, hi->dev_instance))
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ s = pool_elt_at_index (lm->sessions, hi->dev_instance);
+
+ s->local_cookie[1] = s->local_cookie[0];
+ s->local_cookie[0] = clib_host_to_net_u64(new_local_cookie);
+ s->remote_cookie = clib_host_to_net_u64(new_remote_cookie);
+
+ return 0;
+}
+
+
+static clib_error_t *
+set_l2tp_tunnel_cookie_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ l2t_main_t *lm = &l2t_main;
+ vnet_main_t * vnm = vnet_get_main();
+ u32 sw_if_index = ~0;
+ u64 local_cookie = (u64)~0, remote_cookie = (u64)~0;
+
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U", unformat_vnet_sw_interface, vnm,
+ &sw_if_index))
+ ;
+ else if (unformat (input, "local %llx", &local_cookie))
+ ;
+ else if (unformat (input, "remote %llx", &remote_cookie))
+ ;
+ else
+ break;
+ }
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "unknown interface");
+ if (local_cookie == ~0)
+ return clib_error_return (0, "local cookie required");
+ if (remote_cookie == ~0)
+ return clib_error_return (0, "remote cookie required");
+
+ rv = l2tpv3_set_tunnel_cookies (lm, sw_if_index,
+ local_cookie, remote_cookie);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_SW_IF_INDEX:
+ return clib_error_return (0, "invalid interface");
+
+ default:
+ return clib_error_return (0, "l2tp_session_set_cookies returned %d",
+ rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_l2tp_tunnel_cookie_command, static) = {
+ .path = "set l2tpv3 tunnel cookie",
+ .short_help =
+ "set l2tpv3 tunnel cookie <intfc> local <hex> remote <hex>",
+ .function = set_l2tp_tunnel_cookie_command_fn,
+};
+
+int l2tpv3_interface_enable_disable (vnet_main_t * vnm,
+ u32 sw_if_index,
+ int enable_disable)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_config_main_t * rx_cm = &lm->rx_config_mains[VNET_UNICAST];
+ u32 ci;
+ ip6_l2tpv3_config_t config;
+  ip6_rx_feature_type_t type;
+
+ if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index))
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ type = IP6_RX_FEATURE_L2TPV3;
+
+ ci = rx_cm->config_index_by_sw_if_index[sw_if_index];
+ ci = (enable_disable
+ ? vnet_config_add_feature
+ : vnet_config_del_feature)
+ (vlib_get_main(), &rx_cm->config_main,
+ ci,
+ type,
+ &config,
+ sizeof (config));
+ rx_cm->config_index_by_sw_if_index[sw_if_index] = ci;
+ return 0;
+}
+
+/* Enable/disable L2TPv3 intercept on the IP6 forwarding path */
+static clib_error_t *
+set_ip6_l2tpv3 (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 sw_if_index = ~0;
+ int is_add = 1;
+ int rv;
+ vnet_main_t * vnm = vnet_get_main();
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "%U", unformat_vnet_sw_interface, vnm,
+ &sw_if_index))
+ ;
+ else if (unformat (input, "del"))
+ is_add = 0;
+ else
+ break;
+ }
+
+ if (sw_if_index == ~0)
+ return clib_error_return (0, "interface required");
+
+ rv = l2tpv3_interface_enable_disable (vnm, sw_if_index, is_add);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_INVALID_SW_IF_INDEX:
+ return clib_error_return (0, "invalid interface");
+
+ default:
+ return clib_error_return (0, "l2tp_interface_enable_disable returned %d",
+ rv);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip6_l2tpv3, static) = {
+ .path = "set interface ip6 l2tpv3",
+ .function = set_ip6_l2tpv3,
+ .short_help = "set interface ip6 l2tpv3 <intfc> [del]",
+};
+
+static clib_error_t *
+l2tp_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ l2t_main_t *lm = &l2t_main;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "lookup-v6-src"))
+ lm->lookup_type = L2T_LOOKUP_SRC_ADDRESS;
+ else if (unformat (input, "lookup-v6-dst"))
+ lm->lookup_type = L2T_LOOKUP_DST_ADDRESS;
+ else if (unformat (input, "lookup-session-id"))
+ lm->lookup_type = L2T_LOOKUP_SESSION_ID;
+ else return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (l2tp_config, "l2tp");
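+
+/*
+ * Example startup config stanza (illustrative) selecting the decap
+ * lookup key:
+ *
+ *   l2tp {
+ *     lookup-v6-dst
+ *   }
+ */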
+
+clib_error_t *l2tp_init (vlib_main_t *vm)
+{
+ l2t_main_t *lm = &l2t_main;
+ ip_main_t * im = &ip_main;
+ ip_protocol_info_t * pi;
+
+ lm->vnet_main = vnet_get_main();
+ lm->vlib_main = vm;
+ lm->lookup_type = L2T_LOOKUP_DST_ADDRESS;
+
+ lm->session_by_src_address = hash_create_mem
+ (0, sizeof (ip6_address_t) /* key bytes */,
+ sizeof (u32) /* value bytes */);
+ lm->session_by_dst_address = hash_create_mem
+ (0, sizeof (ip6_address_t) /* key bytes */,
+ sizeof (u32) /* value bytes */);
+ lm->session_by_session_id = hash_create (0, sizeof (uword));
+
+ pi = ip_get_protocol_info (im, IP_PROTOCOL_L2TP);
+ pi->unformat_pg_edit = unformat_pg_l2tp_header;
+
+  /* ensure these nodes are included in the build */
+ l2tp_encap_init(vm);
+ l2tp_decap_init();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(l2tp_init);
+
diff --git a/vnet/vnet/l2tp/l2tp.h b/vnet/vnet/l2tp/l2tp.h
new file mode 100644
index 00000000000..9e7ac131a7f
--- /dev/null
+++ b/vnet/vnet/l2tp/l2tp.h
@@ -0,0 +1,131 @@
+/*
+ * l2tp.h : L2TPv3 tunnel support
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_l2tp_h__
+#define __included_l2tp_h__
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2tp/packet.h>
+
+typedef struct {
+ /* ip6 addresses */
+ ip6_address_t our_address;
+ ip6_address_t client_address;
+
+ /* $$$$ maybe add encap-path lookup fib ID? */
+
+ /* l2tpv3 header parameters */
+ u64 local_cookie[2];
+ u64 remote_cookie;
+ u32 local_session_id;
+ u32 remote_session_id;
+
+ /* tunnel interface */
+ u32 hw_if_index;
+ u32 sw_if_index;
+
+ u8 l2tp_hdr_size;
+ u8 l2_sublayer_present;
+ u8 cookie_flags; /* in host byte order */
+} l2t_session_t;
+
+typedef enum {
+ L2T_LOOKUP_SRC_ADDRESS = 0,
+ L2T_LOOKUP_DST_ADDRESS,
+ L2T_LOOKUP_SESSION_ID,
+} ip6_to_l2_lookup_t;
+
+typedef struct {
+ /* session pool */
+ l2t_session_t *sessions;
+
+ /* ip6 -> l2 hash tables. Make up your minds, people... */
+ uword *session_by_src_address;
+ uword *session_by_dst_address;
+ uword *session_by_session_id;
+
+ ip6_to_l2_lookup_t lookup_type;
+
+ /* Counters */
+ vlib_combined_counter_main_t counter_main;
+
+ /* vector of free l2tpv3 tunnel interfaces */
+ u32 * free_l2tpv3_tunnel_hw_if_indices;
+
+ /* show device instance by real device instance */
+ u32 * dev_inst_by_real;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+
+} l2t_main_t;
+
+/* Packet trace structure */
+typedef struct {
+ int is_user_to_network;
+ u32 session_index;
+ ip6_address_t our_address;
+ ip6_address_t client_address;
+} l2t_trace_t;
+
+l2t_main_t l2t_main;
+vlib_node_registration_t l2t_encap_node;
+vlib_node_registration_t l2t_decap_node;
+
+enum {
+ SESSION_COUNTER_USER_TO_NETWORK=0,
+ SESSION_COUNTER_NETWORK_TO_USER,
+};
+
+static inline u32 session_index_to_counter_index (u32 session_index,
+ u32 counter_id)
+{
+ return ((session_index << 1) + counter_id);
+}
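+
+/*
+ * Example: session 5 owns counter pair 10 (user-to-network) and
+ * 11 (network-to-user); every session gets an adjacent pair.
+ */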
+
+u8 * format_l2t_trace (u8 * s, va_list * args);
+
+typedef struct {
+ // Any per-interface config would go here
+} ip6_l2tpv3_config_t;
+
+uword unformat_pg_l2tp_header (unformat_input_t * input, va_list * args);
+
+void l2tp_encap_init (vlib_main_t *vm);
+void l2tp_decap_init (void);
+int create_l2tpv3_ipv6_tunnel (l2t_main_t * lm,
+ ip6_address_t * client_address,
+ ip6_address_t * our_address,
+ u32 local_session_id,
+ u32 remote_session_id,
+ u64 local_cookie,
+ u64 remote_cookie,
+ int l2_sublayer_present,
+ u32 * sw_if_index);
+
+int l2tpv3_set_tunnel_cookies (l2t_main_t * lm,
+ u32 sw_if_index,
+ u64 new_local_cookie,
+ u64 new_remote_cookie);
+
+int l2tpv3_interface_enable_disable (vnet_main_t * vnm,
+ u32 sw_if_index,
+ int enable_disable);
+
+#endif /* __included_l2tp_h__ */
diff --git a/vnet/vnet/l2tp/packet.h b/vnet/vnet/l2tp/packet.h
new file mode 100644
index 00000000000..88acba41d4b
--- /dev/null
+++ b/vnet/vnet/l2tp/packet.h
@@ -0,0 +1,33 @@
+/*
+ * packet.h : L2TPv3 packet header format
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_l2tp_packet_h__
+#define __included_l2tp_packet_h__
+
+/*
+ * See RFC4719 for packet format.
+ * Note: the l2_specific_sublayer is present in current Linux l2tpv3
+ * tunnels. It is not present in IOS XR l2tpv3 tunnels.
+ * The Linux implementation is almost certainly wrong.
+ */
+typedef CLIB_PACKED(struct {
+ u32 session_id;
+ u64 cookie;
+ u32 l2_specific_sublayer; /* set to 0 (if present) */
+}) l2tpv3_header_t;
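+
+/*
+ * On the wire this is 16 bytes with the l2 specific sublayer and 12 bytes
+ * without it; l2tp.c precomputes the per-session size as l2tp_hdr_size.
+ */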
+
+#endif /* __included_l2tp_packet_h__ */
diff --git a/vnet/vnet/l2tp/pg.c b/vnet/vnet/l2tp/pg.c
new file mode 100644
index 00000000000..394e14681b1
--- /dev/null
+++ b/vnet/vnet/l2tp/pg.c
@@ -0,0 +1,94 @@
+/*
+ * pg.c: packet generator for L2TPv3 header
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/l2tp/l2tp.h>
+
+typedef struct {
+ pg_edit_t session_id;
+ pg_edit_t cookie;
+} pg_l2tp_header_t;
+
+typedef struct {
+ pg_edit_t l2_sublayer;
+} pg_l2tp_header_l2_sublayer_t;
+
+static inline void
+pg_l2tp_header_init (pg_l2tp_header_t * e)
+{
+ pg_edit_init (&e->session_id, l2tpv3_header_t, session_id);
+ pg_edit_init (&e->cookie, l2tpv3_header_t, cookie);
+}
+
+uword
+unformat_pg_l2tp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_l2tp_header_t * h;
+ u32 group_index, error;
+ vlib_main_t * vm = vlib_get_main();
+
+ h = pg_create_edit_group (s, sizeof (h[0]),
+ sizeof (l2tpv3_header_t) - sizeof(u32),
+ &group_index);
+ pg_l2tp_header_init (h);
+
+ error = 1;
+
+ // session id and cookie are required
+ if (! unformat (input, "L2TP: session_id %U cookie %U",
+ unformat_pg_edit, unformat_pg_number, &h->session_id,
+ unformat_pg_edit, unformat_pg_number, &h->cookie)) {
+ goto done;
+ }
+
+ // "l2_sublayer <value>" is optional
+ if (unformat (input, "l2_sublayer")) {
+ pg_l2tp_header_l2_sublayer_t * h2;
+
+ h2 = pg_add_edits (s, sizeof (h2[0]), sizeof(u32), group_index);
+ pg_edit_init (&h2->l2_sublayer, l2tpv3_header_t, l2_specific_sublayer);
+ if (! unformat_user (input, unformat_pg_edit,
+ unformat_pg_number, &h2->l2_sublayer)) {
+ goto done;
+ }
+ }
+
+ // Parse an ethernet header if it is present
+ {
+ pg_node_t * pg_node = 0;
+ vlib_node_t * eth_lookup_node;
+
+ eth_lookup_node = vlib_get_node_by_name (vm, (u8 *)"ethernet-input");
+ ASSERT (eth_lookup_node);
+
+ pg_node = pg_get_node (eth_lookup_node->index);
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+ }
+
+ error = 0;
+
+done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
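+/*
+ * Example packet-generator stream fragment parsed by the function
+ * above (node name and values are illustrative):
+ *
+ *   packet-generator new {
+ *     name l2tp
+ *     limit 1
+ *     node l2tp-decap
+ *     size 128-128
+ *     data { L2TP: session_id 6 cookie 0x1122334455667788
+ *            l2_sublayer 0 }
+ *   }
+ */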
diff --git a/vnet/vnet/l3_types.h b/vnet/vnet/l3_types.h
new file mode 100644
index 00000000000..2902072148c
--- /dev/null
+++ b/vnet/vnet/l3_types.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * l3_types.h: layer 3 packet types
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_l3_types_h
+#define included_vnet_l3_types_h
+
+/* Inherit generic L3 packet types from ethernet. */
+typedef enum {
+#define ethernet_type(n,f) VNET_L3_PACKET_TYPE_##f,
+#include <vnet/ethernet/types.def>
+#undef ethernet_type
+} vnet_l3_packet_type_t;
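+
+/*
+ * For example, with entries in <vnet/ethernet/types.def> such as
+ *
+ *   ethernet_type (0x0800, IP4)
+ *   ethernet_type (0x86DD, IP6)
+ *
+ * the enum above expands to VNET_L3_PACKET_TYPE_IP4,
+ * VNET_L3_PACKET_TYPE_IP6, etc., one member per ethernet type.
+ */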
+
+#endif /* included_vnet_l3_types_h */
diff --git a/vnet/vnet/lawful-intercept/lawful_intercept.c b/vnet/vnet/lawful-intercept/lawful_intercept.c
new file mode 100644
index 00000000000..bd3f33efa0c
--- /dev/null
+++ b/vnet/vnet/lawful-intercept/lawful_intercept.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/lawful-intercept/lawful_intercept.h>
+
+static clib_error_t *
+set_li_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ li_main_t * lm = &li_main;
+ ip4_address_t collector;
+ u8 collector_set = 0;
+ ip4_address_t src;
+ u8 src_set = 0;
+ u32 tmp;
+ u16 udp_port = 0;
+ u8 is_add = 1;
+ int i;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "collector %U", unformat_ip4_address, &collector))
+ collector_set = 1;
+    else if (unformat (input, "src %U", unformat_ip4_address, &src))
+ src_set = 1;
+ else if (unformat (input, "udp-port %d", &tmp))
+ udp_port = tmp;
+ else if (unformat (input, "del"))
+ is_add = 0;
+ else
+ break;
+ }
+
+ if (collector_set == 0)
+ return clib_error_return (0, "collector must be set...");
+ if (src_set == 0)
+ return clib_error_return (0, "src must be set...");
+ if (udp_port == 0)
+ return clib_error_return (0, "udp-port must be set...");
+
+ if (is_add == 1)
+ {
+ for (i = 0; i < vec_len (lm->collectors); i++)
+ {
+ if (lm->collectors[i].as_u32 == collector.as_u32)
+ {
+ if (lm->ports[i] == udp_port)
+		return clib_error_return
+		  (0, "collector %U:%d already configured",
+		   format_ip4_address, &collector, udp_port);
+	      else
+		return clib_error_return
+		  (0, "collector %U already configured with port %d",
+		   format_ip4_address, &collector, (int)(lm->ports[i]));
+ }
+ }
+ vec_add1 (lm->collectors, collector);
+ vec_add1 (lm->ports, udp_port);
+ vec_add1 (lm->src_addrs, src);
+ return 0;
+ }
+ else
+ {
+ for (i = 0; i < vec_len (lm->collectors); i++)
+ {
+ if ((lm->collectors[i].as_u32 == collector.as_u32)
+ && lm->ports[i] == udp_port)
+ {
+ vec_delete (lm->collectors, 1, i);
+ vec_delete (lm->ports, 1, i);
+ vec_delete (lm->src_addrs, 1, i);
+ return 0;
+ }
+ }
+      return clib_error_return (0, "collector %U:%d not configured",
+				format_ip4_address, &collector, udp_port);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_li_command, static) = {
+ .path = "set li",
+ .short_help =
+ "set li src <ip4-address> collector <ip4-address> udp-port <nnnn>",
+ .function = set_li_command_fn,
+};
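+
+/*
+ * Example (addresses and port are illustrative):
+ *
+ *   set li src 10.10.10.1 collector 10.10.10.2 udp-port 4789
+ *   set li src 10.10.10.1 collector 10.10.10.2 udp-port 4789 del
+ */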
+
+static clib_error_t *
+li_init (vlib_main_t * vm)
+{
+ li_main_t * lm = &li_main;
+
+ lm->vlib_main = vm;
+ lm->vnet_main = vnet_get_main();
+ lm->hit_node_index = li_hit_node.index;
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(li_init);
diff --git a/vnet/vnet/lawful-intercept/lawful_intercept.h b/vnet/vnet/lawful-intercept/lawful_intercept.h
new file mode 100644
index 00000000000..f6cbf66e076
--- /dev/null
+++ b/vnet/vnet/lawful-intercept/lawful_intercept.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __lawful_intercept_h__
+#define __lawful_intercept_h__
+
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpdk_replication.h>
+
+typedef struct {
+ /* LI collector info */
+ ip4_address_t * src_addrs;
+ ip4_address_t * collectors;
+ u16 * ports;
+
+ /* Hit node index */
+ u32 hit_node_index;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} li_main_t;
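+
+/*
+ * The three vectors above are parallel: src_addrs[i], collectors[i]
+ * and ports[i] together describe one intercept destination. A lookup
+ * sketch (illustrative only):
+ *
+ *   for (i = 0; i < vec_len (lm->collectors); i++)
+ *     if (lm->collectors[i].as_u32 == addr.as_u32
+ *         && lm->ports[i] == port)
+ *       break;
+ */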
+
+li_main_t li_main;
+
+typedef CLIB_PACKED(struct {
+ ip4_header_t ip4;
+ udp_header_t udp;
+}) ip4_udp_header_t;
+
+vlib_node_registration_t li_hit_node;
+
+#endif /* __lawful_intercept_h__ */
diff --git a/vnet/vnet/lawful-intercept/node.c b/vnet/vnet/lawful-intercept/node.c
new file mode 100644
index 00000000000..cc066491116
--- /dev/null
+++ b/vnet/vnet/lawful-intercept/node.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+
+#include <vnet/lawful-intercept/lawful_intercept.h>
+
+#include <vppinfra/error.h>
+#include <vppinfra/elog.h>
+
+vlib_node_registration_t li_hit_node;
+
+typedef struct {
+ u32 next_index;
+} li_hit_trace_t;
+
+/* packet trace format function */
+static u8 * format_li_hit_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ li_hit_trace_t * t = va_arg (*args, li_hit_trace_t *);
+
+ s = format (s, "LI_HIT: next index %d", t->next_index);
+
+ return s;
+}
+
+#define foreach_li_hit_error \
+_(HITS, "LI packets processed") \
+_(NO_COLLECTOR, "No collector configured")
+
+typedef enum {
+#define _(sym,str) LI_HIT_ERROR_##sym,
+ foreach_li_hit_error
+#undef _
+ LI_HIT_N_ERROR,
+} li_hit_error_t;
+
+static char * li_hit_error_strings[] = {
+#define _(sym,string) string,
+ foreach_li_hit_error
+#undef _
+};
+
+typedef enum {
+ LI_HIT_NEXT_ETHERNET,
+ LI_HIT_N_NEXT,
+} li_hit_next_t;
+
+static uword
+li_hit_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ li_hit_next_t next_index;
+ vlib_frame_t * int_frame = 0;
+ u32 * to_int_next = 0;
+ li_main_t * lm = &li_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ if (PREDICT_FALSE (vec_len (lm->collectors) == 0))
+ {
+ vlib_node_increment_counter (vm, li_hit_node.index,
+ LI_HIT_ERROR_NO_COLLECTOR,
+ n_left_from);
+ }
+ else
+ {
+ /* The intercept frame... */
+ int_frame = vlib_get_frame_to_node (vm, ip4_lookup_node.index);
+ to_int_next = vlib_frame_vector_args (int_frame);
+ }
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 next0 = LI_HIT_NEXT_INTERFACE_OUTPUT;
+ u32 next1 = LI_HIT_NEXT_INTERFACE_OUTPUT;
+ u32 sw_if_index0, sw_if_index1;
+ u8 tmp0[6], tmp1[6];
+ ethernet_header_t *en0, *en1;
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* $$$$$ Dual loop: process 2 x packets here $$$$$ */
+ ASSERT (b0->current_data == 0);
+ ASSERT (b1->current_data == 0);
+
+ en0 = vlib_buffer_get_current (b0);
+ en1 = vlib_buffer_get_current (b1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+
+ /* Send pkt back out the RX interface */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = sw_if_index0;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = sw_if_index1;
+
+ /* $$$$$ End of processing 2 x packets $$$$$ */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ li_hit_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ li_hit_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif /* $$$ dual-loop off */
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ vlib_buffer_t * c0;
+ ip4_udp_header_t * iu0;
+ ip4_header_t * ip0;
+ udp_header_t * udp0;
+ u32 next0 = LI_HIT_NEXT_ETHERNET;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ if (PREDICT_TRUE(to_int_next != 0))
+ {
+ /* Make an intercept copy */
+ c0 = vlib_dpdk_clone_buffer (vm, b0);
+
+	      vlib_buffer_advance (c0, -(word) sizeof (*iu0));
+
+ iu0 = vlib_buffer_get_current(c0);
+ ip0 = &iu0->ip4;
+
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_UDP;
+
+ ip0->src_address.as_u32 = lm->src_addrs[0].as_u32;
+ ip0->dst_address.as_u32 = lm->collectors[0].as_u32;
+	      ip0->length =
+		clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, c0));
+ ip0->checksum = ip4_header_checksum (ip0);
+
+ udp0 = &iu0->udp;
+ udp0->src_port = udp0->dst_port =
+ clib_host_to_net_u16(lm->ports[0]);
+ udp0->checksum = 0;
+	      udp0->length =
+		clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+				      + sizeof (*udp0));
+
+ to_int_next [0] = vlib_get_buffer_index (vm, c0);
+ to_int_next++;
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ li_hit_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ if (int_frame)
+ {
+ int_frame->n_vectors = frame->n_vectors;
+ vlib_put_frame_to_node (vm, ip4_lookup_node.index, int_frame);
+ }
+
+ vlib_node_increment_counter (vm, li_hit_node.index,
+ LI_HIT_ERROR_HITS, frame->n_vectors);
+ return frame->n_vectors;
+}
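+
+/*
+ * Resulting intercept copy (sketch): the clone c0 gains an
+ * ip4_udp_header_t in front of the original packet, so the collector
+ * receives
+ *
+ *   [ ip4: src_addrs[0] -> collectors[0] ][ udp: ports[0] ][ original packet ]
+ *
+ * Note that only the first configured collector is used by this node.
+ */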
+
+VLIB_REGISTER_NODE (li_hit_node) = {
+ .function = li_hit_node_fn,
+ .name = "li-hit",
+ .vector_size = sizeof (u32),
+ .format_trace = format_li_hit_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(li_hit_error_strings),
+ .error_strings = li_hit_error_strings,
+
+ .n_next_nodes = LI_HIT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [LI_HIT_NEXT_ETHERNET] = "ethernet-input-not-l2",
+ },
+};
diff --git a/vnet/vnet/lisp-gpe/decap.c b/vnet/vnet/lisp-gpe/decap.c
new file mode 100644
index 00000000000..e10f1f2e399
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/decap.c
@@ -0,0 +1,298 @@
+/*
+ * decap.c: lisp-gpe decap processing
+ *
+ * Copyright (c) 2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/lisp-gpe/lisp_gpe.h>
+
+typedef struct {
+ u32 next_index;
+ u32 tunnel_index;
+ u32 error;
+ lisp_gpe_header_t h;
+} lisp_gpe_rx_trace_t;
+
+static u8 * format_lisp_gpe_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ lisp_gpe_rx_trace_t * t = va_arg (*args, lisp_gpe_rx_trace_t *);
+
+ if (t->tunnel_index != ~0)
+ {
+      s = format (s, "LISP-GPE: tunnel %d next %d error %d", t->tunnel_index,
+ t->next_index, t->error);
+ }
+ else
+ {
+      s = format (s, "LISP-GPE: no tunnel next %d error %d", t->next_index,
+ t->error);
+ }
+ s = format (s, "\n %U", format_lisp_gpe_header_with_length, &t->h,
+ (u32) sizeof (t->h) /* max size */);
+ return s;
+}
+
+static uword
+lisp_gpe_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ lisp_gpe_main_t * ngm = &lisp_gpe_main;
+ u32 last_tunnel_index = ~0;
+ lisp_gpe_tunnel_key_t last_key;
+ u32 pkts_decapsulated = 0;
+
+ memset (&last_key, 0xff, sizeof (last_key));
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ nsh_unicast_header_t * h0, * h1;
+ u32 label0, label1;
+ u32 next0, next1;
+ uword * p0, * p1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ next0 = next1 = LISP_GPE_INPUT_NEXT_IP4_INPUT;
+
+ label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
+ label1 = clib_net_to_host_u32 (h1->label_exp_s_ttl);
+
+ /*
+ * Translate label contents into a fib index.
+ * This is a decent sanity check, and guarantees
+ * a sane FIB for the downstream lookup
+ */
+ label0 = vnet_nsh_uc_get_label (label0);
+ label1 = vnet_nsh_uc_get_label (label1);
+
+ /* If 2xlabels match, and match the 1-wide cache, use it */
+ if (label0 == label1 && rt->last_label == label0)
+ {
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = rt->last_fib_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = rt->last_fib_index;
+ }
+ else
+ {
+ p0 = hash_get (rt->mm->fib_index_by_nsh_label, label0);
+ if (PREDICT_FALSE (p0 == 0))
+ {
+ next0 = LISP_GPE_INPUT_NEXT_DROP;
+ b0->error = node->errors[NSH_ERROR_BAD_LABEL];
+ }
+ else
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = p0[0];
+
+ p1 = hash_get (rt->mm->fib_index_by_nsh_label, label1);
+ if (PREDICT_FALSE (p1 == 0))
+ {
+ next1 = LISP_GPE_INPUT_NEXT_DROP;
+ b1->error = node->errors[NSH_ERROR_BAD_LABEL];
+ }
+ else
+ {
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = p1[0];
+ rt->last_fib_index = p1[0];
+ rt->last_label = label1;
+ }
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->label_exp_s_ttl = label0;
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->label_exp_s_ttl = label1;
+ }
+
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_buffer_advance (b1, sizeof (*h1));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ ip4_udp_lisp_gpe_header_t * iul0;
+ uword * p0;
+ u32 tunnel_index0;
+ lisp_gpe_tunnel_t * t0;
+ lisp_gpe_tunnel_key_t key0;
+ u32 error0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* udp leaves current_data pointing at the lisp header */
+ vlib_buffer_advance
+ (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+
+ iul0 = vlib_buffer_get_current (b0);
+
+ /* pop (ip, udp, lisp-gpe) */
+ vlib_buffer_advance (b0, sizeof (*iul0));
+
+ tunnel_index0 = ~0;
+ error0 = 0;
+ next0 = LISP_GPE_INPUT_NEXT_DROP;
+
+ key0.src = iul0->ip4.src_address.as_u32;
+ key0.iid = iul0->lisp.iid;
+
+ if (PREDICT_FALSE ((key0.as_u64[0] != last_key.as_u64[0])))
+ {
+ p0 = hash_get_mem (ngm->lisp_gpe_tunnel_by_key, &key0);
+
+ if (p0 == 0)
+ {
+ error0 = LISP_GPE_ERROR_NO_SUCH_TUNNEL;
+ goto trace0;
+ }
+
+ last_key.as_u64[0] = key0.as_u64[0];
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (ngm->tunnels, tunnel_index0);
+
+ next0 = t0->decap_next_index;
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ /*
+ * ip[46] lookup in the configured FIB
+ * lisp-gpe-encap, here's the encap tunnel sw_if_index
+ */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->decap_fib_index;
+ pkts_decapsulated ++;
+
+ trace0:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ lisp_gpe_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->h = iul0->lisp;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, lisp_gpe_input_node.index,
+ LISP_GPE_ERROR_DECAPSULATED,
+ pkts_decapsulated);
+ return from_frame->n_vectors;
+}
+
+static char * lisp_gpe_error_strings[] = {
+#define lisp_gpe_error(n,s) s,
+#include <vnet/lisp-gpe/lisp_gpe_error.def>
+#undef lisp_gpe_error
+};
+
+VLIB_REGISTER_NODE (lisp_gpe_input_node) = {
+ .function = lisp_gpe_input,
+ .name = "lisp-gpe-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = LISP_GPE_N_ERROR,
+ .error_strings = lisp_gpe_error_strings,
+
+ .n_next_nodes = LISP_GPE_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [LISP_GPE_INPUT_NEXT_##s] = n,
+ foreach_lisp_gpe_input_next
+#undef _
+ },
+
+ .format_buffer = format_lisp_gpe_header_with_length,
+ .format_trace = format_lisp_gpe_rx_trace,
+ // $$$$ .unformat_buffer = unformat_lisp_gpe_header,
+};
diff --git a/vnet/vnet/lisp-gpe/encap.c b/vnet/vnet/lisp-gpe/encap.c
new file mode 100644
index 00000000000..b3a52c464be
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/encap.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/lisp-gpe/lisp_gpe.h>
+
+/* Statistics (not really errors) */
+#define foreach_lisp_gpe_encap_error \
+_(ENCAPSULATED, "good packets encapsulated")
+
+static char * lisp_gpe_encap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_lisp_gpe_encap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) LISP_GPE_ENCAP_ERROR_##sym,
+ foreach_lisp_gpe_encap_error
+#undef _
+ LISP_GPE_ENCAP_N_ERROR,
+} lisp_gpe_encap_error_t;
+
+typedef enum {
+ LISP_GPE_ENCAP_NEXT_IP4_LOOKUP,
+ LISP_GPE_ENCAP_NEXT_DROP,
+ LISP_GPE_ENCAP_N_NEXT,
+} lisp_gpe_encap_next_t;
+
+typedef struct {
+ u32 tunnel_index;
+} lisp_gpe_encap_trace_t;
+
+u8 * format_lisp_gpe_encap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ lisp_gpe_encap_trace_t * t
+ = va_arg (*args, lisp_gpe_encap_trace_t *);
+
+ s = format (s, "LISP-GPE-ENCAP: tunnel %d", t->tunnel_index);
+ return s;
+}
+
+#define foreach_fixed_header_offset \
+_(0) _(1) _(2) _(3)
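+
+/* Sketch of the copy below: the 36-octet ip4_udp_lisp_gpe_header_t
+ * rewrite is moved as four u64 stores (offsets 0..3 above, 32 octets)
+ * plus one trailing u32 store. */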
+
+static uword
+lisp_gpe_encap (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ lisp_gpe_main_t * ngm = &lisp_gpe_main;
+ vnet_main_t * vnm = ngm->vnet_main;
+ u32 pkts_encapsulated = 0;
+ u16 old_l0 = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ nsh_unicast_header_t * h0, * h1;
+ u32 label0, label1;
+ u32 next0, next1;
+ uword * p0, * p1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ next0 = next1 = NSH_INPUT_NEXT_IP4_INPUT;
+
+ label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
+ label1 = clib_net_to_host_u32 (h1->label_exp_s_ttl);
+
+ /*
+ * Translate label contents into a fib index.
+ * This is a decent sanity check, and guarantees
+ * a sane FIB for the downstream lookup
+ */
+ label0 = vnet_nsh_uc_get_label (label0);
+ label1 = vnet_nsh_uc_get_label (label1);
+
+ /* If 2xlabels match, and match the 1-wide cache, use it */
+ if (label0 == label1 && rt->last_label == label0)
+ {
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = rt->last_fib_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = rt->last_fib_index;
+ }
+ else
+ {
+ p0 = hash_get (rt->mm->fib_index_by_nsh_label, label0);
+ if (PREDICT_FALSE (p0 == 0))
+ {
+ next0 = NSH_INPUT_NEXT_DROP;
+ b0->error = node->errors[NSH_ERROR_BAD_LABEL];
+ }
+ else
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = p0[0];
+
+ p1 = hash_get (rt->mm->fib_index_by_nsh_label, label1);
+ if (PREDICT_FALSE (p1 == 0))
+ {
+ next1 = NSH_INPUT_NEXT_DROP;
+ b1->error = node->errors[NSH_ERROR_BAD_LABEL];
+ }
+ else
+ {
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = p1[0];
+ rt->last_fib_index = p1[0];
+ rt->last_label = label1;
+ }
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->label_exp_s_ttl = label0;
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->label_exp_s_ttl = label1;
+ }
+
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_buffer_advance (b1, sizeof (*h1));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = LISP_GPE_ENCAP_NEXT_IP4_LOOKUP;
+ vnet_hw_interface_t * hi0;
+ ip4_header_t * ip0;
+ udp_header_t * udp0;
+ u64 * copy_src0, * copy_dst0;
+ u32 * copy_src_last0, * copy_dst_last0;
+ lisp_gpe_tunnel_t * t0;
+ u16 new_l0;
+ ip_csum_t sum0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* 1-wide cache? */
+ hi0 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b0)->sw_if_index[VLIB_TX]);
+
+ t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
+
+ ASSERT(vec_len(t0->rewrite) >= 24);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+
+ ASSERT (sizeof (ip4_udp_lisp_gpe_header_t) == 36);
+
+ /* Copy first 32 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+ foreach_fixed_header_offset;
+#undef _
+ /* Last 4 octets. Hopefully gcc will be our friend */
+ copy_dst_last0 = (u32 *)(&copy_dst0[4]);
+ copy_src_last0 = (u32 *)(&copy_src0[4]);
+
+ copy_dst_last0[0] = copy_src_last0[0];
+
+ /* fix the <bleep>ing outer-IP checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ /* Fix UDP length */
+ udp0 = (udp_header_t *)(ip0+1);
+ new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+ - sizeof (*ip0));
+
+ udp0->length = new_l0;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ pkts_encapsulated ++;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ lisp_gpe_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - ngm->tunnels;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, node->node_index,
+ LISP_GPE_ENCAP_ERROR_ENCAPSULATED,
+ pkts_encapsulated);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (lisp_gpe_encap_node) = {
+ .function = lisp_gpe_encap,
+ .name = "lisp-gpe-encap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_lisp_gpe_encap_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(lisp_gpe_encap_error_strings),
+ .error_strings = lisp_gpe_encap_error_strings,
+
+ .n_next_nodes = LISP_GPE_ENCAP_N_NEXT,
+
+ .next_nodes = {
+ [LISP_GPE_ENCAP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [LISP_GPE_ENCAP_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe.c b/vnet/vnet/lisp-gpe/lisp_gpe.c
new file mode 100644
index 00000000000..eb4ca919b20
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/lisp_gpe.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/lisp-gpe/lisp_gpe.h>
+
+lisp_gpe_main_t lisp_gpe_main;
+
+static u8 * format_decap_next (u8 * s, va_list * args)
+{
+ u32 next_index = va_arg (*args, u32);
+
+ switch (next_index)
+ {
+ case LISP_GPE_INPUT_NEXT_DROP:
+ return format (s, "drop");
+ case LISP_GPE_INPUT_NEXT_IP4_INPUT:
+ return format (s, "ip4");
+ case LISP_GPE_INPUT_NEXT_IP6_INPUT:
+ return format (s, "ip6");
+ case LISP_GPE_INPUT_NEXT_LISP_GPE_ENCAP:
+      return format (s, "lisp-gpe");
+ default:
+ return format (s, "unknown %d", next_index);
+ }
+ return s;
+}
+
+u8 * format_lisp_gpe_tunnel (u8 * s, va_list * args)
+{
+ lisp_gpe_tunnel_t * t = va_arg (*args, lisp_gpe_tunnel_t *);
+ lisp_gpe_main_t * ngm = &lisp_gpe_main;
+
+ s = format (s,
+ "[%d] %U (src) %U (dst) fibs: encap %d, decap %d",
+ t - ngm->tunnels,
+ format_ip4_address, &t->src,
+ format_ip4_address, &t->dst,
+ t->encap_fib_index,
+ t->decap_fib_index);
+
+ s = format (s, " decap next %U\n", format_decap_next, t->decap_next_index);
+ s = format (s, "lisp ver %d ", (t->ver_res>>6));
+
+#define _(n,v) if (t->flags & v) s = format (s, "%s-bit ", #n);
+ foreach_lisp_gpe_flag_bit;
+#undef _
+
+ s = format (s, "next_protocol %d ver_res %x res %x\n",
+ t->next_protocol, t->ver_res, t->res);
+
+ s = format (s, "iid %d (0x%x)\n", t->iid, t->iid);
+ return s;
+}
+
+static u8 * format_lisp_gpe_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "lisp_gpe_tunnel%d", dev_instance);
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (lisp_gpe_device_class,static) = {
+ .name = "LISP_GPE",
+ .format_device_name = format_lisp_gpe_name,
+ .format_tx_trace = format_lisp_gpe_encap_trace,
+ .tx_function = dummy_interface_tx,
+};
+
+static uword dummy_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ return 0;
+}
+
+u8 * format_lisp_gpe_header_with_length (u8 * s, va_list * args)
+{
+ lisp_gpe_header_t * h = va_arg (*args, lisp_gpe_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+    return format (s, "lisp-gpe header truncated");
+
+ s = format (s, "flags: ");
+#define _(n,v) if (h->flags & v) s = format (s, "%s ", #n);
+ foreach_lisp_gpe_flag_bit;
+#undef _
+
+ s = format (s, "\n ver_res %d res %d next_protocol %d iid %d(%x)",
+ h->ver_res, h->res, h->next_protocol,
+ clib_net_to_host_u32 (h->iid),
+ clib_net_to_host_u32 (h->iid));
+ return s;
+}
+
+VNET_HW_INTERFACE_CLASS (lisp_gpe_hw_class) = {
+ .name = "LISP_GPE",
+ .format_header = format_lisp_gpe_header_with_length,
+ .set_rewrite = dummy_set_rewrite,
+};
+
+#define foreach_copy_field \
+_(src.as_u32) \
+_(dst.as_u32) \
+_(encap_fib_index) \
+_(decap_fib_index) \
+_(decap_next_index) \
+_(flags) \
+_(next_protocol) \
+_(ver_res) \
+_(res) \
+_(iid)
+
+static int lisp_gpe_rewrite (lisp_gpe_tunnel_t * t)
+{
+ u8 *rw = 0;
+ ip4_header_t * ip0;
+ lisp_gpe_header_t * lisp0;
+ ip4_udp_lisp_gpe_header_t * h0;
+ int len;
+
+ len = sizeof (*h0);
+
+ vec_validate_aligned (rw, len-1, CLIB_CACHE_LINE_BYTES);
+
+ h0 = (ip4_udp_lisp_gpe_header_t *) rw;
+
+ /* Fixed portion of the (outer) ip4 header */
+ ip0 = &h0->ip4;
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_UDP;
+
+ /* we fix up the ip4 header length and checksum after-the-fact */
+ ip0->src_address.as_u32 = t->src.as_u32;
+ ip0->dst_address.as_u32 = t->dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+
+ /* UDP header, randomize src port on something, maybe? */
+ h0->udp.src_port = clib_host_to_net_u16 (4341);
+ h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_lisp_gpe);
+
+ /* LISP-gpe header */
+ lisp0 = &h0->lisp;
+
+ lisp0->flags = t->flags;
+ lisp0->ver_res = t->ver_res;
+ lisp0->res = t->res;
+ lisp0->next_protocol = t->next_protocol;
+ lisp0->iid = clib_host_to_net_u32 (t->iid);
+
+ t->rewrite = rw;
+ return (0);
+}
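+
+/*
+ * The rewrite built above is a complete outer-header template; only
+ * the ip4 length/checksum and the udp length vary per packet and are
+ * patched in lisp_gpe_encap(). Template layout (sketch):
+ *
+ *   [ ip4: t->src -> t->dst, proto UDP ][ udp: 4341 -> lisp-gpe port ][ lisp-gpe header ]
+ */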
+
+int vnet_lisp_gpe_add_del_tunnel
+(vnet_lisp_gpe_add_del_tunnel_args_t *a, u32 * sw_if_indexp)
+{
+ lisp_gpe_main_t * ngm = &lisp_gpe_main;
+ lisp_gpe_tunnel_t *t = 0;
+ vnet_main_t * vnm = ngm->vnet_main;
+ vnet_hw_interface_t * hi;
+ uword * p;
+ u32 hw_if_index = ~0;
+ u32 sw_if_index = ~0;
+ int rv;
+ lisp_gpe_tunnel_key_t key, *key_copy;
+ hash_pair_t *hp;
+
+ key.src = a->src.as_u32;
+ key.iid = clib_host_to_net_u32(a->iid);
+
+ p = hash_get_mem (ngm->lisp_gpe_tunnel_by_key, &key);
+
+ if (a->is_add)
+ {
+ /* adding a tunnel: tunnel must not already exist */
+ if (p)
+	return VNET_API_ERROR_TUNNEL_EXIST;
+
+ if (a->decap_next_index >= LISP_GPE_INPUT_N_NEXT)
+ return VNET_API_ERROR_INVALID_DECAP_NEXT;
+
+ pool_get_aligned (ngm->tunnels, t, CLIB_CACHE_LINE_BYTES);
+ memset (t, 0, sizeof (*t));
+
+ /* copy from arg structure */
+#define _(x) t->x = a->x;
+ foreach_copy_field;
+#undef _
+
+ rv = lisp_gpe_rewrite (t);
+
+ if (rv)
+ {
+ pool_put (ngm->tunnels, t);
+ return rv;
+ }
+
+ key_copy = clib_mem_alloc (sizeof (*key_copy));
+ memcpy (key_copy, &key, sizeof (*key_copy));
+
+ hash_set_mem (ngm->lisp_gpe_tunnel_by_key, key_copy,
+ t - ngm->tunnels);
+
+ if (vec_len (ngm->free_lisp_gpe_tunnel_hw_if_indices) > 0)
+ {
+ hw_if_index = ngm->free_lisp_gpe_tunnel_hw_if_indices
+ [vec_len (ngm->free_lisp_gpe_tunnel_hw_if_indices)-1];
+ _vec_len (ngm->free_lisp_gpe_tunnel_hw_if_indices) -= 1;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = t - ngm->tunnels;
+ hi->hw_instance = hi->dev_instance;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, lisp_gpe_device_class.index, t - ngm->tunnels,
+ lisp_gpe_hw_class.index, t - ngm->tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->output_node_index = lisp_gpe_encap_node.index;
+ }
+
+ t->hw_if_index = hw_if_index;
+ t->sw_if_index = sw_if_index = hi->sw_if_index;
+
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ }
+ else
+ {
+ /* deleting a tunnel: tunnel must exist */
+ if (!p)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ t = pool_elt_at_index (ngm->tunnels, p[0]);
+
+ vnet_sw_interface_set_flags (vnm, t->sw_if_index, 0 /* down */);
+ vec_add1 (ngm->free_lisp_gpe_tunnel_hw_if_indices, t->hw_if_index);
+
+ hp = hash_get_pair (ngm->lisp_gpe_tunnel_by_key, &key);
+ key_copy = (void *)(hp->key);
+ hash_unset_mem (ngm->lisp_gpe_tunnel_by_key, &key);
+ clib_mem_free (key_copy);
+
+ vec_free (t->rewrite);
+ pool_put (ngm->tunnels, t);
+ }
+
+ if (sw_if_indexp)
+ *sw_if_indexp = sw_if_index;
+
+ return 0;
+}
+
+static u32 fib_index_from_fib_id (u32 fib_id)
+{
+ ip4_main_t * im = &ip4_main;
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (!p)
+ return ~0;
+
+ return p[0];
+}
+
+static uword unformat_decap_next (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 tmp;
+
+ if (unformat (input, "drop"))
+ *result = LISP_GPE_INPUT_NEXT_DROP;
+ else if (unformat (input, "ip4"))
+ *result = LISP_GPE_INPUT_NEXT_IP4_INPUT;
+ else if (unformat (input, "ip6"))
+ *result = LISP_GPE_INPUT_NEXT_IP6_INPUT;
+ else if (unformat (input, "ethernet"))
+    *result = LISP_GPE_INPUT_NEXT_ETHERNET_INPUT;
+ else if (unformat (input, "lisp-gpe"))
+ *result = LISP_GPE_INPUT_NEXT_LISP_GPE_ENCAP;
+ else if (unformat (input, "%d", &tmp))
+ *result = tmp;
+ else
+ return 0;
+ return 1;
+}
+
+static clib_error_t *
+lisp_gpe_add_del_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t src, dst;
+ u8 is_add = 1;
+ u8 src_set = 0;
+ u8 dst_set = 0;
+ u32 encap_fib_index = 0;
+ u32 decap_fib_index = 0;
+ u8 next_protocol = LISP_GPE_NEXT_PROTOCOL_IP4;
+ u32 decap_next_index = LISP_GPE_INPUT_NEXT_IP4_INPUT;
+ u8 flags = LISP_GPE_FLAGS_P;
+ u8 ver_res = 0;
+ u8 res = 0;
+ u32 iid = 0;
+ u8 iid_set = 0;
+ u32 tmp;
+ int rv;
+ vnet_lisp_gpe_add_del_tunnel_args_t _a, * a = &_a;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "src %U",
+ unformat_ip4_address, &src))
+ src_set = 1;
+ else if (unformat (line_input, "dst %U",
+ unformat_ip4_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "encap-vrf-id %d", &tmp))
+ {
+ encap_fib_index = fib_index_from_fib_id (tmp);
+ if (encap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent encap fib id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-vrf-id %d", &tmp))
+ {
+ decap_fib_index = fib_index_from_fib_id (tmp);
+ if (decap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent decap fib id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-next %U", unformat_decap_next,
+ &decap_next_index))
+ ;
+ else if (unformat(line_input, "next-ip4"))
+ next_protocol = 1;
+ else if (unformat(line_input, "next-ip6"))
+ next_protocol = 2;
+ else if (unformat(line_input, "next-ethernet"))
+ next_protocol = 3;
+ else if (unformat(line_input, "next-nsh"))
+ next_protocol = 4;
+ /* Allow the user to specify anything they want in the LISP hdr */
+ else if (unformat (line_input, "ver_res %x", &tmp))
+ ver_res = tmp;
+ else if (unformat (line_input, "res %x", &tmp))
+ res = tmp;
+ else if (unformat (line_input, "flags %x", &tmp))
+ flags = tmp;
+ else if (unformat (line_input, "n-bit"))
+ flags |= LISP_GPE_FLAGS_N;
+ else if (unformat (line_input, "l-bit"))
+ flags |= LISP_GPE_FLAGS_L;
+ else if (unformat (line_input, "e-bit"))
+ flags |= LISP_GPE_FLAGS_E;
+ else if (unformat (line_input, "v-bit"))
+ flags |= LISP_GPE_FLAGS_V;
+ else if (unformat (line_input, "i-bit"))
+      flags |= LISP_GPE_FLAGS_I;
+ else if (unformat (line_input, "not-p-bit"))
+ flags &= ~LISP_GPE_FLAGS_P;
+ else if (unformat (line_input, "p-bit"))
+ flags |= LISP_GPE_FLAGS_P;
+ else if (unformat (line_input, "o-bit"))
+ flags |= LISP_GPE_FLAGS_O;
+ else if (unformat (line_input, "iidx %x", &iid))
+ iid_set = 1;
+ else if (unformat (line_input, "iid %d", &iid))
+ iid_set = 1;
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (src_set == 0)
+ return clib_error_return (0, "tunnel src address not specified");
+
+ if (dst_set == 0)
+ return clib_error_return (0, "tunnel dst address not specified");
+
+ if (iid_set == 0)
+ return clib_error_return (0, "iid not specified");
+
+ memset (a, 0, sizeof (*a));
+
+ a->is_add = is_add;
+
+#define _(x) a->x = x;
+ foreach_copy_field;
+#undef _
+
+  rv = vnet_lisp_gpe_add_del_tunnel (a, 0 /* sw_if_indexp */);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_DECAP_NEXT:
+ return clib_error_return (0, "invalid decap-next...");
+
+ case VNET_API_ERROR_TUNNEL_EXIST:
+ return clib_error_return (0, "tunnel already exists...");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "tunnel does not exist...");
+
+ default:
+ return clib_error_return
+ (0, "vnet_lisp_gpe_add_del_tunnel returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_lisp_gpe_tunnel_command, static) = {
+ .path = "lisp gpe tunnel",
+ .short_help =
+    "lisp gpe tunnel src <ip4-addr> dst <ip4-addr> iidx <0xnn> | iid <nn>\n"
+    "    [encap-vrf-id <nn>] [decap-vrf-id <nn>]\n"
+    "    [n-bit][l-bit][e-bit][v-bit][i-bit][p-bit][not-p-bit][o-bit]\n"
+    "    [next-ip4][next-ip6][next-ethernet][next-nsh]\n"
+    "    [decap-next [drop|ip4|ip6|ethernet|lisp-gpe|<nn>]][del]\n",
+ .function = lisp_gpe_add_del_tunnel_command_fn,
+};
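+
+/*
+ * Example (addresses and ids are illustrative):
+ *
+ *   lisp gpe tunnel src 10.0.0.1 dst 10.0.0.2 iid 13 decap-next ip4
+ *   lisp gpe tunnel src 10.0.0.1 dst 10.0.0.2 iid 13 del
+ */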
+
+static clib_error_t *
+show_lisp_gpe_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ lisp_gpe_main_t * ngm = &lisp_gpe_main;
+ lisp_gpe_tunnel_t * t;
+
+ if (pool_elts (ngm->tunnels) == 0)
+ vlib_cli_output (vm, "No lisp-gpe tunnels configured...");
+
+ pool_foreach (t, ngm->tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_lisp_gpe_tunnel, t);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_lisp_gpe_tunnel_command, static) = {
+ .path = "show lisp gpe tunnel",
+ .function = show_lisp_gpe_tunnel_command_fn,
+};
+
+clib_error_t *lisp_gpe_init (vlib_main_t *vm)
+{
+ lisp_gpe_main_t *ngm = &lisp_gpe_main;
+
+ ngm->vnet_main = vnet_get_main();
+ ngm->vlib_main = vm;
+
+ ngm->lisp_gpe_tunnel_by_key
+ = hash_create_mem (0, sizeof(lisp_gpe_tunnel_key_t), sizeof (uword));
+
+ udp_register_dst_port (vm, UDP_DST_PORT_lisp_gpe,
+ lisp_gpe_input_node.index, 1 /* is_ip4 */);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(lisp_gpe_init);
+
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe.h b/vnet/vnet/lisp-gpe/lisp_gpe.h
new file mode 100644
index 00000000000..8ca721dda84
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/lisp_gpe.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_lisp_gpe_h
+#define included_vnet_lisp_gpe_h
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/lisp-gpe/lisp_gpe_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/udp.h>
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4; /* 20 bytes */
+ udp_header_t udp; /* 8 bytes */
+ lisp_gpe_header_t lisp; /* 8 bytes */
+}) ip4_udp_lisp_gpe_header_t;
+
+typedef CLIB_PACKED(struct {
+ /*
+ * Key fields: ip src, LISP iid, ??? $$$$$$$$$ correct answer ???
+ * all fields in NET byte order
+ */
+ union {
+ struct {
+ u32 src;
+ u32 iid;
+ };
+ u64 as_u64[1];
+ };
+}) lisp_gpe_tunnel_key_t;
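+
+/*
+ * Because src and iid alias as_u64[0], a key can be hashed or compared
+ * with a single 64-bit load, e.g. (sketch):
+ *
+ *   lisp_gpe_tunnel_key_t a, b;
+ *   ...
+ *   if (a.as_u64[0] == b.as_u64[0])
+ *     same tunnel;
+ */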
+
+typedef struct {
+ /* Rewrite string. $$$$ embed vnet_rewrite header */
+ u8 * rewrite;
+
+ /* decap next index */
+ u32 decap_next_index;
+
+ /* tunnel src and dst addresses */
+ ip4_address_t src;
+ ip4_address_t dst;
+
+ /* FIB indices */
+ u32 encap_fib_index; /* tunnel partner lookup here */
+ u32 decap_fib_index; /* inner IP lookup here */
+
+ /* vnet intfc hw/sw_if_index */
+ u32 hw_if_index;
+ u32 sw_if_index;
+
+ /* LISP header fields in HOST byte order */
+ u8 flags;
+ u8 ver_res;
+ u8 res;
+ u8 next_protocol;
+ u32 iid;
+} lisp_gpe_tunnel_t;
+
+#define foreach_lisp_gpe_input_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input") \
+_(ETHERNET_INPUT, "ethernet-input") \
+_(LISP_GPE_ENCAP, "lisp-gpe-encap")
+
+typedef enum {
+#define _(s,n) LISP_GPE_INPUT_NEXT_##s,
+ foreach_lisp_gpe_input_next
+#undef _
+ LISP_GPE_INPUT_N_NEXT,
+} lisp_gpe_input_next_t;
+
+typedef enum {
+#define lisp_gpe_error(n,s) LISP_GPE_ERROR_##n,
+#include <vnet/lisp-gpe/lisp_gpe_error.def>
+#undef lisp_gpe_error
+ LISP_GPE_N_ERROR,
+} lisp_gpe_input_error_t;
+
+typedef struct {
+ /* vector of encap tunnel instances */
+ lisp_gpe_tunnel_t *tunnels;
+
+ /* lookup tunnel by key */
+ uword * lisp_gpe_tunnel_by_key;
+
+ /* Free vlib hw_if_indices */
+ u32 * free_lisp_gpe_tunnel_hw_if_indices;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} lisp_gpe_main_t;
+
+lisp_gpe_main_t lisp_gpe_main;
+
+vlib_node_registration_t lisp_gpe_input_node;
+vlib_node_registration_t lisp_gpe_encap_node;
+
+u8 * format_lisp_gpe_encap_trace (u8 * s, va_list * args);
+u8 * format_lisp_gpe_header_with_length (u8 * s, va_list * args);
+
+typedef struct {
+ u8 is_add;
+ ip4_address_t src, dst;
+ u32 encap_fib_index;
+ u32 decap_fib_index;
+ u32 decap_next_index;
+ u8 flags;
+ u8 ver_res;
+ u8 res;
+ u8 next_protocol;
+ u32 iid; /* host byte order */
+} vnet_lisp_gpe_add_del_tunnel_args_t;
+
+int vnet_lisp_gpe_add_del_tunnel
+(vnet_lisp_gpe_add_del_tunnel_args_t *a, u32 * sw_if_indexp);
+
+#endif /* included_vnet_lisp_gpe_h */
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_error.def b/vnet/vnet/lisp-gpe/lisp_gpe_error.def
new file mode 100644
index 00000000000..6ef894f474d
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/lisp_gpe_error.def
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+lisp_gpe_error (DECAPSULATED, "good packets decapsulated")
+lisp_gpe_error (NO_SUCH_TUNNEL, "no such tunnel packets")
diff --git a/vnet/vnet/lisp-gpe/lisp_gpe_packet.h b/vnet/vnet/lisp-gpe/lisp_gpe_packet.h
new file mode 100644
index 00000000000..b3d96ed9d44
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/lisp_gpe_packet.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_lisp_gpe_packet_h
+#define included_lisp_gpe_packet_h
+
+/*
+ * From draft-lewis-lisp-gpe-02.txt
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|L|E|V|I|P|R|O|Ver| Reserved | Next Protocol |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Instance ID/Locator-Status-Bits |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * N: The N-bit is the nonce-present bit. When this bit is set to 1,
+ * the low-order 24 bits of the first 32 bits of the LISP header
+ * contain a Nonce. See Section 6.3.1 for details. Both N- and
+ * V-bits MUST NOT be set in the same packet. If they are, a
+ * decapsulating ETR MUST treat the 'Nonce/Map-Version' field as
+ * having a Nonce value present.
+ *
+ * L: The L-bit is the 'Locator-Status-Bits' field enabled bit. When
+ * this bit is set to 1, the Locator-Status-Bits in the second
+ * 32 bits of the LISP header are in use.
+ *
+ * E: The E-bit is the echo-nonce-request bit. This bit MUST be ignored
+ * and has no meaning when the N-bit is set to 0. When the N-bit is
+ * set to 1 and this bit is set to 1, an ITR is requesting that the
+ * nonce value in the 'Nonce' field be echoed back in LISP-
+ * encapsulated packets when the ITR is also an ETR. See
+ * Section 6.3.1 for details.
+ *
+ * V: The V-bit is the Map-Version present bit. When this bit is set to
+ * 1, the N-bit MUST be 0. Refer to Section 6.6.3 for more details.
+ *
+ * I: The I-bit is the Instance ID bit. See Section 5.5 for more
+ * details. When this bit is set to 1, the 'Locator-Status-Bits'
+ * field is reduced to 8 bits and the high-order 24 bits are used as
+ * an Instance ID. If the L-bit is set to 0, then the low-order
+ * 8 bits are transmitted as zero and ignored on receipt.
+ *
+ * P Bit: Flag bit 5 is defined as the Next Protocol bit. The P bit
+ * MUST be set to 1 to indicate the presence of the 8 bit next
+ * protocol field.
+ *
+ * P = 0 indicates that the payload MUST conform to LISP as defined
+ * in [RFC6830].
+ *
+ * Flag bit 5 was chosen as the P bit because this flag bit is
+ * currently unallocated in LISP [RFC6830].
+ *
+ * O: Flag bit 7 is defined as the O bit. When the O bit is set to 1, the
+ * packet is an OAM packet and OAM processing MUST occur. The OAM
+ * protocol details are out of scope for this document. As with the
+ * P-bit, bit 7 is currently a reserved flag in [RFC6830].
+ *
+ * Next Protocol Field: The lower 8 bits of the first word are used to
+ * carry a next protocol. This next protocol field contains the
+ * protocol of the encapsulated payload packet.
+ *
+ * LISP [RFC6830] uses the lower 16 bits of the first word for either
+ * a nonce, an echo-nonce ([RFC6830]) or to support map-versioning
+ * ([RFC6834]). These are all optional capabilities that are
+ * indicated by setting the N, E, and the V bit respectively.
+ *
+ * To maintain the desired data plane compatibility, when the P bit
+ * is set, the N, E, and V bits MUST be set to zero.
+ *
+ * A new protocol registry will be requested from IANA for the Next
+ * Protocol field. This draft defines the following Next Protocol
+ * values:
+ *
+ * 0x1 : IPv4
+ * 0x2 : IPv6
+ * 0x3 : Ethernet
+ * 0x4: Network Service Header
+ */
+
+typedef struct {
+ u8 flags;
+ u8 ver_res;
+ u8 res;
+ u8 next_protocol;
+ u32 iid;
+} lisp_gpe_header_t;
+
+#define foreach_lisp_gpe_flag_bit \
+_(N, 0x80) \
+_(L, 0x40) \
+_(E, 0x20) \
+_(V, 0x10) \
+_(I, 0x08) \
+_(P, 0x04) \
+_(O, 0x01)
+
+typedef enum {
+#define _(n,v) LISP_GPE_FLAGS_##n = v,
+foreach_lisp_gpe_flag_bit
+#undef _
+} vnet_lisp_gpe_flag_bit_t;
+
+#define LISP_GPE_VERSION 0x0
+
+#define LISP_GPE_NEXT_PROTOCOL_IP4 0x1
+#define LISP_GPE_NEXT_PROTOCOL_IP6 0x2
+#define LISP_GPE_NEXT_PROTOCOL_ETHERNET 0x3
+#define LISP_GPE_NEXT_PROTOCOL_NSH 0x4
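+
+/*
+ * Example (illustrative): filling in a minimal header for an IPv4
+ * payload, mirroring what lisp_gpe_rewrite() does; the iid is sent in
+ * network byte order.
+ *
+ *   lisp_gpe_header_t h;
+ *   h.flags = LISP_GPE_FLAGS_P;
+ *   h.ver_res = 0;
+ *   h.res = 0;
+ *   h.next_protocol = LISP_GPE_NEXT_PROTOCOL_IP4;
+ *   h.iid = clib_host_to_net_u32 (7);
+ */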
+
+#endif /* included_lisp_gpe_packet_h */
diff --git a/vnet/vnet/lisp-gpe/rfc.txt b/vnet/vnet/lisp-gpe/rfc.txt
new file mode 100644
index 00000000000..5e3da150c70
--- /dev/null
+++ b/vnet/vnet/lisp-gpe/rfc.txt
@@ -0,0 +1,826 @@
+Network Working Group D. Lewis
+Internet-Draft Cisco Systems, Inc.
+Intended status: Informational P. Agarwal
+Expires: January 5, 2015 Broadcom
+ L. Kreeger
+ F. Maino
+ P. Quinn
+ M. Smith
+ N. Yadav
+ Cisco Systems, Inc.
+ July 4, 2014
+
+
+ LISP Generic Protocol Extension
+ draft-lewis-lisp-gpe-02.txt
+
+Abstract
+
+ This draft describes extending the Locator/ID Separation Protocol
+ (LISP) [RFC6830], via changes to the LISP header, with three new
+ capabilities: support for multi-protocol encapsulation, operations,
+ administration and management (OAM) signaling, and explicit
+ versioning.
+
+Status of this Memo
+
+ This Internet-Draft is submitted in full conformance with the
+ provisions of BCP 78 and BCP 79.
+
+ Internet-Drafts are working documents of the Internet Engineering
+ Task Force (IETF). Note that other groups may also distribute
+ working documents as Internet-Drafts. The list of current Internet-
+ Drafts is at http://datatracker.ietf.org/drafts/current/.
+
+ Internet-Drafts are draft documents valid for a maximum of six months
+ and may be updated, replaced, or obsoleted by other documents at any
+ time. It is inappropriate to use Internet-Drafts as reference
+ material or to cite them other than as "work in progress."
+
+ This Internet-Draft will expire on January 5, 2015.
+
+Copyright Notice
+
+ Copyright (c) 2014 IETF Trust and the persons identified as the
+ document authors. All rights reserved.
+
+ This document is subject to BCP 78 and the IETF Trust's Legal
+ Provisions Relating to IETF Documents
+ (http://trustee.ietf.org/license-info) in effect on the date of
+ publication of this document. Please review these documents
+ carefully, as they describe your rights and restrictions with respect
+ to this document. Code Components extracted from this document must
+ include Simplified BSD License text as described in Section 4.e of
+ the Trust Legal Provisions and are provided without warranty as
+ described in the Simplified BSD License.
+
+
+Table of Contents
+
+   1.  Introduction
+   2.  LISP Header Without Protocol Extensions
+   3.  Generic Protocol Extension for LISP (LISP-gpe)
+     3.1.  Multi Protocol Support
+     3.2.  OAM Support
+     3.3.  Version Bits
+   4.  Backward Compatibility
+     4.1.  LISP-gpe Routers to (legacy) LISP Routers
+     4.2.  (legacy) LISP Routers to LISP-gpe Routers
+     4.3.  Type of Service
+     4.4.  VLAN Identifier (VID)
+   5.  LISP-gpe Examples
+   6.  Security Considerations
+   7.  Acknowledgments
+   8.  IANA Considerations
+   9.  References
+     9.1.  Normative References
+     9.2.  Informative References
+   Authors' Addresses
+
+1. Introduction
+
+ LISP [RFC6830] defines an encapsulation format that carries IPv4 or
+ IPv6 (henceforth referred to as IP) packets in a LISP header and
+ outer UDP/IP transport.
+
+ The LISP header does not specify the protocol being encapsulated and
+ therefore is currently limited to encapsulating only IP packet
+ payloads. Other protocols, most notably VXLAN [VXLAN] (which defines
+ a similar header format to LISP), are used to encapsulate L2
+   protocols such as Ethernet.  LISP [RFC6830] can be extended to
+   indicate the inner protocol, enabling the encapsulation of Ethernet,
+   IP, or any other desired protocol, all while ensuring compatibility
+   with existing LISP [RFC6830] deployments.
+
+   As LISP is deployed, there is also a need to provide increased
+   visibility and diagnostic capabilities within the overlay.
+
+ This document describes extending LISP ([RFC6830]) via the following
+ changes:
+
+ Next Protocol Bit (P bit): A reserved flag bit is allocated, and set
+ in the LISP-gpe header to indicate that a next protocol field is
+ present.
+
+ OAM Flag Bit (O bit): A reserved flag bit is allocated, and set in
+ the LISP-gpe header, to indicate that the packet is an OAM packet.
+
+ Version: Two reserved bits are allocated, and set in the LISP-gpe
+ header, to indicate LISP-gpe protocol version.
+
+ Next protocol: An 8 bit next protocol field is present in the LISP-
+ gpe header.
+
+2. LISP Header Without Protocol Extensions
+
+ As described in the introduction, the LISP header has no protocol
+ identifier that indicates the type of payload being carried by LISP.
+ Because of this, LISP is limited to an IP payload. Furthermore, the
+ LISP header has no mechanism to signal OAM packets.
+
+ The LISP header contains flags (some defined, some reserved), a
+ Nonce/Map-version field and an instance ID/Locator-status-bit field.
+ The flags provide flexibility to define how the reserved bits can be
+ used to change the definition of the LISP header.
+
+
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|flags| Nonce/Map-Version |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+ Figure 1: LISP Header
+
+3. Generic Protocol Extension for LISP (LISP-gpe)
+
+3.1. Multi Protocol Support
+
+ This draft defines the following changes to the LISP header in order
+ to support multi-protocol encapsulation.
+
+ P Bit: Flag bit 5 is defined as the Next Protocol bit. The P bit
+ MUST be set to 1 to indicate the presence of the 8 bit next
+ protocol field.
+
+ P = 0 indicates that the payload MUST conform to LISP as defined
+ in [RFC6830].
+
+ Flag bit 5 was chosen as the P bit because this flag bit is
+ currently unallocated in LISP [RFC6830].
+
+ Next Protocol Field: The lower 8 bits of the first word are used to
+ carry a next protocol. This next protocol field contains the
+ protocol of the encapsulated payload packet.
+
+ LISP [RFC6830] uses the lower 16 bits of the first word for either
+ a nonce, an echo-nonce ([RFC6830]) or to support map-versioning
+ ([RFC6834]). These are all optional capabilities that are
+ indicated by setting the N, E, and the V bit respectively.
+
+ To maintain the desired data plane compatibility, when the P bit
+ is set, the N, E, and V bits MUST be set to zero.
+
+ A new protocol registry will be requested from IANA for the Next
+ Protocol field. This draft defines the following Next Protocol
+ values:
+
+ 0x1 : IPv4
+
+ 0x2 : IPv6
+
+ 0x3 : Ethernet
+
+ 0x4: Network Service Header
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|P|R|R| Reserved | Next Protocol |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 2: LISP-gpe Next Protocol (P=1)
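+
+   As an illustration only (not part of the protocol specification), a
+   decapsulating router could enforce these rules on the first 32 bit
+   word of the header, here 'w' in host byte order:
+
+       uint8_t flags = w >> 24;
+       uint8_t next_protocol = w & 0xff;
+
+       if (flags & 0x04)              /* P bit (flag bit 5) set */
+         {
+           /* N, E and V MUST be zero when P is set. */
+           if (flags & (0x80 | 0x20 | 0x10))
+             return DECAP_ERROR;      /* hypothetical error code */
+           dispatch (next_protocol);  /* hypothetical payload dispatch */
+         }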
+
+3.2. OAM Support
+
+ Flag bit 7 is defined as the O bit. When the O bit is set to 1, the
+ packet is an OAM packet and OAM processing MUST occur. The OAM
+ protocol details are out of scope for this document. As with the
+ P-bit, bit 7 is currently a reserved flag in [RFC6830].
+
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|P|R|O| Reserved | Next Protocol |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 3: LISP-gpe OAM bit (P=1)
+
+3.3. Version Bits
+
+   LISP-gpe bits 8 and 9 are defined as version bits.  The version
+   field is used to ensure backward compatibility with future LISP-gpe
+   updates.
+
+ The initial version for LISP-gpe is 0.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|P|R|O|Ver| Reserved | Next Protocol |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 4: LISP-gpe Version bits (P=1)
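+
+   For illustration, with the same host-byte-order first word 'w' as in
+   the sketch above, the version occupies header bits 8-9 and can be
+   read as:
+
+       uint8_t ver = (w >> 22) & 0x3;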
+
+4. Backward Compatibility
+
+   Flag bits 5 and 7, which are undefined in [RFC6830], were selected
+   as the LISP-gpe P and O bits to ensure compatibility with existing
+   LISP [RFC6830] deployments.
+
+ Similarly, using P = 0 to indicate that the format of the header and
+ payload conforms to [RFC6830] ensures compatibility with existing
+ LISP hardware forwarding platforms.
+
+4.1. LISP-gpe Routers to (legacy) LISP Routers
+
+   A LISP-gpe router MUST NOT encapsulate non-IP packets or OAM packets
+   to a LISP router.  A method for determining the capabilities of a
+   LISP router (gpe or "legacy") is out of the scope of this draft.
+
+   When encapsulating IP packets to a LISP router, the P bit SHOULD be
+   set to 1 and the UDP port MUST be set to 4341.  The OAM bit MUST be
+   set to 0.  The Next Protocol field SHOULD be 0x1 (IPv4) or 0x2 (IPv6).
+ The (legacy) LISP router will ignore the P bit and the protocol type
+ field. The (legacy) LISP router will treat the packet as a LISP
+ packet and inspect the first nibble of the payload to determine the
+ IP version.
+
+ When the P bit is set, the N, E, and V bits MUST be set to zero. The
+ receiving (legacy) LISP router will ignore N, E and V bits, when the
+ P bit is set.
+
+4.2. (legacy) LISP Routers to LISP-gpe Routers
+
+ When a LISP-gpe router receives a packet from a (legacy) LISP router,
+   the P bit MUST NOT be set and the UDP port MUST be 4341.  The payload
+ MUST be IP, and the LISP-gpe router will inspect the first nibble of
+ the payload to determine IP version.
+
+4.3. Type of Service
+
+ When a LISP-gpe router performs Ethernet encapsulation, the inner
+ 802.1Q [IEEE8021Q] priority code point (PCP) field MAY be mapped from
+ the encapsulated frame to the Type of Service field in the outer IPv4
+ header, or in the case of IPv6 the 'Traffic Class' field.
+
+4.4. VLAN Identifier (VID)
+
+ When a LISP-gpe router performs Ethernet encapsulation, the inner
+ header 802.1Q [IEEE8021Q] VLAN Identifier (VID) MAY be mapped to, or
+ used to determine the LISP Instance ID field.
+
+5. LISP-gpe Examples
+
+   This section provides two examples of IP payloads, and one example
+   of an Ethernet payload, encapsulated in LISP-gpe using the generic
+   extension described in this document.
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|1|0|0|0| Reserved | NP = IPv4 |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Original IPv4 Packet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 5: IPv4 and LISP-gpe
+
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|1|0|0|0| Reserved | NP = IPv6 |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Original IPv6 Packet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 6: IPv6 and LISP-gpe
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |N|L|E|V|I|1|0|0|0| Reserved | NP = Ethernet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Instance ID/Locator-Status-Bits |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Original Ethernet Frame |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 7: Ethernet and LISP-gpe
+
+6. Security Considerations
+
+ LISP-gpe security considerations are similar to the LISP security
+ considerations documented at length in LISP [RFC6830]. With LISP-
+ gpe, issues such as dataplane spoofing, flooding, and traffic
+ redirection are dependent on the particular protocol payload
+ encapsulated.
+
+7. Acknowledgments
+
+ A special thank you goes to Dino Farinacci for his guidance and
+ detailed review.
+
+8. IANA Considerations
+
+ IANA is requested to set up a registry of "Next Protocol". These are
+ 8-bit values. Next Protocol values 0, 1, 2, 3 and 4 are defined in
+ this draft. New values are assigned via Standards Action [RFC5226].
+
+ +---------------+-------------+---------------+
+ | Next Protocol | Description | Reference |
+ +---------------+-------------+---------------+
+ | 0 | Reserved | This document |
+ | | | |
+ | 1 | IPv4 | This document |
+ | | | |
+ | 2 | IPv6 | This document |
+ | | | |
+ | 3 | Ethernet | This document |
+ | | | |
+ | 4 | NSH | This document |
+ | | | |
+ | 5..253 | Unassigned | |
+ +---------------+-------------+---------------+
+
+ Table 1
+
+ There are ten bits at the beginning of the LISP-gpe header. New
+ bits are assigned via Standards Action [RFC5226].
+
+ Bits 0-3 - Assigned by LISP [RFC6830]
+ Bit 4 - Instance ID (I bit)
+ Bit 5 - Next Protocol (P bit)
+ Bit 6 - Reserved
+ Bit 7 - OAM (O bit)
+ Bits 8-9 - Version
+
+9. References
+
+9.1. Normative References
+
+ [RFC0768] Postel, J., "User Datagram Protocol", STD 6, RFC 768,
+ August 1980.
+
+ [RFC0791] Postel, J., "Internet Protocol", STD 5, RFC 791,
+ September 1981.
+
+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
+ Requirement Levels", BCP 14, RFC 2119, March 1997.
+
+ [RFC5226] Narten, T. and H. Alvestrand, "Guidelines for Writing an
+ IANA Considerations Section in RFCs", BCP 26, RFC 5226,
+ May 2008.
+
+9.2. Informative References
+
+ [ETYPES] The IEEE Registration Authority, "IEEE 802 Numbers", 2012,
+ <http://www.iana.org/assignments/ieee-802-numbers/
+ ieee-802-numbers.xml>.
+
+ [IEEE8021Q]
+ The IEEE Computer Society, "Media Access Control (MAC)
+ Bridges and Virtual Bridge Local Area Networks", August
+ 2012, <http://standards.ieee.org/getieee802/download/
+ 802.1Q-2011.pdf>.
+
+ [RFC1700] Reynolds, J. and J. Postel, "Assigned Numbers", RFC 1700,
+ October 1994.
+
+ [RFC6830] Farinacci, D., Fuller, V., Meyer, D., and D. Lewis, "The
+ Locator/ID Separation Protocol (LISP)", RFC 6830,
+ January 2013.
+
+ [RFC6834] Iannone, L., Saucez, D., and O. Bonaventure, "Locator/ID
+ Separation Protocol (LISP) Map-Versioning", RFC 6834,
+ January 2013.
+
+ [VXLAN] Dutt, D., Mahalingam, M., Duda, K., Agarwal, P., Kreeger,
+ L., Sridhar, T., Bursell, M., and C. Wright, "VXLAN: A
+ Framework for Overlaying Virtualized Layer 2 Networks over
+ Layer 3 Networks", 2013.
+
+Authors' Addresses
+
+ Darrel Lewis
+ Cisco Systems, Inc.
+
+ Email: darlewis@cisco.com
+
+
+ Puneet Agarwal
+ Broadcom
+
+ Email: pagarwal@broadcom.com
+
+
+ Larry Kreeger
+ Cisco Systems, Inc.
+
+ Email: kreeger@cisco.com
+
+
+ Fabio Maino
+ Cisco Systems, Inc.
+
+ Email: fmaino@cisco.com
+
+
+ Paul Quinn
+ Cisco Systems, Inc.
+
+ Email: paulq@cisco.com
+
+
+ Michael Smith
+ Cisco Systems, Inc.
+
+ Email: michsmit@cisco.com
+
+
+ Navindra Yadav
+ Cisco Systems, Inc.
+
+ Email: nyadav@cisco.com
diff --git a/vnet/vnet/llc/llc.c b/vnet/vnet/llc/llc.c
new file mode 100644
index 00000000000..f3b464f1ace
--- /dev/null
+++ b/vnet/vnet/llc/llc.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * llc.c: llc support
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/llc/llc.h>
+
+/* Global main structure. */
+llc_main_t llc_main;
+
+u8 * format_llc_protocol (u8 * s, va_list * args)
+{
+ llc_protocol_t p = va_arg (*args, u32);
+ llc_main_t * pm = &llc_main;
+ llc_protocol_info_t * pi = llc_get_protocol_info (pm, p);
+
+ if (pi)
+ s = format (s, "%s", pi->name);
+ else
+ s = format (s, "0x%02x", p);
+
+ return s;
+}
+
+u8 * format_llc_header_with_length (u8 * s, va_list * args)
+{
+ llc_main_t * pm = &llc_main;
+ llc_header_t * h = va_arg (*args, llc_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ llc_protocol_t p = h->dst_sap;
+ uword indent, header_bytes;
+
+ header_bytes = llc_header_length (h);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "llc header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "LLC %U -> %U",
+ format_llc_protocol, h->src_sap,
+ format_llc_protocol, h->dst_sap);
+
+ if (h->control != 0x03)
+ s = format (s, ", control 0x%x", llc_header_get_control (h));
+
+  /* Format the encapsulated payload when bytes remain past the header. */
+  if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+    {
+      llc_protocol_info_t * pi = llc_get_protocol_info (pm, p);
+      if (pi && pi->node_index != ~0)
+	{
+	  vlib_node_t * node = vlib_get_node (pm->vlib_main, pi->node_index);
+	  if (node->format_buffer)
+	    s = format (s, "\n%U%U",
+			format_white_space, indent,
+			node->format_buffer, (void *) (h + 1),
+			max_header_bytes - header_bytes);
+	}
+    }
+
+ return s;
+}
+
+u8 * format_llc_header (u8 * s, va_list * args)
+{
+ llc_header_t * h = va_arg (*args, llc_header_t *);
+ return format (s, "%U", format_llc_header_with_length, h, 0);
+}
+
+/* Returns llc protocol as an int in host byte order. */
+uword
+unformat_llc_protocol (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ llc_main_t * pm = &llc_main;
+ int p, i;
+
+ /* Numeric type. */
+ if (unformat (input, "0x%x", &p)
+ || unformat (input, "%d", &p))
+ {
+ if (p >= (1 << 8))
+ return 0;
+ *result = p;
+ return 1;
+ }
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ pm->protocol_info_by_name, &i))
+ {
+ llc_protocol_info_t * pi = vec_elt_at_index (pm->protocol_infos, i);
+ *result = pi->protocol;
+ return 1;
+ }
+
+ return 0;
+}
+
+uword
+unformat_llc_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ llc_header_t _h, * h = &_h;
+ u8 p;
+
+ if (! unformat (input, "%U", unformat_llc_protocol, &p))
+ return 0;
+
+ h->src_sap = h->dst_sap = p;
+ h->control = 0x3;
+
+ /* Add header to result. */
+ {
+ void * p;
+ u32 n_bytes = sizeof (h[0]);
+
+ vec_add2 (*result, p, n_bytes);
+ memcpy (p, h, n_bytes);
+ }
+
+ return 1;
+}
+
+static uword llc_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ llc_header_t * h = rewrite;
+ llc_protocol_t protocol;
+
+ if (max_rewrite_bytes < sizeof (h[0]))
+ return 0;
+
+ switch (l3_type) {
+#define _(a,b) case VNET_L3_PACKET_TYPE_##a: protocol = LLC_PROTOCOL_##b; break
+ _ (IP4, ip4);
+#undef _
+ default:
+ return 0;
+ }
+
+ h->src_sap = h->dst_sap = protocol;
+ h->control = 0x3;
+
+ return sizeof (h[0]);
+}
+
+VNET_HW_INTERFACE_CLASS (llc_hw_interface_class) = {
+ .name = "LLC",
+ .format_header = format_llc_header_with_length,
+ .unformat_header = unformat_llc_header,
+ .set_rewrite = llc_set_rewrite,
+};
+
+static void add_protocol (llc_main_t * pm,
+ llc_protocol_t protocol,
+ char * protocol_name)
+{
+ llc_protocol_info_t * pi;
+ u32 i;
+
+ vec_add2 (pm->protocol_infos, pi, 1);
+ i = pi - pm->protocol_infos;
+
+ pi->name = protocol_name;
+ pi->protocol = protocol;
+ pi->next_index = pi->node_index = ~0;
+
+ hash_set (pm->protocol_info_by_protocol, protocol, i);
+ hash_set_mem (pm->protocol_info_by_name, pi->name, i);
+}
+
+static clib_error_t * llc_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+ llc_main_t * pm = &llc_main;
+
+ memset (pm, 0, sizeof (pm[0]));
+ pm->vlib_main = vm;
+
+ pm->protocol_info_by_name = hash_create_string (0, sizeof (uword));
+ pm->protocol_info_by_protocol = hash_create (0, sizeof (uword));
+
+#define _(f,n) add_protocol (pm, LLC_PROTOCOL_##f, #f);
+ foreach_llc_protocol;
+#undef _
+
+ if ((error = vlib_call_init_function (vm, snap_init)))
+ return error;
+
+ return vlib_call_init_function (vm, llc_input_init);
+}
+
+VLIB_INIT_FUNCTION (llc_init);
+
diff --git a/vnet/vnet/llc/llc.h b/vnet/vnet/llc/llc.h
new file mode 100644
index 00000000000..80131be5b03
--- /dev/null
+++ b/vnet/vnet/llc/llc.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * llc.h: LLC definitions
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_llc_h
+#define included_llc_h
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+
+/* Protocol (SSAP/DSAP) types. */
+#define foreach_llc_protocol \
+ _ (null, 0x0) \
+ _ (sublayer, 0x2) \
+ _ (sna_path_control, 0x4) \
+ _ (ip4, 0x6) \
+ _ (sna1, 0x8) \
+ _ (sna2, 0xc) \
+ _ (sna3, 0x40) \
+ _ (proway_lan, 0x0e) \
+ _ (netware1, 0x10) \
+ _ (netware2, 0xe0) \
+ _ (osi_layer1, 0x14) \
+ _ (osi_layer2, 0x20) \
+ _ (osi_layer3, 0x34) \
+ _ (osi_layer4, 0x54) \
+ _ (osi_layer5, 0xfe) \
+ _ (bpdu, 0x42) \
+ _ (arp, 0x98) \
+ _ (snap, 0xaa) \
+ _ (vines1, 0xba) \
+ _ (vines2, 0xbc) \
+ _ (netbios, 0xf0) \
+ _ (global_dsap, 0xff)
+
+typedef enum {
+#define _(f,n) LLC_PROTOCOL_##f = n,
+ foreach_llc_protocol
+#undef _
+} llc_protocol_t;
+
+typedef struct {
+#define LLC_DST_SAP_IS_GROUP (1 << 0)
+#define LLC_SRC_SAP_IS_RESPONSE (1 << 0)
+ u8 dst_sap, src_sap;
+
+  /* Control byte.  Low 2 bits: bit 0 clear => information frame;
+     01 => supervisory frame; 11 => unnumbered frame. */
+ u8 control;
+
+ /* Only present if (control & 3) != 3. */
+ u8 extended_control[0];
+} llc_header_t;
+
+always_inline u16
+llc_header_get_control (llc_header_t * h)
+{
+ u16 r = h->control;
+ return r | ((((r & 3) != 3) ? h->extended_control[0] : 0) << 8);
+}
+
+always_inline u8
+llc_header_length (llc_header_t * h)
+{
+ return ((h->control & 3) != 3 ? 4 : 3);
+}
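+
+/* Example: control == 0x03 (a UI frame) gives a 3 byte header, while
+   any frame with (control & 3) != 3 (information or supervisory
+   frames) carries an extended control byte, so llc_header_length()
+   returns 4. */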
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+ /* LLC protocol (SAP type). */
+ llc_protocol_t protocol;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} llc_protocol_info_t;
+
+#define foreach_llc_error \
+ _ (NONE, "no error") \
+ _ (UNKNOWN_PROTOCOL, "unknown llc ssap/dsap") \
+ _ (UNKNOWN_CONTROL, "control != 0x3")
+
+typedef enum {
+#define _(f,s) LLC_ERROR_##f,
+ foreach_llc_error
+#undef _
+ LLC_N_ERROR,
+} llc_error_t;
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ llc_protocol_info_t * protocol_infos;
+
+ /* Hash tables mapping name/protocol to protocol info index. */
+ uword * protocol_info_by_name, * protocol_info_by_protocol;
+
+ /* llc-input next index indexed by protocol. */
+ u8 input_next_by_protocol[256];
+} llc_main_t;
+
+always_inline llc_protocol_info_t *
+llc_get_protocol_info (llc_main_t * m, llc_protocol_t protocol)
+{
+ uword * p = hash_get (m->protocol_info_by_protocol, protocol);
+ return p ? vec_elt_at_index (m->protocol_infos, p[0]) : 0;
+}
+
+extern llc_main_t llc_main;
+
+/* Register given node index to take input for given llc type. */
+void
+llc_register_input_protocol (vlib_main_t * vm,
+ llc_protocol_t protocol,
+ u32 node_index);
+
+void llc_set_adjacency (vnet_rewrite_header_t * rw,
+ uword max_data_bytes,
+ llc_protocol_t protocol);
+
+format_function_t format_llc_protocol;
+format_function_t format_llc_header;
+format_function_t format_llc_header_with_length;
+
+/* Parse llc protocol as 0xXXXX or protocol name. */
+unformat_function_t unformat_llc_protocol;
+
+/* Parse llc header. */
+unformat_function_t unformat_llc_header;
+unformat_function_t unformat_pg_llc_header;
+
+always_inline void
+llc_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_llc_header_with_length;
+ n->unformat_buffer = unformat_llc_header;
+ pn->unformat_edit = unformat_pg_llc_header;
+}
+
+#endif /* included_llc_h */
diff --git a/vnet/vnet/llc/node.c b/vnet/vnet/llc/node.c
new file mode 100644
index 00000000000..1e54a53cc9c
--- /dev/null
+++ b/vnet/vnet/llc/node.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * llc_node.c: llc packet processing
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/llc/llc.h>
+
+#define foreach_llc_input_next \
+ _ (PUNT, "error-punt") \
+ _ (DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) LLC_INPUT_NEXT_##s,
+ foreach_llc_input_next
+#undef _
+ LLC_INPUT_N_NEXT,
+} llc_input_next_t;
+
+typedef struct {
+ u8 packet_data[32];
+} llc_input_trace_t;
+
+static u8 * format_llc_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ llc_input_trace_t * t = va_arg (*va, llc_input_trace_t *);
+
+ s = format (s, "%U", format_llc_header, t->packet_data);
+
+ return s;
+}
+
+static uword
+llc_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ llc_main_t * lm = &llc_main;
+ u32 n_left_from, next_index, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (llc_input_trace_t));
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ llc_header_t * h0, * h1;
+ u8 next0, next1, len0, len1, enqueue_code;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, LOAD);
+ vlib_prefetch_buffer_header (b3, LOAD);
+
+ CLIB_PREFETCH (b2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (b3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = (void *) (b0->data + b0->current_data);
+ h1 = (void *) (b1->data + b1->current_data);
+
+ len0 = llc_header_length (h0);
+ len1 = llc_header_length (h1);
+
+ b0->current_data += len0;
+ b1->current_data += len1;
+
+ b0->current_length -= len0;
+ b1->current_length -= len1;
+
+ next0 = lm->input_next_by_protocol[h0->dst_sap];
+ next1 = lm->input_next_by_protocol[h1->dst_sap];
+
+ b0->error = node->errors[next0 == LLC_INPUT_NEXT_DROP ? LLC_ERROR_UNKNOWN_PROTOCOL : LLC_ERROR_NONE];
+ b1->error = node->errors[next1 == LLC_INPUT_NEXT_DROP ? LLC_ERROR_UNKNOWN_PROTOCOL : LLC_ERROR_NONE];
+
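+	  /* Encode which buffers diverge from the current next frame:
+	     bit 0 set if b0's next differs, bit 1 set if b1's differs;
+	     0 means both stay in the frame already open. */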
+ enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
+
+ if (PREDICT_FALSE (enqueue_code != 0))
+ {
+ switch (enqueue_code)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = bi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, bi1);
+ break;
+
+ case 3:
+ /* A B B or A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ vlib_set_next_frame_buffer (vm, node, next1, bi1);
+ if (next0 == next1)
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ next_index = next1;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ llc_header_t * h0;
+ u8 next0, len0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ len0 = llc_header_length (h0);
+
+ b0->current_data += len0;
+
+ b0->current_length -= len0;
+
+ next0 = lm->input_next_by_protocol[h0->dst_sap];
+
+ b0->error = node->errors[next0 == LLC_INPUT_NEXT_DROP ? LLC_ERROR_UNKNOWN_PROTOCOL : LLC_ERROR_NONE];
+
+ /* Sent packet to wrong next? */
+ if (PREDICT_FALSE (next0 != next_index))
+ {
+ /* Return old frame; remove incorrectly enqueued packet. */
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);
+
+ /* Send to correct next. */
+ next_index = next0;
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static char * llc_error_strings[] = {
+#define _(f,s) s,
+ foreach_llc_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (llc_input_node) = {
+ .function = llc_input,
+ .name = "llc-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = LLC_N_ERROR,
+ .error_strings = llc_error_strings,
+
+ .n_next_nodes = LLC_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [LLC_INPUT_NEXT_##s] = n,
+ foreach_llc_input_next
+#undef _
+ },
+
+ .format_buffer = format_llc_header_with_length,
+ .format_trace = format_llc_input_trace,
+ .unformat_buffer = unformat_llc_header,
+};
+
+static clib_error_t * llc_input_init (vlib_main_t * vm)
+{
+ llc_main_t * lm = &llc_main;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, llc_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ llc_setup_node (vm, llc_input_node.index);
+
+ {
+ int i;
+ for (i = 0; i < ARRAY_LEN (lm->input_next_by_protocol); i++)
+ lm->input_next_by_protocol[i] = LLC_INPUT_NEXT_DROP;
+ }
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (llc_input_init);
+
+void
+llc_register_input_protocol (vlib_main_t * vm,
+ llc_protocol_t protocol,
+ u32 node_index)
+{
+ llc_main_t * lm = &llc_main;
+ llc_protocol_info_t * pi;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, llc_input_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ pi = llc_get_protocol_info (lm, protocol);
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ llc_input_node.index,
+ node_index);
+
+ lm->input_next_by_protocol[protocol] = pi->next_index;
+}
diff --git a/vnet/vnet/llc/pg.c b/vnet/vnet/llc/pg.c
new file mode 100644
index 00000000000..eb6c6a18b27
--- /dev/null
+++ b/vnet/vnet/llc/pg.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * llc_pg.c: packet generator llc interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/llc/llc.h>
+
+typedef struct {
+ pg_edit_t dst_sap;
+ pg_edit_t src_sap;
+ pg_edit_t control;
+} pg_llc_header_t;
+
+static inline void
+pg_llc_header_init (pg_llc_header_t * e)
+{
+ pg_edit_init (&e->dst_sap, llc_header_t, dst_sap);
+ pg_edit_init (&e->src_sap, llc_header_t, src_sap);
+ pg_edit_init (&e->control, llc_header_t, control);
+}
+
+uword
+unformat_pg_llc_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_llc_header_t * h;
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (llc_header_t),
+ &group_index);
+ pg_llc_header_init (h);
+
+ pg_edit_set_fixed (&h->control, 0x03);
+
+ error = 1;
+ if (! unformat (input, "%U -> %U",
+ unformat_pg_edit,
+ unformat_llc_protocol, &h->src_sap, &h->dst_sap))
+ goto done;
+
+ {
+ llc_main_t * pm = &llc_main;
+ llc_protocol_info_t * pi = 0;
+ pg_node_t * pg_node = 0;
+
+ if (h->dst_sap.type == PG_EDIT_FIXED)
+ {
+ u8 t = *h->dst_sap.values[PG_EDIT_LO];
+ pi = llc_get_protocol_info (pm, t);
+ if (pi && pi->node_index != ~0)
+ pg_node = pg_get_node (pi->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto done;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/map/examples/gen-rules.py b/vnet/vnet/map/examples/gen-rules.py
new file mode 100755
index 00000000000..d6746f79af4
--- /dev/null
+++ b/vnet/vnet/map/examples/gen-rules.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3.4
+
+# Copyright (c) 2015 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ipaddress
+import argparse
+import sys
+
+# map add domain ip4-pfx <pfx> ip6-pfx ::/0 ip6-src <ip6-src> ea-bits-len 0 psid-offset 6 psid-len 6
+# map add rule index <0> psid <psid> ip6-dst <ip6-dst>
+
+parser = argparse.ArgumentParser(description='MAP VPP configuration generator')
+parser.add_argument('-t', action="store", dest="mapmode")
+args = parser.parse_args()
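+
+# Example (assumed invocation): pick one of the generator functions below
+# by name and redirect the CLI output to a file, e.g.:
+#   ./gen-rules.py -t shared11 > shared11.conf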
+
+#
+# 1:1 Shared IPv4 address, shared BR, Terastream
+#
+def terastream():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/22')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ ip6_src = ipaddress.ip_address('cccc:bbbb::')
+ for i in range(ip4_pfx.num_addresses):
+ if not i % 64:
+ ip6_src = ip6_src + 1
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-src " + str(ip6_src) +
+ " ea-bits-len 0 psid-offset 0 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+#
+# 1:1 Shared IPv4 address, shared BR, OTE
+#
+def oteshared11():
+ ip4_pfx = ipaddress.ip_network('2.84.63.0/24')
+ dst = list(ipaddress.ip_network('2a02:580:8c00::/40').subnets(new_prefix=56))
+ psid_len = 6
+ ip6_src = ipaddress.ip_address('2a02::')
+ for i in range(ip4_pfx.num_addresses):
+ if not i % 64:
+ ip6_src = ip6_src + 1
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-src " + str(ip6_src) +
+ " ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ enduserprefix = list(dst.pop(0).subnets(new_prefix=64))[255-1]
+ print("map add rule index", i, "psid", psid, "ip6-dst", enduserprefix[(i * (0x1<<psid_len)) + psid])
+
+
+#
+# 1:1 Shared IPv4 address, shared BR, Terastream
+#
+def confdterastream():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/22')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ ip6_src = ipaddress.ip_address('cccc:bbbb::')
+ for i in range(ip4_pfx.num_addresses):
+ if not i % 64:
+ ip6_src = ip6_src + 1
+ print("vpp softwire softwire-instances softwire-instance", i, "br-ipv6 " + str(ip6_src) + " ipv6-prefix ::/0" + " ipv4-prefix " + str(ip4_pfx[i]) +
+ "/32 ea-len 0 psid-offset 6 psid-len", psid_len)
+# print("vpp softwire softwire-instances softwire-instance", i, "ipv4-pfx " + str(ip4_pfx[i]) + "/32 ipv6-pfx ::/0 br-ipv6 " + str(ip6_src) +
+# " ea-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("binding", psid, "ipv6-addr", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+def shared11br_yang():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("vpp softwire softwire-instances softwire-instance " + str(i) + " ipv4-prefix " + str(ip4_pfx[i]) + "/32 " +
+ "ipv6-prefix ::/0 ea-len 0 psid-offset 6 tunnel-mtu 1234 psid-len", psid_len)
+ #print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-shared-src cccc:bbbb::1",
+ # "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ # print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+ print("binding", psid, "ipv6-addr", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+def shared11br_xml():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/32')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ ip6_src = ipaddress.ip_address('cccc:bbbb::')
+ psid_len = 6
+    print('<vpp xmlns="http://www.cisco.com/yang/cisco-vpp"><softwire><softwire-instances>')
+    count = 1024
+    for i in range(ip4_pfx.num_addresses):
+        if not i % 64:
+            ip6_src = ip6_src + 1
+        if count == 0:
+            break
+        count -= 1
+ print('<softwire-instance>')
+ print(' <id>'+ str(i)+ '</id>')
+ print(' <ipv4-prefix>'+ str(ip4_pfx[i])+ '/32</ipv4-prefix>')
+ print(' <ipv6-prefix>::/0</ipv6-prefix>')
+ print(' <ea-len>0</ea-len>')
+ print(' <psid-offset>0</psid-offset>')
+ print(' <psid-len>'+ str(psid_len) + '</psid-len>')
+ for psid in range(0x1 << psid_len):
+ print(' <binding>')
+ print(' <psid>', psid, '</psid>')
+ print(' <ipv6-addr>'+ str(ip6_dst[(i * (0x1<<psid_len)) + psid]) + '</ipv6-addr>')
+ print(' </binding>')
+ print('</softwire-instance>')
+ print('</softwire-instances></softwire>')
+ print('</vpp>')
+
+#
+# 1:1 Shared IPv4 address, shared BR
+#
+def shared11br():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-shared-src cccc:bbbb::1",
+ "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+
+#
+# 1:1 Shared IPv4 address
+#
+def shared11():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_src = ipaddress.ip_network('cccc:bbbb::/64')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-src", ip6_src[i],
+ "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+#
+# 1:1 Shared IPv4 address small
+#
+def smallshared11():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/24')
+ ip6_src = ipaddress.ip_network('cccc:bbbb::/64')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-src", ip6_src[i],
+ "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+#
+# 1:1 Full IPv4 address
+#
+def full11():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_src = ipaddress.ip_network('cccc:bbbb::/64')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 0
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx " + str(ip6_dst[i]) + "/128 ip6-src", ip6_src[i],
+ "ea-bits-len 0 psid-offset 0 psid-len 0")
+def full11br():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 0
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx " + str(ip6_dst[i]) + "/128 ip6-shared-src cccc:bbbb::1",
+ "ea-bits-len 0 psid-offset 0 psid-len 0")
+
+#
+# Algorithmic mapping Shared IPv4 address
+#
+def algo():
+ print("map add domain ip4-pfx 20.0.0.0/24 ip6-pfx bbbb::/32 ip6-src cccc:bbbb::1 ea-bits-len 16 psid-offset 6 psid-len 8")
+ print("map add domain ip4-pfx 20.0.1.0/24 ip6-pfx bbbb:1::/32 ip6-src cccc:bbbb::2 ea-bits-len 8 psid-offset 0 psid-len 0")
+
+#
+# IP4 forwarding
+#
+def ip4():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ for i in range(ip4_pfx.num_addresses):
+ print("ip route add " + str(ip4_pfx[i]) + "/32 via 172.16.0.2")
+
+
+if args.mapmode not in globals():
+    parser.error("missing or unknown mode; use -t <mode>, e.g. -t shared11")
+globals()[args.mapmode]()
+
+
diff --git a/vnet/vnet/map/examples/map-test.py b/vnet/vnet/map/examples/map-test.py
new file mode 100755
index 00000000000..01f377fb6ee
--- /dev/null
+++ b/vnet/vnet/map/examples/map-test.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+# Copyright (c) 2015 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys, time
+from scapy.all import *
+
+import mapalgs
+
+
+ifname = "vpp-tap"
+
+loc_v4_mac = "aa:aa:aa:aa:aa:a4"
+loc_v6_mac = "aa:aa:aa:aa:aa:a6"
+vpp_mac = "aa:aa:aa:aa:00:00"
+
+map_t = 1
+
+fragsize = 0
+map_mtu = 200
+
+def mac_to_vppmac(mac):
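+    # e.g. "aa:aa:aa:aa:00:00" -> "aaaa.aaaa.0000" (VPP CLI MAC format)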
+ mac = mac.replace(':', '')
+ return mac[0:4]+"."+mac[4:8]+"."+mac[8:12]
+
+
+map = mapalgs.MapCalc( rulev6 = 'bbbb::/32',
+ rulev4 = '20.0.0.0/24',
+ ratio = 256);
+
+dmr = mapalgs.DmrCalc('cccc:bbbb::/96')
+
+
+ICMP_TYPES_CODES = {
+ 0: 0,
+ 3: 15,
+ 4: 0,
+ 5: 3,
+ 6: 0,
+ 8: 0,
+ 9: 0,
+ 10: 0,
+ 11: 1,
+ 12: 2,
+ 13: 0,
+ 14: 0,
+ 15: 0,
+ 16: 0,
+ 17: 0,
+ 18: 0
+}
+
+ICMP6_TYPES_CODES = {
+ 1: 7,
+ 2: 0,
+ 3: 1,
+ 4: 3,
+}
+
+def net_conf():
+ c = ""
+ c += "tap connect "+ifname+" hwaddr "+mac_to_vppmac(vpp_mac)+" \n"
+ c += "set int state tap-0 up \n"
+ c += "set ip6 neighbor tap-0 2001:f00d::1 "+mac_to_vppmac(loc_v6_mac)+" \n"
+ c += "set ip arp tap-0 10.0.0.1 "+mac_to_vppmac(loc_v4_mac)+" \n"
+ c += "ip route add ::/0 via 2001:f00d::1 tap-0 \n"
+ c += "ip route add 0.0.0.0/0 via 10.0.0.1 tap-0 \n"
+ return c
+
+def conf():
+ c = net_conf()
+ c += "map add domain ip4-pfx 20.0.0.0/24 ip6-pfx bbbb::/32 ea-bits-len 16 psid-offset 6 psid-len 8"
+ if map_mtu != 0:
+ c += " mtu "+str(map_mtu)
+ if map_t:
+ c += " ip6-src cccc:bbbb::/96 map-t"
+ else:
+ c += " ip6-src cccc:bbbb::ffff"
+
+ c += "\n"
+ return c
+
+def send_packet(ip_header, ip_content):
+ print("Send packet")
+ if fragsize != 0:
+ if ip_header.version == 4:
+ frags = fragment(ip_header/ip_content, fragsize=fragsize)
+ for f in frags:
+ print("Fragmented IPv4 packet")
+ sendp(Ether(dst=vpp_mac, src=loc_v4_mac)/f, iface=ifname)
+ elif ip_header.version == 6:
+ frags = fragment6(ip_header/IPv6ExtHdrFragment()/ip_content, fragsize)
+ for f in frags:
+ print("Fragmented IPv6 packet")
+ sendp(Ether(dst=vpp_mac, src=loc_v6_mac)/f, iface=ifname)
+ else:
+ sendp(Ether(dst=vpp_mac)/ip_header/ip_content, iface=ifname)
+
+def send_packet_frag_inner(packet, inner_header, inner_content):
+ print("Send packet with inner ICMP packet")
+ if fragsize != 0:
+ if packet.version == 4:
+ frags = fragment(inner_header/inner_content, fragsize=fragsize)
+ for f in frags:
+ print("Fragmented IPv4 inner packet")
+ sendp(Ether(dst=vpp_mac, src=loc_v4_mac)/packet/f, iface=ifname)
+ elif packet.version == 6:
+ frags = fragment6(inner_header/IPv6ExtHdrFragment()/inner_content, fragsize)
+ for f in frags:
+ print("Fragmented IPv6 inner packet")
+ sendp(Ether(dst=vpp_mac, src=loc_v6_mac)/packet/f, iface=ifname)
+ else:
+ sendp(Ether(dst=vpp_mac)/packet/inner_header/inner_content, iface=ifname)
+
+
+def sendv6udp(src, dst, port):
+ psid = map.gen_psid(port)
+ ceaddr = str(map.get_mapce_addr(src, psid))
+ dst = str(dmr.embed_6052addr(dst))
+ send_packet(IPv6(dst=dst, src=ceaddr), UDP(sport=port)/('X'*900))
+
+def sendv6tcp(src, dst, port):
+ psid = map.gen_psid(port)
+ ceaddr = str(map.get_mapce_addr(src, psid))
+ dst = str(dmr.embed_6052addr(dst))
+ send_packet(IPv6(dst=dst, src=ceaddr), TCP(sport=port)/('X'*900))
+
+def sendv4udp(src, dst, port):
+ send_packet(IP(dst=dst, src=src), UDP(dport=port)/('X'*900))
+
+def sendv4tcp(src, dst, port):
+ send_packet(IP(dst=dst, src=src), TCP(dport=port)/('X'*900))
+
+def sendv6ping(src, dst, id):
+ psid = map.gen_psid(id)
+ ceaddr = str(map.get_mapce_addr(src, psid))
+ dst = str(dmr.embed_6052addr(dst))
+ send_packet(IPv6(dst=dst, src=ceaddr), ICMPv6EchoRequest(id=id, data='A'*500))
+ send_packet(IPv6(dst=dst, src=ceaddr), ICMPv6EchoReply(id=id, data='A'*500))
+
+def sendv4ping(src, dst, id):
+ send_packet(IP(dst=dst, src=src), ICMP(id=id, type=0)/('X'*500))
+ send_packet(IP(dst=dst, src=src), ICMP(id=id, type=8)/('X'*500))
+
+def sendv4icmperr(src, dst, type, code, port, inner_src, inner_dst, payload_length):
+ inner = IP(dst=inner_dst, src=inner_src)/TCP(sport=port, dport=8888)/('X'*payload_length)
+ send_packet_frag_inner(IP(dst=dst, src=src)/ICMP(type=type, code=code), IP(dst=inner_dst, src=inner_src), TCP(sport=port, dport=8888)/('X'*payload_length))
+ #send_packet(IP(dst=dst, src=src)/ICMP(type=type, code=code)/inner)
+
+def sendv6icmperr(src, dst, type, code, port, payload_length):
+ psid = map.gen_psid(port)
+ src = str(map.get_mapce_addr(src, psid))
+ dst = str(dmr.embed_6052addr(dst))
+ inner_header = IPv6(dst=src, src=dst)
+ inner_content = TCP(sport=8888, dport=port)/('X'*payload_length)
+ send_packet_frag_inner(IPv6(dst=dst, src=src)/ICMPv6DestUnreach(type=type, code=code), inner_header, inner_content)
+ #send_packet(IPv6(dst=dst, src=src)/ICMPv6DestUnreach(type=type, code=code)/inner)
+
+def sendv4icmp_errors(src, dst, port, inner_src, inner_dst, payload_length):
+ for type in ICMP_TYPES_CODES:
+ for code in range(0, ICMP_TYPES_CODES[type] + 1):
+ sendv4icmperr(src, dst, type, code, port, inner_src, inner_dst, payload_length)
+ #sendv4icmperr(src, dst, type, ICMP_TYPES_CODES[type] + 2, port, inner_src, inner_dst, payload_length)
+ #sendv4icmperr(src, dst, type, 255, port, inner_src, inner_dst, payload_length)
+ #sendv4icmperr(src, dst, 1, 0, port, inner_src, inner_dst, payload_length)
+ #sendv4icmperr(src, dst, 2, 10, port, inner_src, inner_dst, payload_length)
+ #sendv4icmperr(src, dst, 255, 255, port, inner_src, inner_dst, payload_length)
+
+ #TODO: Check wrong paramater with different pointer values
+
+def sendv6icmp_errors(src, dst, port, payload_length):
+ for type in ICMP6_TYPES_CODES:
+ for code in range(0, ICMP6_TYPES_CODES[type] + 1):
+ sendv6icmperr(src, dst, type, code, port, payload_length)
+ #sendv6icmperr(src, dst, type, ICMP6_TYPES_CODES[type] + 2, port, payload_length)
+ #sendv6icmperr(src, dst, type, 255, port, payload_length)
+
+
+def traffic():
+ delay = 2.0
+ while 1:
+ #sendp(Ether(dst="bb:bb:bb:bb:bb:b4")/IP(dst="20.0.0.1")/UDP(chksum=0)/('X'*900), iface="vpp-tapv4")
+ #sendp(Ether(dst="bb:bb:bb:bb:bb:b6")/IPv6(dst="cccc:bbbb::a000:0001")/ICMPv6EchoRequest()/('X'*900), iface="vpp-tapv6")
+ #sendp(Ether(dst="bb:bb:bb:bb:bb:b6")/IPv6(dst="cccc:bbbb::a000:0001")/UDP()/('X'*900), iface="vpp-tapv6")
+ sendv6udp("20.0.0.1", "10.0.0.1", 12001)
+ sendv6tcp("20.0.0.1", "10.0.0.1", 12002)
+ sendv4udp("10.0.0.1", "20.0.0.1", 12003)
+ sendv4tcp("10.0.0.1", "20.0.0.1", 12004)
+ sendv6ping("20.0.0.1", "10.0.0.1", 12005)
+ sendv4ping("10.0.0.1", "20.0.0.1", 12006)
+ sendv4icmp_errors("10.0.0.1", "20.0.0.1", 12006, "20.0.0.1", "10.0.0.1", 500)
+ sendv4icmp_errors("10.0.0.1", "20.0.0.1", 12006, "20.0.0.1", "10.0.0.1", 1500)
+ sendv6icmp_errors("20.0.0.1", "10.0.0.1", 12006, 500)
+ time.sleep(delay)
+ delay *= 0.9
+
+if len(sys.argv) <= 1:
+ print("Usage: conf|traffic")
+ exit(1)
+
+if sys.argv[1] == "conf":
+ print(conf())
+elif sys.argv[1] == "traffic":
+    traffic()
\ No newline at end of file
diff --git a/vnet/vnet/map/examples/mapalgs.py b/vnet/vnet/map/examples/mapalgs.py
new file mode 100644
index 00000000000..50a0ed0a3ee
--- /dev/null
+++ b/vnet/vnet/map/examples/mapalgs.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+
+# The MIT License (MIT)
+#
+# Copyright (c) 2015
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# File included from https://github.com/ejordangottlieb/pyswmap
+# Thanks to jordan ;)
+# - Pierre
+#
+
+# There is still a great deal of work required on this module. Please
+# use with caution.
+# -Jordan
+
+import sys
+from ipaddress import (
+ IPv6Address,
+ IPv6Network,
+ ip_network,
+ ip_address,
+ )
+from math import (
+ log,
+ )
+
+class MapCalc(object):
+
+ def __init__(self,**bmr):
+ #rulev6,rulev4):
+ self.portranges = False
+
+ # Validate and set BMR and BMR derived values
+ self._check_bmr_values(bmr)
+
+ def _check_bmr_values(self,bmr):
+ # Assume these values have not been supplied. Validate later.
+ self.ealen = False
+ self.ratio = False
+
+ # Validate that a proper PSID Offset has been set
+ if 'psidoffset' not in bmr:
+ # Set Default PSID Offset of 6 if it is not set
+ self.psidoffset = 6
+ else:
+ self.psidoffset = self._psid_offset(bmr['psidoffset'])
+
+ # Validate that a proper IPv4 rule prefix is defined
+ if 'rulev4' not in bmr:
+ print("The rule IPv4 prefix has not been set")
+ sys.exit(1)
+ else:
+ self.rulev4 = self._ipv4_rule(bmr['rulev4'])
+
+ # Validate that a proper IPv6 rule prefix is defined
+ if 'rulev6' not in bmr:
+ print("The rule IPv6 prefix has not been set")
+ sys.exit(1)
+ else:
+ self.rulev6 = self._ipv6_rule(bmr['rulev6'])
+
+ # Check if EA length was passed
+ if 'ealen' not in bmr:
+ self.ealen = False
+ else:
+ self.ealen = bmr['ealen']
+ self.ratio = self._calc_ratio(bmr['ealen'])
+
+ # Check if sharing ratio was passed or calculated by _calc_ratio
+ if 'ratio' not in bmr:
+ # Skip if we have already calculated ratio
+ if not (self.ratio):
+ self.ratio = False
+ else:
+ if (self.ealen):
+ # Check to see if supplied EA length contradicts supplied ratio
+ if ( bmr['ratio'] != self.ratio ):
+ eavalue = "EA value {}".format(self.ealen)
+ sharingratio = "sharing ratio {}".format(bmr['ratio'])
+ print("Supplied {} and {} are contradictory".format(
+ eavalue,
+ sharingratio)
+ )
+ sys.exit(1)
+ else:
+ self.ratio = bmr['ratio']
+ self.ealen = self._calc_ea(bmr['ratio'])
+
+ # EA length or sharing ratio must be set
+ if not ( self.ealen or self.ratio):
+ print("The BMR must include an EA length or sharing ratio")
+ sys.exit(1)
+
+ # Since we have not hit an exception we can calculate the port bits
+ self.portbits = self._calc_port_bits()
+
+ def _ipv4_rule(self,rulev4):
+ try:
+ self.rulev4mask = ip_network(
+ rulev4,
+ strict=False
+ ).prefixlen
+ except ValueError:
+ print("Invalid IPv4 prefix {}".format(rulev4))
+ sys.exit(1)
+
+ self.rulev4object = ip_network(rulev4)
+
+ return rulev4
+
+ def _ipv6_rule(self,rulev6):
+ try:
+ self.rulev6mask = IPv6Network(
+ rulev6,
+ strict=False
+ ).prefixlen
+ except ValueError:
+ print("Invalid IPv6 prefix {}".format(rulev6))
+ sys.exit(1)
+
+ return rulev6
+
+ def _psid_offset(self,psidoffset):
+ PSIDOFFSET_MAX = 6
+ if psidoffset in range(0,PSIDOFFSET_MAX+1):
+ return psidoffset
+ else:
+ print("Invalid PSID Offset value: {}".format(psidoffset))
+ sys.exit(1)
+
+ def _psid_range(self,x):
+ rset = []
+ for i in range(0,x+1):
+ rset.append(2**i)
+ return rset
+
+ def _calc_port_bits(self):
+ portbits = 16 - self.psidoffset - self.psidbits
+ return portbits
+
+ def _calc_ea(self,ratio):
+ if ratio not in ( self._psid_range(16) ):
+ print("Invalid ratio {}".format(ratio))
+ print("Ratio between 2 to the power of 0 thru 16")
+ sys.exit(1)
+
+ if ( 1 == ratio):
+ self.psidbits = 0
+ else:
+ self.psidbits = int(log(ratio,2))
+ ealen = self.psidbits + ( 32 - self.rulev4mask )
+ return ealen
+
+ def _calc_ratio(self,ealen):
+ maskbits = 32 - self.rulev4mask
+ if ( ealen < maskbits ):
+ print("EA of {} incompatible with rule IPv4 prefix {}".format(
+ ealen,
+ self.rulev4,
+ )
+ )
+ print("EA length must be at least {} bits".format(
+ maskbits,
+ )
+ )
+ sys.exit(1)
+
+ self.psidbits = ealen - ( 32 - self.rulev4mask )
+ if ( self.psidbits > 16):
+ print("EA length of {} is too large".format(
+ ealen,
+ )
+ )
+ print("EA should not exceed {} for rule IPv4 prefix {}".format(
+ maskbits + 16,
+ self.rulev4,
+ )
+ )
+ sys.exit(1)
+ ratio = 2**self.psidbits
+ return ratio
+
+ def gen_psid(self,portnum):
+ if ( portnum < self.start_port() ):
+ print("port value is less than allowed by PSID Offset")
+ sys.exit(1)
+ psid = (portnum & ((2**self.psidbits - 1) << self.portbits))
+ psid = psid >> self.portbits
+ return psid
+
+ def port_ranges(self):
+ return 2**self.psidoffset - 1
+
+ def start_port(self):
+ if self.psidoffset == 0: return 0
+ return 2**(16 - self.psidoffset)
+
+ def port_list(self,psid):
+ startrange = psid * (2**self.portbits) + self.start_port()
+ increment = (2**self.psidbits) * (2**self.portbits)
+ portlist = [ ]
+ for port in range(startrange,startrange + 2**self.portbits):
+ if port >= 65536: continue
+ portlist.append(port)
+ for x in range(1,self.port_ranges()):
+ startrange += increment
+ for port in range(startrange,startrange + 2**self.portbits):
+ portlist.append(port)
+ return portlist
+
+ def ipv4_index(self,ipv4addr):
+ if ip_address(ipv4addr) in ip_network(self.rulev4):
+ x = ip_address(ipv4addr)
+ y = ip_network(self.rulev4,strict=False).network_address
+ self.ipv4addr = x
+ return ( int(x) - int(y) )
+ else:
+ print("Error: IPv4 address {} not in Rule IPv4 subnet {}".format(
+                ipv4addr,
+ ip_network(self.rulev4,strict=False).network_address))
+ sys.exit(1)
+
+ def _calc_ipv6bit_pos(self):
+ addroffset = 128 - (self.rulev6mask + ( self.ealen - self.psidbits))
+ psidshift = 128 - ( self.rulev6mask + self.ealen )
+ return [addroffset,psidshift]
+
+ def _append_map_eabits(self,ipv4index,addroffset,psidshift,psid):
+ rulev6base = IPv6Network(self.rulev6,strict=False).network_address
+ map_prefix = int(rulev6base) | ( ipv4index << addroffset )
+ map_fullprefix = map_prefix | ( psid << psidshift)
+ return map_fullprefix
+
+
+ def get_mapce_addr(self,ipv4addr,psid):
+ ipv4index = self.ipv4_index(ipv4addr)
+ (addroffset,psidshift) = self._calc_ipv6bit_pos()
+ map_fullprefix = self._append_map_eabits(ipv4index,
+ addroffset,
+ psidshift,
+ psid)
+ mapv4iid = map_fullprefix | ( int(self.ipv4addr) << 16 )
+ map_full_address = mapv4iid | psid
+ mapce_address = "{}".format(IPv6Address(map_full_address))
+ return mapce_address
+
+ def get_mapce_prefix(self,ipv4addr,psid):
+ ipv4index = self.ipv4_index(ipv4addr)
+ (addroffset,psidshift) = self._calc_ipv6bit_pos()
+ map_fullprefix = self._append_map_eabits(ipv4index,
+ addroffset,
+ psidshift,
+ psid)
+ mapce_prefix = "{}/{}".format(
+ IPv6Address(map_fullprefix),
+ self.rulev6mask + self.ealen
+ )
+ return mapce_prefix
+
+ def get_map_ipv4(self,mapce_address):
+ ipv4 = (int(IPv6Address(mapce_address)) & ( 0xffffffff << 16 )) >> 16
+ return ip_address(ipv4)
+
+
+
+class DmrCalc(object):
+
+ def __init__(self,dmr):
+
+ # Validate and set BMR and BMR derived values
+ self.dmrprefix = self._check_dmr_prefix(dmr)
+
+    def embed_6052addr(self,ipv4addr):
+        # Build the RFC 6052 IPv4-embedded IPv6 address for any of the
+        # permitted DMR prefix lengths; bits 64-71 (the "u" octet) stay zero.
+        try:
+            ipv4addrint = int(ip_address(ipv4addr))
+        except ValueError:
+            print("Invalid IPv4 address {}".format(ipv4addr))
+            sys.exit(1)
+
+        plen = self.dmrprefix.prefixlen
+        if plen == 32:
+            ipv6int = ipv4addrint << 64
+        elif plen == 40:
+            ipv6int = ((ipv4addrint >> 8) << 64) | ((ipv4addrint & 0xff) << 48)
+        elif plen == 48:
+            ipv6int = ((ipv4addrint >> 16) << 64) | ((ipv4addrint & 0xffff) << 40)
+        elif plen == 56:
+            ipv6int = ((ipv4addrint >> 24) << 64) | ((ipv4addrint & 0xffffff) << 32)
+        elif plen == 64:
+            ipv6int = ipv4addrint << 24
+        else: # /96
+            ipv6int = ipv4addrint
+
+        ipv6int += int(self.dmrprefix.network_address)
+        return IPv6Address(ipv6int)
+
+ def _check_dmr_prefix(self,dmrprefix):
+ try:
+ self.dmrmask = IPv6Network(
+ dmrprefix,
+ strict=False
+ ).prefixlen
+ except ValueError:
+ print("Invalid IPv6 prefix {}".format(prefix))
+ sys.exit(1)
+
+ if self.dmrmask not in (32,40,48,56,64,96):
+ print("Invalid prefix mask /{}".format(self.dmrmask))
+ sys.exit(1)
+
+ return IPv6Network(dmrprefix)
+
+if __name__ == "__main__":
+ m = DmrCalc('fd80::/48')
+ print(m.dmrprefix)
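+    print(m.embed_6052addr('192.0.2.1'))
+
+    # A minimal MapCalc sketch with hypothetical BMR values: derive the
+    # PSID for a port, then the resulting MAP CE address and prefix.
+    bmr = MapCalc(rulev4='192.0.2.0/24', rulev6='2001:db8::/40', ratio=64)
+    psid = bmr.gen_psid(12320)
+    print("ealen {} psidbits {} portbits {} psid {}".format(
+        bmr.ealen, bmr.psidbits, bmr.portbits, psid))
+    print(bmr.get_mapce_addr('192.0.2.18', psid))
+    print(bmr.get_mapce_prefix('192.0.2.18', psid))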
diff --git a/vnet/vnet/map/examples/mt-test.py b/vnet/vnet/map/examples/mt-test.py
new file mode 100644
index 00000000000..62d269c7a13
--- /dev/null
+++ b/vnet/vnet/map/examples/mt-test.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2009-2014 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import threading
+import time
+from scapy.all import *
+from Queue import *
+
+iface = 'veth1'
+
+class SnifferThread(threading.Thread) :
+ def __init__(self,q,iface,flt,timeout) :
+ threading.Thread.__init__(self)
+ self.q = q
+ self.iface = iface
+ self.timeout = timeout
+ self.flt = flt
+ print("Sniffers reporting for service on ",self.iface)
+
+ def run(self) :
+ conf.iface=self.iface
+ conf.iface6=self.iface
+
+ r = sniff(filter=self.flt,iface=self.iface,timeout=self.timeout,prn=lambda x: x.summary())
+ self.q.put(r)
+
+
+
+# New "SR" function
+# Fire off thread with filter and expected answer packet(s).
+# Fire off sniffer thread, main thread sends packet
+# Returns true if found
+
+def sr2(answer, *args, **kwargs):
+ q = Queue()
+ print("Creating SnifferThreadWorkerThread")
+ flt='ip proto 41'
+ iface='veth1'
+ sniffer = SnifferThread(q,iface,flt,1)
+ sniffer.setDaemon(True)
+ sniffer.start()
+
+ print "Sending packet:"
+ send(*args, **kwargs)
+ sniffer.join()
+ ps = q.get()
+
+# ps.summary()
+ print "Number of packets sniffed:", len(ps)
+
+ for p in ps:
+ ip = p.getlayer(1)
+ print "Comparing", ip.summary(), "and", answer.summary()
+ if ip == answer:
+ print "We have a match!!"
+ return True
+ return False
+
+aip6 = IPv6(dst='2002:0a0a:0a0a::12')/ICMPv6EchoRequest()
+answer= IP(src="10.0.0.100",dst="10.10.10.10",ttl=63)/aip6
+packet = IPv6(dst='2002:0a0a:0a0a::12')/ICMPv6EchoRequest()
+
+# From IPv6
+sr2(answer, packet,iface='veth1')
+
+#From IPv4
+packet = IP(src='10.10.10.10',dst='10.0.0.100')/IPv6(src='2002:0a0a:0a0a::12',dst='1::2')/ICMPv6EchoRequest()
+sr2(answer, packet,iface='veth1')
diff --git a/vnet/vnet/map/gen-rules.py b/vnet/vnet/map/gen-rules.py
new file mode 100755
index 00000000000..533a8e237f7
--- /dev/null
+++ b/vnet/vnet/map/gen-rules.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2015 Cisco and/or its affiliates.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import ipaddress
+import argparse
+import sys
+
+# map add domain ip4-pfx <pfx> ip6-pfx ::/0 ip6-src <ip6-src> ea-bits-len 0 psid-offset 6 psid-len 6
+# map add rule index <0> psid <psid> ip6-dst <ip6-dst>
+
+parser = argparse.ArgumentParser(description='MAP VPP configuration generator')
+parser.add_argument('-t', action="store", dest="mapmode")
+args = parser.parse_args()
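+
+# Example (hypothetical invocation): pick one of the generator functions
+# below with -t and redirect the output to a config file, e.g.:
+#   ./gen-rules.py -t shared11 > shared11.conf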
+
+#
+# 1:1 Shared IPv4 address, shared BR
+#
+def shared11br():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-shared-src cccc:bbbb::1",
+ "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+
+#
+# 1:1 Shared IPv4 address
+#
+def shared11():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_src = ipaddress.ip_network('cccc:bbbb::/64')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-src", ip6_src[i],
+ "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+#
+# 1:1 Shared IPv4 address small
+#
+def smallshared11():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/24')
+ ip6_src = ipaddress.ip_network('cccc:bbbb::/64')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 6
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx ::/0 ip6-src", ip6_src[i],
+ "ea-bits-len 0 psid-offset 6 psid-len", psid_len)
+ for psid in range(0x1 << psid_len):
+ print("map add rule index", i, "psid", psid, "ip6-dst", ip6_dst[(i * (0x1<<psid_len)) + psid])
+
+#
+# 1:1 Full IPv4 address
+#
+def full11():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_src = ipaddress.ip_network('cccc:bbbb::/64')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 0
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx " + str(ip6_dst[i]) + "/128 ip6-src", ip6_src[i],
+ "ea-bits-len 0 psid-offset 0 psid-len 0")
+def full11br():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ ip6_dst = ipaddress.ip_network('bbbb::/32')
+ psid_len = 0
+ for i in range(ip4_pfx.num_addresses):
+ print("map add domain ip4-pfx " + str(ip4_pfx[i]) + "/32 ip6-pfx " + str(ip6_dst[i]) + "/128 ip6-shared-src cccc:bbbb::1",
+ "ea-bits-len 0 psid-offset 0 psid-len 0")
+
+#
+# Algorithmic mapping Shared IPv4 address
+#
+def algo():
+ print("map add domain ip4-pfx 20.0.0.0/24 ip6-pfx bbbb::/32 ip6-src cccc:bbbb::1 ea-bits-len 16 psid-offset 6 psid-len 8")
+ print("map add domain ip4-pfx 20.0.1.0/24 ip6-pfx bbbb:1::/32 ip6-src cccc:bbbb::2 ea-bits-len 8 psid-offset 0 psid-len 0")
+
+#
+# IP4 forwarding
+#
+def ip4():
+ ip4_pfx = ipaddress.ip_network('20.0.0.0/16')
+ for i in range(ip4_pfx.num_addresses):
+ print("ip route add " + str(ip4_pfx[i]) + "/32 via 172.16.0.2")
+
+
+modes = ('shared11br', 'shared11', 'smallshared11', 'full11', 'full11br', 'algo', 'ip4')
+if args.mapmode in modes:
+    globals()[args.mapmode]()
+else:
+    print("Usage: gen-rules.py -t " + "|".join(modes))
+    sys.exit(1)
+
+
diff --git a/vnet/vnet/map/ip4_map.c b/vnet/vnet/map/ip4_map.c
new file mode 100644
index 00000000000..cf53ef4918c
--- /dev/null
+++ b/vnet/vnet/map/ip4_map.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Defines used for testing various optimisation schemes
+ */
+#define MAP_ENCAP_DUAL 0
+
+#include "map.h"
+#include "../ip/ip_frag.h"
+
+vlib_node_registration_t ip4_map_reass_node;
+
+enum ip4_map_next_e {
+ IP4_MAP_NEXT_IP6_LOOKUP,
+#ifdef MAP_SKIP_IP6_LOOKUP
+ IP4_MAP_NEXT_IP6_REWRITE,
+#endif
+ IP4_MAP_NEXT_FRAGMENT,
+ IP4_MAP_NEXT_REASS,
+ IP4_MAP_NEXT_DROP,
+ IP4_MAP_N_NEXT,
+};
+
+enum ip4_map_reass_next_t {
+ IP4_MAP_REASS_NEXT_IP6_LOOKUP,
+ IP4_MAP_REASS_NEXT_IP4_FRAGMENT,
+ IP4_MAP_REASS_NEXT_DROP,
+ IP4_MAP_REASS_N_NEXT,
+};
+
+typedef struct {
+ u32 map_domain_index;
+ u16 port;
+ u8 cached;
+} map_ip4_map_reass_trace_t;
+
+u8 *
+format_ip4_map_reass_trace (u8 *s, va_list *args)
+{
+ CLIB_UNUSED(vlib_main_t *vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED(vlib_node_t *node) = va_arg (*args, vlib_node_t *);
+ map_ip4_map_reass_trace_t *t = va_arg (*args, map_ip4_map_reass_trace_t *);
+ return format(s, "MAP domain index: %d L4 port: %u Status: %s", t->map_domain_index,
+ t->port, t->cached?"cached":"forwarded");
+}
+
+/*
+ * ip4_map_get_port
+ * Extract the L4 port from a TCP/UDP packet, an ICMP echo request/reply,
+ * or the embedded packet of an ICMP error message. Returns 0 (network
+ * byte order) when no port can be determined.
+ */
+u16
+ip4_map_get_port (ip4_header_t *ip, map_dir_e dir)
+{
+ /* Find port information */
+ if (PREDICT_TRUE((ip->protocol == IP_PROTOCOL_TCP) ||
+ (ip->protocol == IP_PROTOCOL_UDP))) {
+ udp_header_t *udp = (void *)(ip + 1);
+ return (dir == MAP_SENDER ? udp->src_port : udp->dst_port);
+ } else if (ip->protocol == IP_PROTOCOL_ICMP) {
+ /*
+ * 1) ICMP Echo request or Echo reply
+ * 2) ICMP Error with inner packet being UDP or TCP
+ * 3) ICMP Error with inner packet being ICMP Echo request or Echo reply
+ */
+ icmp46_header_t *icmp = (void *)(ip + 1);
+ if (icmp->type == ICMP4_echo_request || icmp->type == ICMP4_echo_reply) {
+ return *((u16 *)(icmp + 1));
+ } else if (clib_net_to_host_u16(ip->length) >= 64) { // IP + ICMP + IP + L4 header
+ ip4_header_t *icmp_ip = (ip4_header_t *)(icmp + 2);
+ if (PREDICT_TRUE((icmp_ip->protocol == IP_PROTOCOL_TCP) ||
+ (icmp_ip->protocol == IP_PROTOCOL_UDP))) {
+ udp_header_t *udp = (void *)(icmp_ip + 1);
+ return (dir == MAP_SENDER ? udp->dst_port : udp->src_port);
+ } else if (icmp_ip->protocol == IP_PROTOCOL_ICMP) {
+ icmp46_header_t *inner_icmp = (void *)(icmp_ip + 1);
+ if (inner_icmp->type == ICMP4_echo_request || inner_icmp->type == ICMP4_echo_reply)
+ return (*((u16 *)(inner_icmp + 1)));
+ }
+ }
+ }
+ return (0);
+}
+
+static_always_inline u16
+ip4_map_port_and_security_check (map_domain_t *d, ip4_header_t *ip, u32 *next, u8 *error)
+{
+ u16 port = 0;
+
+ if (d->psid_length > 0) {
+ if (!ip4_is_fragment(ip)) {
+ if (PREDICT_FALSE((ip->ip_version_and_header_length != 0x45) || clib_host_to_net_u16(ip->length) < 28)) {
+ return 0;
+ }
+ port = ip4_map_get_port(ip, MAP_RECEIVER);
+ if (port) {
+ /* Verify that port is not among the well-known ports */
+ if ((d->psid_offset > 0) && (clib_net_to_host_u16(port) < (0x1 << (16 - d->psid_offset)))) {
+ *error = MAP_ERROR_ENCAP_SEC_CHECK;
+ } else {
+ return (port);
+ }
+ } else {
+ *error = MAP_ERROR_BAD_PROTOCOL;
+ }
+ } else {
+ *next = IP4_MAP_NEXT_REASS;
+ }
+ }
+ return (0);
+}
+
+/*
+ * ip4_map_vtcfl
+ * Build the IPv6 version/traffic-class/flow-label word: version 6,
+ * traffic class copied from the IPv4 TOS (unless overridden), and the
+ * buffer's flow hash as the 20-bit flow label.
+ */
+static_always_inline u32
+ip4_map_vtcfl (ip4_header_t *ip4, vlib_buffer_t *p)
+{
+ map_main_t *mm = &map_main;
+ u8 tc = mm->tc_copy ? ip4->tos : mm->tc;
+ u32 vtcfl = 0x6 << 28;
+ vtcfl |= tc << 20;
+  vtcfl |= vnet_buffer(p)->ip.flow_hash & 0x000fffff;
+
+ return (clib_host_to_net_u32(vtcfl));
+}
+
+static_always_inline bool
+ip4_map_ip6_lookup_bypass (vlib_buffer_t *p0, ip4_header_t *ip)
+{
+#ifdef MAP_SKIP_IP6_LOOKUP
+ map_main_t *mm = &map_main;
+ u32 adj_index0 = mm->adj6_index;
+ if (adj_index0 > 0) {
+ ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+ ip_adjacency_t *adj = ip_get_adjacency(lm6, mm->adj6_index);
+ if (adj->n_adj > 1) {
+ u32 hash_c0 = ip4_compute_flow_hash(ip, IP_FLOW_HASH_DEFAULT);
+ adj_index0 += (hash_c0 & (adj->n_adj - 1));
+ }
+ vnet_buffer(p0)->ip.adj_index[VLIB_TX] = adj_index0;
+ return (true);
+ }
+#endif
+ return (false);
+}
+
+/*
+ * ip4_map
+ * Main MAP-E node: encapsulates IPv4 packets into IPv6 according to the
+ * matching MAP domain.
+ */
+static uword
+ip4_map (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip4_map_node.index);
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ map_main_t *mm = &map_main;
+ vlib_combined_counter_main_t *cm = mm->domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ /* Dual loop */
+ while (n_left_from > 4 && n_left_to_next > 2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ map_domain_t *d0, *d1;
+ u8 error0 = MAP_ERROR_NONE, error1 = MAP_ERROR_NONE;
+ ip4_header_t *ip40, *ip41;
+ u16 port0 = 0, port1 = 0;
+ ip6_header_t *ip6h0, *ip6h1;
+ u32 map_domain_index0 = ~0, map_domain_index1 = ~0;
+ u32 next0 = IP4_MAP_NEXT_IP6_LOOKUP, next1 = IP4_MAP_NEXT_IP6_LOOKUP;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t *p2, *p3;
+
+ p2 = vlib_get_buffer(vm, from[2]);
+ p3 = vlib_get_buffer(vm, from[3]);
+
+ vlib_prefetch_buffer_header(p2, STORE);
+ vlib_prefetch_buffer_header(p3, STORE);
+        /* Prefetch the IPv4 header plus 8 bytes of L4 (28 bytes), starting
+           40 bytes early to cover the IPv6 header about to be prepended. */
+ CLIB_PREFETCH (p2->data-40, 68, STORE);
+ CLIB_PREFETCH (p3->data-40, 68, STORE);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next +=2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+ ip40 = vlib_buffer_get_current(p0);
+ ip41 = vlib_buffer_get_current(p1);
+ p0->current_length = clib_net_to_host_u16(ip40->length);
+ p1->current_length = clib_net_to_host_u16(ip41->length);
+ d0 = ip4_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], &map_domain_index0);
+ d1 = ip4_map_get_domain(vnet_buffer(p1)->ip.adj_index[VLIB_TX], &map_domain_index1);
+ ASSERT(d0);
+ ASSERT(d1);
+
+ /*
+ * Shared IPv4 address
+ */
+ port0 = ip4_map_port_and_security_check(d0, ip40, &next0, &error0);
+ port1 = ip4_map_port_and_security_check(d1, ip41, &next1, &error1);
+
+ /* MAP calc */
+ u32 da40 = clib_net_to_host_u32(ip40->dst_address.as_u32);
+ u32 da41 = clib_net_to_host_u32(ip41->dst_address.as_u32);
+ u16 dp40 = clib_net_to_host_u16(port0);
+ u16 dp41 = clib_net_to_host_u16(port1);
+ u64 dal60 = map_get_pfx(d0, da40, dp40);
+ u64 dal61 = map_get_pfx(d1, da41, dp41);
+ u64 dar60 = map_get_sfx(d0, da40, dp40);
+ u64 dar61 = map_get_sfx(d1, da41, dp41);
+ if (dal60 == 0 && dar60 == 0) error0 = MAP_ERROR_UNKNOWN;
+ if (dal61 == 0 && dar61 == 0) error1 = MAP_ERROR_UNKNOWN;
+
+ /* construct ipv6 header */
+ vlib_buffer_advance(p0, - sizeof(ip6_header_t));
+ vlib_buffer_advance(p1, - sizeof(ip6_header_t));
+ ip6h0 = vlib_buffer_get_current(p0);
+ ip6h1 = vlib_buffer_get_current(p1);
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] = (u32)~0;
+ vnet_buffer(p1)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ ip6h0->ip_version_traffic_class_and_flow_label = ip4_map_vtcfl(ip40, p0);
+ ip6h1->ip_version_traffic_class_and_flow_label = ip4_map_vtcfl(ip41, p1);
+ ip6h0->payload_length = ip40->length;
+ ip6h1->payload_length = ip41->length;
+ ip6h0->protocol = IP_PROTOCOL_IP_IN_IP;
+ ip6h1->protocol = IP_PROTOCOL_IP_IN_IP;
+ ip6h0->hop_limit = 0x40;
+ ip6h1->hop_limit = 0x40;
+ ip6h0->src_address = d0->ip6_src;
+ ip6h1->src_address = d1->ip6_src;
+ ip6h0->dst_address.as_u64[0] = clib_host_to_net_u64(dal60);
+ ip6h0->dst_address.as_u64[1] = clib_host_to_net_u64(dar60);
+ ip6h1->dst_address.as_u64[0] = clib_host_to_net_u64(dal61);
+ ip6h1->dst_address.as_u64[1] = clib_host_to_net_u64(dar61);
+
+ /*
+ * Determine next node. Can be one of:
+ * ip6-lookup, ip6-rewrite, ip4-fragment, ip4-virtreass, error-drop
+ */
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE)) {
+ if (PREDICT_FALSE(d0->mtu && (clib_net_to_host_u16(ip6h0->payload_length) + sizeof(*ip6h0) > d0->mtu))) {
+ vnet_buffer(p0)->ip_frag.header_offset = sizeof(*ip6h0);
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
+ vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
+ vnet_buffer(p0)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
+ next0 = IP4_MAP_NEXT_FRAGMENT;
+ } else {
+ next0 = ip4_map_ip6_lookup_bypass(p0, ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0;
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index0, 1,
+ clib_net_to_host_u16(ip6h0->payload_length) + 40);
+ }
+ } else {
+ next0 = IP4_MAP_NEXT_DROP;
+ }
+
+ /*
+ * Determine next node. Can be one of:
+ * ip6-lookup, ip6-rewrite, ip4-fragment, ip4-virtreass, error-drop
+ */
+ if (PREDICT_TRUE(error1 == MAP_ERROR_NONE)) {
+ if (PREDICT_FALSE(d1->mtu && (clib_net_to_host_u16(ip6h1->payload_length) + sizeof(*ip6h1) > d1->mtu))) {
+ vnet_buffer(p1)->ip_frag.header_offset = sizeof(*ip6h1);
+ vnet_buffer(p1)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
+ vnet_buffer(p1)->ip_frag.mtu = d1->mtu;
+ vnet_buffer(p1)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
+ next1 = IP4_MAP_NEXT_FRAGMENT;
+ } else {
+ next1 = ip4_map_ip6_lookup_bypass(p1, ip41) ? IP4_MAP_NEXT_IP6_REWRITE : next1;
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index1, 1,
+ clib_net_to_host_u16(ip6h1->payload_length) + 40);
+ }
+ } else {
+ next1 = IP4_MAP_NEXT_DROP;
+ }
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = map_domain_index0;
+ tr->port = port0;
+ }
+ if (PREDICT_FALSE(p1->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p1, sizeof(*tr));
+ tr->map_domain_index = map_domain_index1;
+ tr->port = port1;
+ }
+
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, n_left_to_next, pi0, pi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ map_domain_t *d0;
+ u8 error0 = MAP_ERROR_NONE;
+ ip4_header_t *ip40;
+ u16 port0 = 0;
+ ip6_header_t *ip6h0;
+ u32 next0 = IP4_MAP_NEXT_IP6_LOOKUP;
+ u32 map_domain_index0 = ~0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip40 = vlib_buffer_get_current(p0);
+ p0->current_length = clib_net_to_host_u16(ip40->length);
+ d0 = ip4_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], &map_domain_index0);
+ ASSERT(d0);
+
+ /*
+ * Shared IPv4 address
+ */
+ port0 = ip4_map_port_and_security_check(d0, ip40, &next0, &error0);
+
+ /* MAP calc */
+ u32 da40 = clib_net_to_host_u32(ip40->dst_address.as_u32);
+ u16 dp40 = clib_net_to_host_u16(port0);
+ u64 dal60 = map_get_pfx(d0, da40, dp40);
+ u64 dar60 = map_get_sfx(d0, da40, dp40);
+ if (dal60 == 0 && dar60 == 0 && error0 == MAP_ERROR_NONE) error0 = MAP_ERROR_UNKNOWN;
+
+ /* construct ipv6 header */
+ vlib_buffer_advance(p0, - (sizeof(ip6_header_t)));
+ ip6h0 = vlib_buffer_get_current(p0);
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ ip6h0->ip_version_traffic_class_and_flow_label = ip4_map_vtcfl(ip40, p0);
+ ip6h0->payload_length = ip40->length;
+ ip6h0->protocol = IP_PROTOCOL_IP_IN_IP;
+ ip6h0->hop_limit = 0x40;
+ ip6h0->src_address = d0->ip6_src;
+ ip6h0->dst_address.as_u64[0] = clib_host_to_net_u64(dal60);
+ ip6h0->dst_address.as_u64[1] = clib_host_to_net_u64(dar60);
+
+ /*
+ * Determine next node. Can be one of:
+ * ip6-lookup, ip6-rewrite, ip4-fragment, ip4-virtreass, error-drop
+ */
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE)) {
+ if (PREDICT_FALSE(d0->mtu && (clib_net_to_host_u16(ip6h0->payload_length) + sizeof(*ip6h0) > d0->mtu))) {
+ vnet_buffer(p0)->ip_frag.header_offset = sizeof(*ip6h0);
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
+ vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
+ vnet_buffer(p0)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
+ next0 = IP4_MAP_NEXT_FRAGMENT;
+ } else {
+ next0 = ip4_map_ip6_lookup_bypass(p0, ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0;
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index0, 1,
+ clib_net_to_host_u16(ip6h0->payload_length) + 40);
+ }
+ } else {
+ next0 = IP4_MAP_NEXT_DROP;
+ }
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = map_domain_index0;
+ tr->port = port0;
+ }
+
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+/*
+ * ip4_map_reass
+ * Virtual reassembly for fragmented IPv4 packets: fragments are cached
+ * until the first fragment reveals the port, then looped back and forwarded.
+ */
+static uword
+ip4_map_reass (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip4_map_reass_node.index);
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ map_main_t *mm = &map_main;
+ vlib_combined_counter_main_t *cm = mm->domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+ u32 *fragments_to_drop = NULL;
+ u32 *fragments_to_loopback = NULL;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ map_domain_t *d0;
+ u8 error0 = MAP_ERROR_NONE;
+ ip4_header_t *ip40;
+ i32 port0 = 0;
+ ip6_header_t *ip60;
+ u32 next0 = IP4_MAP_REASS_NEXT_IP6_LOOKUP;
+ u32 map_domain_index0;
+ u8 cached = 0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ ip40 = (ip4_header_t *)(ip60 + 1);
+ d0 = ip4_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], &map_domain_index0);
+
+ map_ip4_reass_lock();
+ map_ip4_reass_t *r = map_ip4_reass_get(ip40->src_address.as_u32, ip40->dst_address.as_u32,
+ ip40->fragment_id, ip40->protocol, &fragments_to_drop);
+ if (PREDICT_FALSE(!r)) {
+ // Could not create a caching entry
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ } else if (PREDICT_TRUE(ip4_get_fragment_offset(ip40))) {
+ if (r->port >= 0) {
+ // We know the port already
+ port0 = r->port;
+ } else if (map_ip4_reass_add_fragment(r, pi0)) {
+ // Not enough space for caching
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ map_ip4_reass_free(r, &fragments_to_drop);
+ } else {
+ cached = 1;
+ }
+ } else if ((port0 = ip4_get_port(ip40, MAP_RECEIVER, p0->current_length)) < 0) {
+ // Could not find port. We'll free the reassembly.
+ error0 = MAP_ERROR_BAD_PROTOCOL;
+ port0 = 0;
+ map_ip4_reass_free(r, &fragments_to_drop);
+ } else {
+ r->port = port0;
+ map_ip4_reass_get_fragments(r, &fragments_to_loopback);
+ }
+
+#ifdef MAP_IP4_REASS_COUNT_BYTES
+ if (!cached && r) {
+ r->forwarded += clib_host_to_net_u16(ip40->length) - 20;
+ if (!ip4_get_fragment_more(ip40))
+ r->expected_total = ip4_get_fragment_offset(ip40) * 8 + clib_host_to_net_u16(ip40->length) - 20;
+ if(r->forwarded >= r->expected_total)
+ map_ip4_reass_free(r, &fragments_to_drop);
+ }
+#endif
+
+ map_ip4_reass_unlock();
+
+ // NOTE: Most operations have already been performed by ip4_map
+ // All we need is the right destination address
+ ip60->dst_address.as_u64[0] = map_get_pfx_net(d0, ip40->dst_address.as_u32, port0);
+ ip60->dst_address.as_u64[1] = map_get_sfx_net(d0, ip40->dst_address.as_u32, port0);
+
+ if (PREDICT_FALSE(d0->mtu && (clib_net_to_host_u16(ip60->payload_length) + sizeof(*ip60) > d0->mtu))) {
+ vnet_buffer(p0)->ip_frag.header_offset = sizeof(*ip60);
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP6_LOOKUP;
+ vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
+ vnet_buffer(p0)->ip_frag.flags = IP_FRAG_FLAG_IP6_HEADER;
+ next0 = IP4_MAP_REASS_NEXT_IP4_FRAGMENT;
+ }
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_ip4_map_reass_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = map_domain_index0;
+ tr->port = port0;
+ tr->cached = cached;
+ }
+
+ if(cached) {
+ //Dequeue the packet
+ n_left_to_next++;
+ to_next--;
+ } else {
+ if (error0 == MAP_ERROR_NONE)
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index, map_domain_index0, 1,
+ clib_net_to_host_u16(ip60->payload_length) + 40);
+ next0 = (error0 == MAP_ERROR_NONE) ? next0 : IP4_MAP_REASS_NEXT_DROP;
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+ }
+
+      //Loop cached fragments back into this node when we reach the end of the input vector
+ if(n_left_from == 0 && vec_len(fragments_to_loopback)) {
+ from = vlib_frame_vector_args(frame);
+ u32 len = vec_len(fragments_to_loopback);
+ if(len <= VLIB_FRAME_SIZE) {
+ memcpy(from, fragments_to_loopback, sizeof(u32)*len);
+ n_left_from = len;
+ vec_reset_length(fragments_to_loopback);
+ } else {
+ memcpy(from, fragments_to_loopback + (len - VLIB_FRAME_SIZE), sizeof(u32)*VLIB_FRAME_SIZE);
+ n_left_from = VLIB_FRAME_SIZE;
+ _vec_len(fragments_to_loopback) = len - VLIB_FRAME_SIZE;
+ }
+ }
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ map_send_all_to_node(vm, fragments_to_drop, node,
+ &error_node->errors[MAP_ERROR_FRAGMENT_DROPPED],
+ IP4_MAP_REASS_NEXT_DROP);
+
+ vec_free(fragments_to_drop);
+ vec_free(fragments_to_loopback);
+ return frame->n_vectors;
+}
+
+static char *map_error_strings[] = {
+#define _(sym,string) string,
+ foreach_map_error
+#undef _
+};
+
+VLIB_REGISTER_NODE(ip4_map_node) = {
+ .function = ip4_map,
+ .name = "ip4-map",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_error_strings,
+
+ .n_next_nodes = IP4_MAP_N_NEXT,
+ .next_nodes = {
+ [IP4_MAP_NEXT_IP6_LOOKUP] = "ip6-lookup",
+#ifdef MAP_SKIP_IP6_LOOKUP
+ [IP4_MAP_NEXT_IP6_REWRITE] = "ip6-rewrite",
+#endif
+ [IP4_MAP_NEXT_FRAGMENT] = "ip4-frag",
+ [IP4_MAP_NEXT_REASS] = "ip4-map-reass",
+ [IP4_MAP_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip4_map_reass_node) = {
+ .function = ip4_map_reass,
+ .name = "ip4-map-reass",
+ .vector_size = sizeof(u32),
+ .format_trace = format_ip4_map_reass_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_error_strings,
+
+ .n_next_nodes = IP4_MAP_REASS_N_NEXT,
+ .next_nodes = {
+ [IP4_MAP_REASS_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP4_MAP_REASS_NEXT_IP4_FRAGMENT] = "ip4-frag",
+ [IP4_MAP_REASS_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/map/ip4_map_t.c b/vnet/vnet/map/ip4_map_t.c
new file mode 100644
index 00000000000..07f5b19c257
--- /dev/null
+++ b/vnet/vnet/map/ip4_map_t.c
@@ -0,0 +1,1092 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "map.h"
+
+#include "../ip/ip_frag.h"
+
+#define IP4_MAP_T_DUAL_LOOP 1
+
+typedef enum {
+ IP4_MAPT_NEXT_MAPT_TCP_UDP,
+ IP4_MAPT_NEXT_MAPT_ICMP,
+ IP4_MAPT_NEXT_MAPT_FRAGMENTED,
+ IP4_MAPT_NEXT_DROP,
+ IP4_MAPT_N_NEXT
+} ip4_mapt_next_t;
+
+typedef enum {
+ IP4_MAPT_ICMP_NEXT_IP6_LOOKUP,
+ IP4_MAPT_ICMP_NEXT_IP6_FRAG,
+ IP4_MAPT_ICMP_NEXT_DROP,
+ IP4_MAPT_ICMP_N_NEXT
+} ip4_mapt_icmp_next_t;
+
+typedef enum {
+ IP4_MAPT_TCP_UDP_NEXT_IP6_LOOKUP,
+ IP4_MAPT_TCP_UDP_NEXT_IP6_FRAG,
+ IP4_MAPT_TCP_UDP_NEXT_DROP,
+ IP4_MAPT_TCP_UDP_N_NEXT
+} ip4_mapt_tcp_udp_next_t;
+
+typedef enum {
+ IP4_MAPT_FRAGMENTED_NEXT_IP6_LOOKUP,
+ IP4_MAPT_FRAGMENTED_NEXT_IP6_FRAG,
+ IP4_MAPT_FRAGMENTED_NEXT_DROP,
+ IP4_MAPT_FRAGMENTED_N_NEXT
+} ip4_mapt_fragmented_next_t;
+
+//This is used to pass information within the buffer data.
+//The buffer metadata area is too small to carry structures this big.
+typedef CLIB_PACKED(struct {
+ ip6_address_t daddr;
+ ip6_address_t saddr;
+ //IPv6 header + Fragmentation header will be here
+ //sizeof(ip6) + sizeof(ip_frag) - sizeof(ip4)
+ u8 unused[28];
+}) ip4_mapt_pseudo_header_t;
+
+#define frag_id_4to6(id) (id)
+
+//TODO: Find the right place in memory for this.
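+//Translation table for the ICMP 'parameter problem' pointer field,
+//per RFC 6145; ~0 marks IPv4 octets with no IPv6 counterpart (drop).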
+static u8 icmp_to_icmp6_updater_pointer_table[] =
+ { 0, 1, 4, 4,~0,
+ ~0,~0,~0, 7, 6,
+ ~0,~0, 8, 8, 8,
+ 8, 24, 24, 24, 24 };
+
+
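+/* Cache the port learned from a datagram's first fragment so that
+ * subsequent fragments can be translated without an L4 header.
+ * Returns non-zero if no reassembly state could be allocated. */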
+static_always_inline int
+ip4_map_fragment_cache (ip4_header_t *ip4, u16 port)
+{
+ u32 *ignore = NULL;
+ map_ip4_reass_lock();
+ map_ip4_reass_t *r = map_ip4_reass_get(ip4->src_address.as_u32, ip4->dst_address.as_u32,
+ ip4->fragment_id,
+ (ip4->protocol == IP_PROTOCOL_ICMP) ? IP_PROTOCOL_ICMP6 : ip4->protocol,
+ &ignore);
+ if (r)
+ r->port = port;
+
+ map_ip4_reass_unlock();
+ return !r;
+}
+
+static_always_inline i32
+ip4_map_fragment_get_port (ip4_header_t *ip4)
+{
+ u32 *ignore = NULL;
+ map_ip4_reass_lock();
+ map_ip4_reass_t *r = map_ip4_reass_get(ip4->src_address.as_u32, ip4->dst_address.as_u32,
+ ip4->fragment_id,
+ (ip4->protocol == IP_PROTOCOL_ICMP) ? IP_PROTOCOL_ICMP6 : ip4->protocol,
+ &ignore);
+ i32 ret = r?r->port:-1;
+ map_ip4_reass_unlock();
+ return ret;
+}
+
+
+/* Statelessly translates an ICMP packet into ICMPv6.
+ *
+ * Warning: The checksum will need to be recomputed.
+ *
+ */
+static_always_inline int
+ip4_icmp_to_icmp6_in_place (icmp46_header_t *icmp, u32 icmp_len,
+ i32 *receiver_port, ip4_header_t **inner_ip4)
+{
+ *inner_ip4 = NULL;
+ switch (icmp->type) {
+ case ICMP4_echo_reply:
+ *receiver_port = ((u16 *)icmp)[2];
+ icmp->type = ICMP6_echo_reply;
+ break;
+ case ICMP4_echo_request:
+ *receiver_port = ((u16 *)icmp)[2];
+ icmp->type = ICMP6_echo_request;
+ break;
+ case ICMP4_destination_unreachable:
+ *inner_ip4 = (ip4_header_t *)(((u8 *) icmp) + 8);
+ *receiver_port = ip4_get_port(*inner_ip4, MAP_SENDER, icmp_len - 8);
+
+ switch (icmp->code) {
+ case ICMP4_destination_unreachable_destination_unreachable_net: //0
+ case ICMP4_destination_unreachable_destination_unreachable_host: //1
+ icmp->type = ICMP6_destination_unreachable;
+ icmp->code = ICMP6_destination_unreachable_no_route_to_destination;
+ break;
+ case ICMP4_destination_unreachable_protocol_unreachable: //2
+ icmp->type = ICMP6_parameter_problem;
+ icmp->code = ICMP6_parameter_problem_unrecognized_next_header;
+ break;
+ case ICMP4_destination_unreachable_port_unreachable: //3
+ icmp->type = ICMP6_destination_unreachable;
+ icmp->code = ICMP6_destination_unreachable_port_unreachable;
+ break;
+ case ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set: //4
+ icmp->type = ICMP6_packet_too_big;
+ icmp->code = 0;
+ {
+ u32 advertised_mtu = clib_net_to_host_u32(*((u32 *)(icmp + 1)));
+ if (advertised_mtu)
+ advertised_mtu += 20;
+ else
+ advertised_mtu = 1000; //FIXME ! (RFC 1191 - plateau value)
+
+ //FIXME: = minimum(advertised MTU+20, MTU_of_IPv6_nexthop, (MTU_of_IPv4_nexthop)+20)
+ *((u32 *)(icmp + 1)) = clib_host_to_net_u32(advertised_mtu);
+ }
+ break;
+
+ case ICMP4_destination_unreachable_source_route_failed: //5
+ case ICMP4_destination_unreachable_destination_network_unknown: //6
+ case ICMP4_destination_unreachable_destination_host_unknown: //7
+ case ICMP4_destination_unreachable_source_host_isolated: //8
+ case ICMP4_destination_unreachable_network_unreachable_for_type_of_service: //11
+ case ICMP4_destination_unreachable_host_unreachable_for_type_of_service: //12
+ icmp->type = ICMP6_destination_unreachable;
+ icmp->code = ICMP6_destination_unreachable_no_route_to_destination;
+ break;
+ case ICMP4_destination_unreachable_network_administratively_prohibited: //9
+ case ICMP4_destination_unreachable_host_administratively_prohibited: //10
+ case ICMP4_destination_unreachable_communication_administratively_prohibited: //13
+ case ICMP4_destination_unreachable_precedence_cutoff_in_effect: //15
+ icmp->type = ICMP6_destination_unreachable;
+ icmp->code = ICMP6_destination_unreachable_destination_administratively_prohibited;
+ break;
+ case ICMP4_destination_unreachable_host_precedence_violation: //14
+ default:
+ return -1;
+ }
+ break;
+
+ case ICMP4_time_exceeded: //11
+ *inner_ip4 = (ip4_header_t *)(((u8 *) icmp) + 8);
+ *receiver_port = ip4_get_port(*inner_ip4, MAP_SENDER, icmp_len - 8);
+ icmp->type = ICMP6_time_exceeded;
+ //icmp->code = icmp->code //unchanged
+ break;
+
+ case ICMP4_parameter_problem:
+ *inner_ip4 = (ip4_header_t *)(((u8 *) icmp) + 8);
+ *receiver_port = ip4_get_port(*inner_ip4, MAP_SENDER, icmp_len - 8);
+
+ switch (icmp->code) {
+ case ICMP4_parameter_problem_pointer_indicates_error:
+ case ICMP4_parameter_problem_bad_length:
+ icmp->type = ICMP6_parameter_problem;
+ icmp->code = ICMP6_parameter_problem_erroneous_header_field;
+ {
+ u8 ptr = icmp_to_icmp6_updater_pointer_table[*((u8 *)(icmp + 1))];
+ if (ptr == 0xff)
+ return -1;
+
+ *((u32 *)(icmp + 1)) = clib_host_to_net_u32(ptr);
+ }
+ break;
+ default:
+ //All other codes cause dropping the packet
+ return -1;
+ }
+ break;
+
+ default:
+ //All other types cause dropping the packet
+ return -1;
+ break;
+ }
+ return 0;
+}
+
+static_always_inline void
+_ip4_map_t_icmp (map_domain_t *d, vlib_buffer_t *p, u8 *error)
+{
+ ip4_header_t *ip4, *inner_ip4;
+ ip6_header_t *ip6, *inner_ip6;
+ u32 ip_len;
+ icmp46_header_t *icmp;
+ i32 recv_port;
+ ip_csum_t csum;
+ u16 *inner_L4_checksum = 0;
+ ip6_frag_hdr_t *inner_frag;
+ u32 inner_frag_id;
+ u32 inner_frag_offset;
+ u8 inner_frag_more;
+
+ ip4 = vlib_buffer_get_current(p);
+ ip_len = clib_net_to_host_u16(ip4->length);
+ ASSERT(ip_len <= p->current_length);
+
+ icmp = (icmp46_header_t *)(ip4 + 1);
+ if (ip4_icmp_to_icmp6_in_place(icmp, ip_len - sizeof(*ip4),
+ &recv_port, &inner_ip4)) {
+ *error = MAP_ERROR_ICMP;
+ return;
+ }
+
+ if (recv_port < 0) {
+ // In case of 1:1 mapping, we don't care about the port
+ if(d->ea_bits_len == 0 && d->rules) {
+ recv_port = 0;
+ } else {
+ *error = MAP_ERROR_ICMP;
+ return;
+ }
+ }
+
+ if (inner_ip4) {
+ //We have 2 headers to translate.
+ //We need to make some room in the middle of the packet
+
+ if (PREDICT_FALSE(ip4_is_fragment(inner_ip4))) {
+ //Here it starts getting really tricky
+ //We will add a fragmentation header in the inner packet
+
+ if (!ip4_is_first_fragment(inner_ip4)) {
+ //For now we do not handle unless it is the first fragment
+ //Ideally we should handle the case as we are in slow path already
+ *error = MAP_ERROR_FRAGMENTED;
+ return;
+ }
+
+ vlib_buffer_advance(p, - 2*(sizeof(*ip6) - sizeof(*ip4)) - sizeof(*inner_frag));
+ ip6 = vlib_buffer_get_current(p);
+ memcpy(u8_ptr_add(ip6, sizeof(*ip6) - sizeof(*ip4)), ip4, 20 + 8);
+ ip4 = (ip4_header_t *) u8_ptr_add(ip6, sizeof(*ip6) - sizeof(*ip4));
+ icmp = (icmp46_header_t *) (ip4 + 1);
+
+ inner_ip6 = (ip6_header_t *) u8_ptr_add(inner_ip4, sizeof(*ip4) - sizeof(*ip6) - sizeof(*inner_frag));
+ inner_frag = (ip6_frag_hdr_t *) u8_ptr_add(inner_ip6, sizeof(*inner_ip6));
+ ip6->payload_length = u16_net_add(ip4->length, sizeof(*ip6) - 2*sizeof(*ip4) + sizeof(*inner_frag));
+ inner_frag_id = frag_id_4to6(inner_ip4->fragment_id);
+ inner_frag_offset = ip4_get_fragment_offset(inner_ip4);
+ inner_frag_more = !!(inner_ip4->flags_and_fragment_offset & clib_net_to_host_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS));
+ } else {
+ vlib_buffer_advance(p, - 2*(sizeof(*ip6) - sizeof(*ip4)));
+ ip6 = vlib_buffer_get_current(p);
+ memcpy(u8_ptr_add(ip6, sizeof(*ip6) - sizeof(*ip4)), ip4, 20 + 8);
+ ip4 = (ip4_header_t *) u8_ptr_add(ip6, sizeof(*ip6) - sizeof(*ip4));
+ icmp = (icmp46_header_t *) u8_ptr_add(ip4, sizeof(*ip4));
+ inner_ip6 = (ip6_header_t *) u8_ptr_add(inner_ip4, sizeof(*ip4) - sizeof(*ip6));
+ ip6->payload_length = u16_net_add(ip4->length, sizeof(*ip6) - 2*sizeof(*ip4));
+ inner_frag = NULL;
+ }
+
+ if (PREDICT_TRUE(inner_ip4->protocol == IP_PROTOCOL_TCP)) {
+ inner_L4_checksum = &((tcp_header_t *) (inner_ip4 + 1))->checksum;
+ *inner_L4_checksum = ip_csum_fold(ip_csum_sub_even(*inner_L4_checksum, *((u64 *) (&inner_ip4->src_address))));
+ } else if (PREDICT_TRUE(inner_ip4->protocol == IP_PROTOCOL_UDP)) {
+ inner_L4_checksum = &((udp_header_t *) (inner_ip4 + 1))->checksum;
+ if (!*inner_L4_checksum) {
+        //The inner packet was translated earlier and therefore came from IPv6.
+        //Since it came from IPv6, its UDP checksum cannot have been zero.
+ *error = MAP_ERROR_ICMP;
+ return;
+ }
+ *inner_L4_checksum = ip_csum_fold(ip_csum_sub_even(*inner_L4_checksum, *((u64 *)(&inner_ip4->src_address))));
+ } else if (inner_ip4->protocol == IP_PROTOCOL_ICMP) {
+ //We have an ICMP inside an ICMP
+ //It needs to be translated, but not for error ICMP messages
+ icmp46_header_t *inner_icmp = (icmp46_header_t *) (inner_ip4 + 1);
+ csum = inner_icmp->checksum;
+ //Only types ICMP4_echo_request and ICMP4_echo_reply are handled by ip4_icmp_to_icmp6_in_place
+ csum = ip_csum_sub_even(csum, *((u16 *)inner_icmp));
+ inner_icmp->type = (inner_icmp->type == ICMP4_echo_request)?
+ ICMP6_echo_request:ICMP6_echo_reply;
+ csum = ip_csum_add_even(csum, *((u16 *)inner_icmp));
+ csum = ip_csum_add_even(csum, clib_host_to_net_u16(IP_PROTOCOL_ICMP6));
+ csum = ip_csum_add_even(csum, inner_ip4->length - sizeof(*inner_ip4));
+ inner_icmp->checksum = ip_csum_fold(csum);
+ inner_L4_checksum = &inner_icmp->checksum;
+ inner_ip4->protocol = IP_PROTOCOL_ICMP6;
+ } else {
+      ASSERT(0); // A port was found earlier, so the protocol must be TCP, UDP or ICMP
+ }
+
+ //FIXME: Security check with the port found in the inner packet
+
+ csum = *inner_L4_checksum; //Initial checksum of the inner L4 header
+ //FIXME: Shouldn't we remove ip addresses from there ?
+
+ inner_ip6->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32((6 << 28) + (inner_ip4->tos << 20));
+ inner_ip6->payload_length = u16_net_add(inner_ip4->length, - sizeof(*inner_ip4));
+ inner_ip6->hop_limit = inner_ip4->ttl;
+ inner_ip6->protocol = inner_ip4->protocol;
+
+ //Note that the source address is within the domain
+ //while the destination address is the one outside the domain
+ ip4_map_t_embedded_address(d, &inner_ip6->dst_address, &inner_ip4->dst_address);
+ inner_ip6->src_address.as_u64[0] = map_get_pfx_net(d, inner_ip4->src_address.as_u32, recv_port);
+ inner_ip6->src_address.as_u64[1] = map_get_sfx_net(d, inner_ip4->src_address.as_u32, recv_port);
+
+ if (PREDICT_FALSE(inner_frag != NULL)) {
+ inner_frag->next_hdr = inner_ip6->protocol;
+ inner_frag->identification = inner_frag_id;
+ inner_frag->rsv = 0;
+ inner_frag->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(inner_frag_offset, inner_frag_more);
+ inner_ip6->protocol = IP_PROTOCOL_IPV6_FRAGMENTATION;
+ inner_ip6->payload_length = clib_host_to_net_u16(
+ clib_net_to_host_u16(inner_ip6->payload_length) + sizeof(*inner_frag));
+ }
+
+ csum = ip_csum_add_even(csum, inner_ip6->src_address.as_u64[0]);
+ csum = ip_csum_add_even(csum, inner_ip6->src_address.as_u64[1]);
+ csum = ip_csum_add_even(csum, inner_ip6->dst_address.as_u64[0]);
+ csum = ip_csum_add_even(csum, inner_ip6->dst_address.as_u64[1]);
+ *inner_L4_checksum = ip_csum_fold(csum);
+
+ } else {
+ vlib_buffer_advance(p, sizeof(*ip4) - sizeof(*ip6));
+ ip6 = vlib_buffer_get_current(p);
+ ip6->payload_length = clib_host_to_net_u16(clib_net_to_host_u16(ip4->length) - sizeof(*ip4));
+ }
+
+ //Translate outer IPv6
+ ip6->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32((6 << 28) + (ip4->tos << 20));
+
+ ip6->hop_limit = ip4->ttl;
+ ip6->protocol = IP_PROTOCOL_ICMP6;
+
+ ip4_map_t_embedded_address(d, &ip6->src_address, &ip4->src_address);
+ ip6->dst_address.as_u64[0] = map_get_pfx_net(d, ip4->dst_address.as_u32, recv_port);
+ ip6->dst_address.as_u64[1] = map_get_sfx_net(d, ip4->dst_address.as_u32, recv_port);
+
+ //Truncate when the packet exceeds the minimal IPv6 MTU
+ if (p->current_length > 1280) {
+ ip6->payload_length = clib_host_to_net_u16(1280 - sizeof(*ip6));
+ p->current_length = 1280; //Looks too simple to be correct...
+ }
+
+ //TODO: We could do an easy diff-checksum for echo requests/replies
+ //Recompute ICMP checksum
+ icmp->checksum = 0;
+ csum = ip_csum_with_carry(0, ip6->payload_length);
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(ip6->protocol));
+ csum = ip_csum_with_carry(csum, ip6->src_address.as_u64[0]);
+ csum = ip_csum_with_carry(csum, ip6->src_address.as_u64[1]);
+ csum = ip_csum_with_carry(csum, ip6->dst_address.as_u64[0]);
+ csum = ip_csum_with_carry(csum, ip6->dst_address.as_u64[1]);
+ csum = ip_incremental_checksum(csum, icmp, clib_net_to_host_u16(ip6->payload_length));
+ icmp->checksum = ~ip_csum_fold (csum);
+}
+
+static uword
+ip4_map_t_icmp (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip4_map_t_icmp_node.index);
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ vlib_combined_counter_main_t *cm = map_main.domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip4_mapt_icmp_next_t next0;
+ u8 error0;
+ map_domain_t *d0;
+ u16 len0;
+
+ next0 = IP4_MAPT_ICMP_NEXT_IP6_LOOKUP;
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+ error0 = MAP_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ vlib_buffer_advance(p0, sizeof(ip4_mapt_pseudo_header_t)); //The pseudo-header is not used
+ len0 = clib_net_to_host_u16(((ip4_header_t *)vlib_buffer_get_current(p0))->length);
+ d0 = pool_elt_at_index(map_main.domains, vnet_buffer(p0)->map_t.map_domain_index);
+ _ip4_map_t_icmp(d0, p0, &error0);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+ next0 = IP4_MAPT_ICMP_NEXT_IP6_FRAG;
+ }
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index,
+ vnet_buffer(p0)->map_t.map_domain_index, 1,
+ len0);
+ } else {
+ next0 = IP4_MAPT_ICMP_NEXT_DROP;
+ }
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+static uword
+ip4_map_t_fragmented (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ ip6_frag_hdr_t *frag0;
+ ip4_mapt_pseudo_header_t *pheader0;
+ ip4_mapt_fragmented_next_t next0;
+
+ next0 = IP4_MAPT_FRAGMENTED_NEXT_IP6_LOOKUP;
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+
+ //Accessing pseudo header
+ pheader0 = vlib_buffer_get_current(p0);
+ vlib_buffer_advance(p0, sizeof(*pheader0));
+
+ //Accessing ip4 header
+ ip40 = vlib_buffer_get_current(p0);
+ frag0 = (ip6_frag_hdr_t *) u8_ptr_add(ip40, sizeof(*ip40) - sizeof(*frag0));
+ ip60 = (ip6_header_t *) u8_ptr_add(ip40, sizeof(*ip40) - sizeof(*frag0) - sizeof(*ip60));
+ vlib_buffer_advance(p0, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+
+ //We know that the protocol was one of ICMP, TCP or UDP
+ //because the first fragment was found and cached
+ frag0->next_hdr = (ip40->protocol == IP_PROTOCOL_ICMP) ? IP_PROTOCOL_ICMP6 : ip40->protocol;
+ frag0->identification = frag_id_4to6(ip40->fragment_id);
+ frag0->rsv = 0;
+ frag0->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(
+ ip4_get_fragment_offset(ip40),
+ clib_net_to_host_u16(ip40->flags_and_fragment_offset) & IP4_HEADER_FLAG_MORE_FRAGMENTS);
+
+ ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32((6 << 28) + (ip40->tos << 20));
+ ip60->payload_length = clib_host_to_net_u16(clib_net_to_host_u16(ip40->length) - sizeof(*ip40) + sizeof(*frag0));
+ ip60->hop_limit = ip40->ttl;
+ ip60->protocol = IP_PROTOCOL_IPV6_FRAGMENTATION;
+ ip60->dst_address.as_u64[0] = pheader0->daddr.as_u64[0];
+ ip60->dst_address.as_u64[1] = pheader0->daddr.as_u64[1];
+ ip60->src_address.as_u64[0] = pheader0->saddr.as_u64[0];
+ ip60->src_address.as_u64[1] = pheader0->saddr.as_u64[1];
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+ next0 = IP4_MAPT_FRAGMENTED_NEXT_IP6_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+static uword
+ip4_map_t_tcp_udp(vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+#ifdef IP4_MAP_T_DUAL_LOOP
+ while (n_left_from >= 4 && n_left_to_next >= 2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ ip4_header_t *ip40, *ip41;
+ ip6_header_t *ip60, *ip61;
+ ip_csum_t csum0, csum1;
+ u16 *checksum0, *checksum1;
+ ip6_frag_hdr_t *frag0, *frag1;
+ u32 frag_id0, frag_id1;
+ ip4_mapt_pseudo_header_t *pheader0, *pheader1;
+ ip4_mapt_tcp_udp_next_t next0, next1;
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next +=2;
+ n_left_to_next -= 2;
+
+ next0 = IP4_MAPT_TCP_UDP_NEXT_IP6_LOOKUP;
+ next1 = IP4_MAPT_TCP_UDP_NEXT_IP6_LOOKUP;
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+
+ //Accessing pseudo header
+ pheader0 = vlib_buffer_get_current(p0);
+ pheader1 = vlib_buffer_get_current(p1);
+ vlib_buffer_advance(p0, sizeof(*pheader0));
+ vlib_buffer_advance(p1, sizeof(*pheader1));
+
+ //Accessing ip4 header
+ ip40 = vlib_buffer_get_current(p0);
+ ip41 = vlib_buffer_get_current(p1);
+ checksum0 = (u16 *) u8_ptr_add(ip40, vnet_buffer(p0)->map_t.checksum_offset);
+ checksum1 = (u16 *) u8_ptr_add(ip41, vnet_buffer(p1)->map_t.checksum_offset);
+
+ //UDP checksum is optional over IPv4 but mandatory for IPv6
+ //We do not check udp->length sanity but use our safe computed value instead
+ if (PREDICT_FALSE(!*checksum0 && ip40->protocol == IP_PROTOCOL_UDP)) {
+ u16 udp_len = clib_host_to_net_u16(ip40->length) - sizeof(*ip40);
+ udp_header_t *udp = (udp_header_t *) u8_ptr_add(ip40, sizeof(*ip40));
+ ip_csum_t csum;
+ csum = ip_incremental_checksum(0, udp, udp_len);
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(udp_len));
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(IP_PROTOCOL_UDP));
+ csum = ip_csum_with_carry(csum, *((u64 *)(&ip40->src_address)));
+ *checksum0 = ~ip_csum_fold(csum);
+ }
+ if (PREDICT_FALSE(!*checksum1 && ip41->protocol == IP_PROTOCOL_UDP)) {
+ u16 udp_len = clib_host_to_net_u16(ip41->length) - sizeof(*ip40);
+ udp_header_t *udp = (udp_header_t *) u8_ptr_add(ip41, sizeof(*ip40));
+ ip_csum_t csum;
+ csum = ip_incremental_checksum(0, udp, udp_len);
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(udp_len));
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(IP_PROTOCOL_UDP));
+ csum = ip_csum_with_carry(csum, *((u64 *)(&ip41->src_address)));
+ *checksum1 = ~ip_csum_fold(csum);
+ }
+
+ csum0 = ip_csum_sub_even(*checksum0, ip40->src_address.as_u32);
+ csum1 = ip_csum_sub_even(*checksum1, ip41->src_address.as_u32);
+ csum0 = ip_csum_sub_even(csum0, ip40->dst_address.as_u32);
+ csum1 = ip_csum_sub_even(csum1, ip41->dst_address.as_u32);
+
+ // Deal with fragmented packets
+ if (PREDICT_FALSE(ip40->flags_and_fragment_offset &
+ clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS))) {
+ ip60 = (ip6_header_t *) u8_ptr_add(ip40, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+ frag0 = (ip6_frag_hdr_t *) u8_ptr_add(ip40, sizeof(*ip40) - sizeof(*frag0));
+ frag_id0 = frag_id_4to6(ip40->fragment_id);
+ vlib_buffer_advance(p0, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+ } else {
+ ip60 = (ip6_header_t *) (((u8 *)ip40) + sizeof(*ip40) - sizeof(*ip60));
+ vlib_buffer_advance(p0, sizeof(*ip40) - sizeof(*ip60));
+ frag0 = NULL;
+ }
+
+ if (PREDICT_FALSE(ip41->flags_and_fragment_offset &
+ clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS))) {
+ ip61 = (ip6_header_t *) u8_ptr_add(ip41, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+ frag1 = (ip6_frag_hdr_t *) u8_ptr_add(ip41, sizeof(*ip40) - sizeof(*frag0));
+ frag_id1 = frag_id_4to6(ip41->fragment_id);
+ vlib_buffer_advance(p1, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+ } else {
+ ip61 = (ip6_header_t *) (((u8 *)ip41) + sizeof(*ip40) - sizeof(*ip60));
+ vlib_buffer_advance(p1, sizeof(*ip40) - sizeof(*ip60));
+ frag1 = NULL;
+ }
+
+ ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32((6 << 28) + (ip40->tos << 20));
+ ip61->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32((6 << 28) + (ip41->tos << 20));
+ ip60->payload_length = u16_net_add(ip40->length, - sizeof(*ip40));
+ ip61->payload_length = u16_net_add(ip41->length, - sizeof(*ip40));
+ ip60->hop_limit = ip40->ttl;
+ ip61->hop_limit = ip41->ttl;
+ ip60->protocol = ip40->protocol;
+ ip61->protocol = ip41->protocol;
+
+ if (PREDICT_FALSE(frag0 != NULL)) {
+ frag0->next_hdr = ip60->protocol;
+ frag0->identification = frag_id0;
+ frag0->rsv = 0;
+ frag0->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(0, 1);
+ ip60->protocol = IP_PROTOCOL_IPV6_FRAGMENTATION;
+ ip60->payload_length = u16_net_add(ip60->payload_length, sizeof(*frag0));
+ }
+
+ if (PREDICT_FALSE(frag1 != NULL)) {
+ frag1->next_hdr = ip61->protocol;
+ frag1->identification = frag_id1;
+ frag1->rsv = 0;
+ frag1->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(0, 1);
+ ip61->protocol = IP_PROTOCOL_IPV6_FRAGMENTATION;
+ ip61->payload_length = u16_net_add(ip61->payload_length, sizeof(*frag0));
+ }
+
+ //Finally copying the address
+ ip60->dst_address.as_u64[0] = pheader0->daddr.as_u64[0];
+ ip61->dst_address.as_u64[0] = pheader1->daddr.as_u64[0];
+ ip60->dst_address.as_u64[1] = pheader0->daddr.as_u64[1];
+ ip61->dst_address.as_u64[1] = pheader1->daddr.as_u64[1];
+ ip60->src_address.as_u64[0] = pheader0->saddr.as_u64[0];
+ ip61->src_address.as_u64[0] = pheader1->saddr.as_u64[0];
+ ip60->src_address.as_u64[1] = pheader0->saddr.as_u64[1];
+ ip61->src_address.as_u64[1] = pheader1->saddr.as_u64[1];
+
+ csum0 = ip_csum_add_even(csum0, ip60->src_address.as_u64[0]);
+ csum1 = ip_csum_add_even(csum1, ip61->src_address.as_u64[0]);
+ csum0 = ip_csum_add_even(csum0, ip60->src_address.as_u64[1]);
+ csum1 = ip_csum_add_even(csum1, ip61->src_address.as_u64[1]);
+ csum0 = ip_csum_add_even(csum0, ip60->dst_address.as_u64[0]);
+ csum1 = ip_csum_add_even(csum1, ip61->dst_address.as_u64[0]);
+ csum0 = ip_csum_add_even(csum0, ip60->dst_address.as_u64[1]);
+ csum1 = ip_csum_add_even(csum1, ip61->dst_address.as_u64[1]);
+ *checksum0 = ip_csum_fold(csum0);
+ *checksum1 = ip_csum_fold(csum1);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+ next0 = IP4_MAPT_TCP_UDP_NEXT_IP6_FRAG;
+ }
+
+ if(vnet_buffer(p1)->map_t.mtu < p1->current_length) {
+ vnet_buffer(p1)->ip_frag.header_offset = 0;
+ vnet_buffer(p1)->ip_frag.mtu = vnet_buffer(p1)->map_t.mtu;
+ vnet_buffer(p1)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+ next1 = IP4_MAPT_TCP_UDP_NEXT_IP6_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index,
+ to_next, n_left_to_next, pi0, pi1,
+ next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ ip_csum_t csum0;
+ u16 *checksum0;
+ ip6_frag_hdr_t *frag0;
+ u32 frag_id0;
+ ip4_mapt_pseudo_header_t *pheader0;
+ ip4_mapt_tcp_udp_next_t next0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ next0 = IP4_MAPT_TCP_UDP_NEXT_IP6_LOOKUP;
+ p0 = vlib_get_buffer(vm, pi0);
+
+ //Accessing pseudo header
+ pheader0 = vlib_buffer_get_current(p0);
+ vlib_buffer_advance(p0, sizeof(*pheader0));
+
+ //Accessing ip4 header
+ ip40 = vlib_buffer_get_current(p0);
+ checksum0 = (u16 *) u8_ptr_add(ip40, vnet_buffer(p0)->map_t.checksum_offset);
+
+ //UDP checksum is optional over IPv4 but mandatory for IPv6
+ //We do not check udp->length sanity but use our safe computed value instead
+ if (PREDICT_FALSE(!*checksum0 && ip40->protocol == IP_PROTOCOL_UDP)) {
+ u16 udp_len = clib_host_to_net_u16(ip40->length) - sizeof(*ip40);
+ udp_header_t *udp = (udp_header_t *) u8_ptr_add(ip40, sizeof(*ip40));
+ ip_csum_t csum;
+ csum = ip_incremental_checksum(0, udp, udp_len);
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(udp_len));
+ csum = ip_csum_with_carry(csum, clib_host_to_net_u16(IP_PROTOCOL_UDP));
+ csum = ip_csum_with_carry(csum, *((u64 *)(&ip40->src_address)));
+ *checksum0 = ~ip_csum_fold(csum);
+ }
+
+ csum0 = ip_csum_sub_even(*checksum0, ip40->src_address.as_u32);
+ csum0 = ip_csum_sub_even(csum0, ip40->dst_address.as_u32);
+
+ // Deal with fragmented packets
+ if (PREDICT_FALSE(ip40->flags_and_fragment_offset &
+ clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS))) {
+ ip60 = (ip6_header_t *) u8_ptr_add(ip40, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+ frag0 = (ip6_frag_hdr_t *) u8_ptr_add(ip40, sizeof(*ip40) - sizeof(*frag0));
+ frag_id0 = frag_id_4to6(ip40->fragment_id);
+ vlib_buffer_advance(p0, sizeof(*ip40) - sizeof(*ip60) - sizeof(*frag0));
+ } else {
+ ip60 = (ip6_header_t *) (((u8 *)ip40) + sizeof(*ip40) - sizeof(*ip60));
+ vlib_buffer_advance(p0, sizeof(*ip40) - sizeof(*ip60));
+ frag0 = NULL;
+ }
+
+ ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32((6 << 28) + (ip40->tos << 20));
+ ip60->payload_length = u16_net_add(ip40->length, - sizeof(*ip40));
+ ip60->hop_limit = ip40->ttl;
+ ip60->protocol = ip40->protocol;
+
+ if (PREDICT_FALSE(frag0 != NULL)) {
+ frag0->next_hdr = ip60->protocol;
+ frag0->identification = frag_id0;
+ frag0->rsv = 0;
+ frag0->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(0, 1);
+ ip60->protocol = IP_PROTOCOL_IPV6_FRAGMENTATION;
+ ip60->payload_length = u16_net_add(ip60->payload_length, sizeof(*frag0));
+ }
+
+ //Finally, copy the IPv6 source and destination addresses
+ ip60->dst_address.as_u64[0] = pheader0->daddr.as_u64[0];
+ ip60->dst_address.as_u64[1] = pheader0->daddr.as_u64[1];
+ ip60->src_address.as_u64[0] = pheader0->saddr.as_u64[0];
+ ip60->src_address.as_u64[1] = pheader0->saddr.as_u64[1];
+
+ csum0 = ip_csum_add_even(csum0, ip60->src_address.as_u64[0]);
+ csum0 = ip_csum_add_even(csum0, ip60->src_address.as_u64[1]);
+ csum0 = ip_csum_add_even(csum0, ip60->dst_address.as_u64[0]);
+ csum0 = ip_csum_add_even(csum0, ip60->dst_address.as_u64[1]);
+ *checksum0 = ip_csum_fold(csum0);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ //Send to fragmentation node if necessary
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.next_index = IP6_FRAG_NEXT_IP6_LOOKUP;
+ next0 = IP4_MAPT_TCP_UDP_NEXT_IP6_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
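+
+/* Editor's note (explanatory sketch, not part of the original commit):
+ * the translation above never recomputes the L4 checksum from scratch; it
+ * updates it incrementally in the RFC 1624 style. For a checksum C over
+ * data where field m is replaced by m':
+ *
+ *   C' = ~(~C + ~m + m')      (one's-complement arithmetic)
+ *
+ * Here the removed fields are the two 32-bit IPv4 addresses
+ * (ip_csum_sub_even) and the added fields are the four 64-bit halves of
+ * the IPv6 addresses (ip_csum_add_even). The pseudo-header length and
+ * protocol/next-header fields are identical on both sides, so they cancel
+ * and are never touched.
+ */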
+
+static_always_inline void
+ip4_map_t_classify(vlib_buffer_t *p0, map_domain_t *d0, ip4_header_t *ip40, u16 ip4_len0,
+ i32 *dst_port0, u8 *error0, ip4_mapt_next_t *next0)
+{
+ if (PREDICT_FALSE(ip4_get_fragment_offset(ip40))) {
+ *next0 = IP4_MAPT_NEXT_MAPT_FRAGMENTED;
+ if(d0->ea_bits_len == 0 && d0->rules) {
+ *dst_port0 = 0;
+ } else {
+ *dst_port0 = ip4_map_fragment_get_port(ip40);
+ *error0 = (*dst_port0 == -1) ? MAP_ERROR_FRAGMENT_MEMORY : *error0;
+ }
+ } else if (PREDICT_TRUE(ip40->protocol == IP_PROTOCOL_TCP)) {
+ vnet_buffer(p0)->map_t.checksum_offset = 36;
+ *next0 = IP4_MAPT_NEXT_MAPT_TCP_UDP;
+ *error0 = ip4_len0 < 40 ? MAP_ERROR_MALFORMED : *error0;
+ *dst_port0 = (i32) *((u16 *)u8_ptr_add(ip40, sizeof(*ip40) + 2));
+ } else if (PREDICT_TRUE(ip40->protocol == IP_PROTOCOL_UDP)) {
+ vnet_buffer(p0)->map_t.checksum_offset = 26;
+ *next0 = IP4_MAPT_NEXT_MAPT_TCP_UDP;
+ *error0 = ip4_len0 < 28 ? MAP_ERROR_MALFORMED : *error0;
+ *dst_port0 = (i32) *((u16 *)u8_ptr_add(ip40, sizeof(*ip40) + 2));
+ } else if (ip40->protocol == IP_PROTOCOL_ICMP) {
+ *next0 = IP4_MAPT_NEXT_MAPT_ICMP;
+ if(d0->ea_bits_len == 0 && d0->rules)
+ *dst_port0 = 0;
+ else if (((icmp46_header_t *) u8_ptr_add(ip40, sizeof(*ip40)))->type == ICMP4_echo_reply ||
+ ((icmp46_header_t *) u8_ptr_add(ip40, sizeof(*ip40)))->type == ICMP4_echo_request)
+ *dst_port0 = (i32) *((u16 *)u8_ptr_add(ip40, sizeof(*ip40) + 6));
+ } else {
+ *error0 = MAP_ERROR_BAD_PROTOCOL;
+ }
+}
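+
+/* Editor's note (derivation of the literals above, assuming the 20-byte
+ * IPv4 header enforced by the 0x45 version/IHL check in ip4_map_t):
+ *   TCP checksum offset = 20 (IPv4) + 16 (checksum offset in TCP) = 36
+ *   UDP checksum offset = 20 (IPv4) +  6 (checksum offset in UDP) = 26
+ * The destination port is likewise read at offset 20 + 2 for TCP and UDP,
+ * and a 16-bit field at offset 20 + 6 serves as the port surrogate for
+ * ICMP echo messages.
+ */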
+
+static uword
+ip4_map_t (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip4_map_t_node.index);
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ vlib_combined_counter_main_t *cm = map_main.domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+#ifdef IP4_MAP_T_DUAL_LOOP
+ while (n_left_from >= 4 && n_left_to_next >= 2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ ip4_header_t *ip40, *ip41;
+ map_domain_t *d0, *d1;
+ ip4_mapt_next_t next0, next1;
+ u16 ip4_len0, ip4_len1;
+ u8 error0, error1;
+ i32 dst_port0, dst_port1;
+ ip4_mapt_pseudo_header_t *pheader0, *pheader1;
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next +=2;
+ n_left_to_next -= 2;
+ error0 = MAP_ERROR_NONE;
+ error1 = MAP_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+ ip40 = vlib_buffer_get_current(p0);
+ ip41 = vlib_buffer_get_current(p1);
+ ip4_len0 = clib_host_to_net_u16(ip40->length);
+ ip4_len1 = clib_host_to_net_u16(ip41->length);
+
+ if (PREDICT_FALSE(p0->current_length < ip4_len0 ||
+ ip40->ip_version_and_header_length != 0x45)) {
+ error0 = MAP_ERROR_UNKNOWN;
+ next0 = IP4_MAPT_NEXT_DROP;
+ }
+
+ if (PREDICT_FALSE(p1->current_length < ip4_len1 ||
+ ip41->ip_version_and_header_length != 0x45)) {
+ error1 = MAP_ERROR_UNKNOWN;
+ next1 = IP4_MAPT_NEXT_DROP;
+ }
+
+ d0 = ip4_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX],
+ &vnet_buffer(p0)->map_t.map_domain_index);
+ d1 = ip4_map_get_domain(vnet_buffer(p1)->ip.adj_index[VLIB_TX],
+ &vnet_buffer(p1)->map_t.map_domain_index);
+
+ vnet_buffer(p0)->map_t.mtu = d0->mtu ? d0->mtu : ~0;
+ vnet_buffer(p1)->map_t.mtu = d1->mtu ? d1->mtu : ~0;
+
+ dst_port0 = -1;
+ dst_port1 = -1;
+
+ ip4_map_t_classify(p0, d0, ip40, ip4_len0, &dst_port0, &error0, &next0);
+ ip4_map_t_classify(p1, d1, ip41, ip4_len1, &dst_port1, &error1, &next1);
+
+ //Add MAP-T pseudo header in front of the packet
+ vlib_buffer_advance(p0, - sizeof(*pheader0));
+ vlib_buffer_advance(p1, - sizeof(*pheader1));
+ pheader0 = vlib_buffer_get_current(p0);
+ pheader1 = vlib_buffer_get_current(p1);
+
+ //Save addresses within the packet
+ ip4_map_t_embedded_address(d0, &pheader0->saddr, &ip40->src_address);
+ ip4_map_t_embedded_address(d1, &pheader1->saddr, &ip41->src_address);
+ pheader0->daddr.as_u64[0] = map_get_pfx_net(d0, ip40->dst_address.as_u32, (u16)dst_port0);
+ pheader0->daddr.as_u64[1] = map_get_sfx_net(d0, ip40->dst_address.as_u32, (u16)dst_port0);
+ pheader1->daddr.as_u64[0] = map_get_pfx_net(d1, ip41->dst_address.as_u32, (u16)dst_port1);
+ pheader1->daddr.as_u64[1] = map_get_sfx_net(d1, ip41->dst_address.as_u32, (u16)dst_port1);
+
+ if (PREDICT_FALSE(ip4_is_first_fragment(ip40) && (dst_port0 != -1) &&
+ (d0->ea_bits_len != 0 || !d0->rules) &&
+ ip4_map_fragment_cache(ip40, dst_port0))) {
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ }
+
+ if (PREDICT_FALSE(ip4_is_first_fragment(ip41) && (dst_port1 != -1) &&
+ (d1->ea_bits_len != 0 || !d1->rules) &&
+ ip4_map_fragment_cache(ip41, dst_port1))) {
+ error1 = MAP_ERROR_FRAGMENT_MEMORY;
+ }
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index,
+ vnet_buffer(p0)->map_t.map_domain_index, 1,
+ clib_net_to_host_u16(ip40->length));
+ }
+
+ if (PREDICT_TRUE(error1 == MAP_ERROR_NONE && next1 != IP4_MAPT_NEXT_MAPT_ICMP)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index,
+ vnet_buffer(p1)->map_t.map_domain_index, 1,
+ clib_net_to_host_u16(ip41->length));
+ }
+
+ next0 = (error0 != MAP_ERROR_NONE) ? IP4_MAPT_NEXT_DROP : next0;
+ next1 = (error1 != MAP_ERROR_NONE) ? IP4_MAPT_NEXT_DROP : next1;
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next,
+ n_left_to_next, pi0, pi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip4_header_t *ip40;
+ map_domain_t *d0;
+ ip4_mapt_next_t next0;
+ u16 ip4_len0;
+ u8 error0;
+ i32 dst_port0;
+ ip4_mapt_pseudo_header_t *pheader0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+ error0 = MAP_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip40 = vlib_buffer_get_current(p0);
+ ip4_len0 = clib_host_to_net_u16(ip40->length);
+ if (PREDICT_FALSE(p0->current_length < ip4_len0 ||
+ ip40->ip_version_and_header_length != 0x45)) {
+ error0 = MAP_ERROR_UNKNOWN;
+ next0 = IP4_MAPT_NEXT_DROP;
+ }
+
+ d0 = ip4_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX],
+ &vnet_buffer(p0)->map_t.map_domain_index);
+
+ vnet_buffer(p0)->map_t.mtu = d0->mtu ? d0->mtu : ~0;
+
+ dst_port0 = -1;
+ ip4_map_t_classify(p0, d0, ip40, ip4_len0, &dst_port0, &error0, &next0);
+
+ //Add MAP-T pseudo header in front of the packet
+ vlib_buffer_advance(p0, - sizeof(*pheader0));
+ pheader0 = vlib_buffer_get_current(p0);
+
+ //Save addresses within the packet
+ ip4_map_t_embedded_address(d0, &pheader0->saddr, &ip40->src_address);
+ pheader0->daddr.as_u64[0] = map_get_pfx_net(d0, ip40->dst_address.as_u32, (u16)dst_port0);
+ pheader0->daddr.as_u64[1] = map_get_sfx_net(d0, ip40->dst_address.as_u32, (u16)dst_port0);
+
+ //It is important to cache at this stage because the result might be needed
+ //by other packets within the same vector.
+ //This approach even provides some limited out-of-order fragment support.
+ if (PREDICT_FALSE(ip4_is_first_fragment(ip40) && (dst_port0 != -1) &&
+ (d0->ea_bits_len != 0 || !d0->rules) &&
+ ip4_map_fragment_cache(ip40, dst_port0))) {
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ }
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_TX, cpu_index,
+ vnet_buffer(p0)->map_t.map_domain_index, 1,
+ clib_net_to_host_u16(ip40->length));
+ }
+
+ next0 = (error0 != MAP_ERROR_NONE) ? IP4_MAPT_NEXT_DROP : next0;
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+static char *map_t_error_strings[] = {
+#define _(sym,string) string,
+ foreach_map_error
+#undef _
+};
+
+VLIB_REGISTER_NODE(ip4_map_t_fragmented_node) = {
+ .function = ip4_map_t_fragmented,
+ .name = "ip4-map-t-fragmented",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP4_MAPT_FRAGMENTED_N_NEXT,
+ .next_nodes = {
+ [IP4_MAPT_FRAGMENTED_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP4_MAPT_FRAGMENTED_NEXT_IP6_FRAG] = IP6_FRAG_NODE_NAME,
+ [IP4_MAPT_FRAGMENTED_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip4_map_t_icmp_node) = {
+ .function = ip4_map_t_icmp,
+ .name = "ip4-map-t-icmp",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP4_MAPT_ICMP_N_NEXT,
+ .next_nodes = {
+ [IP4_MAPT_ICMP_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP4_MAPT_ICMP_NEXT_IP6_FRAG] = IP6_FRAG_NODE_NAME,
+ [IP4_MAPT_ICMP_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip4_map_t_tcp_udp_node) = {
+ .function = ip4_map_t_tcp_udp,
+ .name = "ip4-map-t-tcp-udp",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP4_MAPT_TCP_UDP_N_NEXT,
+ .next_nodes = {
+ [IP4_MAPT_TCP_UDP_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP4_MAPT_TCP_UDP_NEXT_IP6_FRAG] = IP6_FRAG_NODE_NAME,
+ [IP4_MAPT_TCP_UDP_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip4_map_t_node) = {
+ .function = ip4_map_t,
+ .name = "ip4-map-t",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP4_MAPT_N_NEXT,
+ .next_nodes = {
+ [IP4_MAPT_NEXT_MAPT_TCP_UDP] = "ip4-map-t-tcp-udp",
+ [IP4_MAPT_NEXT_MAPT_ICMP] = "ip4-map-t-icmp",
+ [IP4_MAPT_NEXT_MAPT_FRAGMENTED] = "ip4-map-t-fragmented",
+ [IP4_MAPT_NEXT_DROP] = "error-drop",
+ },
+};
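+
+/* Editor's note: sketch of the node graph resulting from the
+ * registrations above:
+ *
+ *   ip4-map-t --> { ip4-map-t-tcp-udp | ip4-map-t-icmp | ip4-map-t-fragmented }
+ *             --> { ip6-lookup | ip6-frag (packet > domain MTU) | error-drop }
+ *
+ * ip4-map-t classifies the packet and prepends an ip4_mapt_pseudo_header_t
+ * carrying the translated IPv6 addresses; each per-protocol node then
+ * consumes the pseudo header and performs the actual header translation.
+ */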
diff --git a/vnet/vnet/map/ip4_sixrd.c b/vnet/vnet/map/ip4_sixrd.c
new file mode 100644
index 00000000000..1e83ce831e0
--- /dev/null
+++ b/vnet/vnet/map/ip4_sixrd.c
@@ -0,0 +1,127 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#include "sixrd.h"
+
+vlib_node_registration_t ip4_sixrd_node;
+
+typedef enum {
+ IP4_SIXRD_NEXT_IP6_LOOKUP,
+ IP4_SIXRD_NEXT_DROP,
+ IP4_SIXRD_N_NEXT,
+} ip4_sixrd_next_t;
+
+/*
+ * ip4_sixrd_sec_check
+ */
+static_always_inline void
+ip4_sixrd_sec_check (sixrd_domain_t *d, ip4_address_t sa4, ip6_address_t sa6, u8 *error)
+{
+ u32 a = sixrd_get_addr(d, sa6.as_u64[0]);
+ clib_warning("Security check: %U %U", format_ip4_address, &a, format_ip4_address, &sa4);
+ if (PREDICT_FALSE(sixrd_get_addr(d, sa6.as_u64[0]) != sa4.as_u32))
+ *error = SIXRD_ERROR_SEC_CHECK;
+}
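+
+/* Editor's note (illustration with hypothetical values): with a 6rd prefix
+ * of 2001:db8::/32, the IPv4 source is embedded right after the prefix, so
+ * traffic from 192.0.2.1 (0xc0000201) must arrive with an IPv6 source of
+ * 2001:db8:c000:201::. sixrd_get_addr() extracts those embedded 32 bits
+ * from the first u64 of the IPv6 source, and the check drops any packet
+ * whose outer IPv4 source does not match them.
+ */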
+
+/*
+ * ip4_sixrd
+ */
+static uword
+ip4_sixrd (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip4_sixrd_node.index);
+ u32 decap = 0;
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ /* Single loop */
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ u8 error0 = SIXRD_ERROR_NONE;
+ sixrd_domain_t *d0 = 0;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ u32 sixrd_domain_index0 = ~0;
+ u32 next0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip40 = vlib_buffer_get_current(p0);
+
+ /* Throw away anything that isn't IP in IP. */
+ if (PREDICT_TRUE(ip40->protocol == IP_PROTOCOL_IPV6 && clib_net_to_host_u16(ip40->length) >= 60)) {
+ vlib_buffer_advance(p0, sizeof(ip4_header_t));
+ ip60 = vlib_buffer_get_current(p0);
+ d0 = ip4_sixrd_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], (ip6_address_t *)&ip60->src_address,
+ &sixrd_domain_index0, &error0);
+ } else {
+ error0 = SIXRD_ERROR_BAD_PROTOCOL;
+ }
+ if (d0) {
+ /* SIXRD inbound security check */
+ ip4_sixrd_sec_check(d0, ip40->src_address, ip60->src_address, &error0);
+ }
+
+ next0 = error0 == SIXRD_ERROR_NONE ? IP4_SIXRD_NEXT_IP6_LOOKUP : IP4_SIXRD_NEXT_DROP;
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ sixrd_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->sixrd_domain_index = sixrd_domain_index0;
+ }
+
+ p0->error = error_node->errors[error0];
+ if (PREDICT_TRUE(error0 == SIXRD_ERROR_NONE)) decap++;
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter(vm, ip4_sixrd_node.index, SIXRD_ERROR_DECAPSULATED, decap);
+
+ return frame->n_vectors;
+}
+
+static char *sixrd_error_strings[] = {
+#define _(sym,string) string,
+ foreach_sixrd_error
+#undef _
+};
+
+VLIB_REGISTER_NODE(ip4_sixrd_node) = {
+ .function = ip4_sixrd,
+ .name = "ip4-sixrd",
+ .vector_size = sizeof(u32),
+ .format_trace = format_sixrd_trace,
+ .n_errors = SIXRD_N_ERROR,
+ .error_strings = sixrd_error_strings,
+ .n_next_nodes = IP4_SIXRD_N_NEXT,
+ .next_nodes = {
+ [IP4_SIXRD_NEXT_IP6_LOOKUP] = "ip6-lookup",
+ [IP4_SIXRD_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/map/ip6_map.c b/vnet/vnet/map/ip6_map.c
new file mode 100644
index 00000000000..e803af9007a
--- /dev/null
+++ b/vnet/vnet/map/ip6_map.c
@@ -0,0 +1,966 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "map.h"
+
+#include "../ip/ip_frag.h"
+
+enum ip6_map_next_e {
+ IP6_MAP_NEXT_IP4_LOOKUP,
+#ifdef MAP_SKIP_IP6_LOOKUP
+ IP6_MAP_NEXT_IP4_REWRITE,
+#endif
+ IP6_MAP_NEXT_IP6_REASS,
+ IP6_MAP_NEXT_IP4_REASS,
+ IP6_MAP_NEXT_IP4_FRAGMENT,
+ IP6_MAP_NEXT_IP6_ICMP_RELAY,
+ IP6_MAP_NEXT_IP6_LOCAL,
+ IP6_MAP_NEXT_DROP,
+ IP6_MAP_N_NEXT,
+};
+
+enum ip6_map_ip6_reass_next_e {
+ IP6_MAP_IP6_REASS_NEXT_IP6_MAP,
+ IP6_MAP_IP6_REASS_NEXT_DROP,
+ IP6_MAP_IP6_REASS_N_NEXT,
+};
+
+enum ip6_map_ip4_reass_next_e {
+ IP6_MAP_IP4_REASS_NEXT_IP4_LOOKUP,
+ IP6_MAP_IP4_REASS_NEXT_IP4_FRAGMENT,
+ IP6_MAP_IP4_REASS_NEXT_DROP,
+ IP6_MAP_IP4_REASS_N_NEXT,
+};
+
+enum ip6_icmp_relay_next_e {
+ IP6_ICMP_RELAY_NEXT_IP4_LOOKUP,
+ IP6_ICMP_RELAY_NEXT_DROP,
+ IP6_ICMP_RELAY_N_NEXT,
+};
+
+vlib_node_registration_t ip6_map_ip4_reass_node;
+vlib_node_registration_t ip6_map_ip6_reass_node;
+static vlib_node_registration_t ip6_map_icmp_relay_node;
+
+typedef struct {
+ u32 map_domain_index;
+ u16 port;
+ u8 cached;
+} map_ip6_map_ip4_reass_trace_t;
+
+u8 *
+format_ip6_map_ip4_reass_trace (u8 *s, va_list *args)
+{
+ CLIB_UNUSED(vlib_main_t *vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED(vlib_node_t *node) = va_arg (*args, vlib_node_t *);
+ map_ip6_map_ip4_reass_trace_t *t = va_arg (*args, map_ip6_map_ip4_reass_trace_t *);
+ return format(s, "MAP domain index: %d L4 port: %u Status: %s", t->map_domain_index,
+ t->port, t->cached?"cached":"forwarded");
+}
+
+typedef struct {
+ u16 offset;
+ u16 frag_len;
+ u8 out;
+} map_ip6_map_ip6_reass_trace_t;
+
+u8 *
+format_ip6_map_ip6_reass_trace (u8 *s, va_list *args)
+{
+ CLIB_UNUSED(vlib_main_t *vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED(vlib_node_t *node) = va_arg (*args, vlib_node_t *);
+ map_ip6_map_ip6_reass_trace_t *t = va_arg (*args, map_ip6_map_ip6_reass_trace_t *);
+ return format(s, "Offset: %d Fragment length: %d Status: %s", t->offset, t->frag_len, t->out?"out":"in");
+}
+
+/*
+ * ip6_map_sec_check
+ */
+static_always_inline bool
+ip6_map_sec_check (map_domain_t *d, u16 port, ip4_header_t *ip4, ip6_header_t *ip6)
+{
+ u16 sp4 = clib_net_to_host_u16(port);
+ u32 sa4 = clib_net_to_host_u32(ip4->src_address.as_u32);
+ u64 sal6 = map_get_pfx(d, sa4, sp4);
+ u64 sar6 = map_get_sfx(d, sa4, sp4);
+
+ if (PREDICT_FALSE(sal6 != clib_net_to_host_u64(ip6->src_address.as_u64[0]) ||
+ sar6 != clib_net_to_host_u64(ip6->src_address.as_u64[1])))
+ return (false);
+ return (true);
+}
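+
+/* Editor's note: map_get_pfx()/map_get_sfx() rebuild the IPv6 address a CE
+ * is required to use for a given (IPv4 source, source port) pair, embedding
+ * the EA bits and PSID per the MAP algorithm. The packet passes only if
+ * both 64-bit halves of its actual IPv6 source match the rebuilt values,
+ * which prevents a CE from spoofing another subscriber's IPv4 address or
+ * port set inside the softwire.
+ */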
+
+static_always_inline void
+ip6_map_security_check (map_domain_t *d, ip4_header_t *ip4, ip6_header_t *ip6, u32 *next, u8 *error)
+{
+ map_main_t *mm = &map_main;
+ if (d->ea_bits_len || d->rules) {
+ if (d->psid_length > 0) {
+ if (!ip4_is_fragment(ip4)) {
+ u16 port = ip4_map_get_port(ip4, MAP_SENDER);
+ if (port) {
+ if (mm->sec_check)
+ *error = ip6_map_sec_check(d, port, ip4, ip6) ? MAP_ERROR_NONE : MAP_ERROR_DECAP_SEC_CHECK;
+ } else {
+ *error = MAP_ERROR_BAD_PROTOCOL;
+ }
+ } else {
+ *next = mm->sec_check_frag ? IP6_MAP_NEXT_IP4_REASS : *next;
+ }
+ }
+ }
+}
+
+static_always_inline bool
+ip6_map_ip4_lookup_bypass (vlib_buffer_t *p0, ip4_header_t *ip)
+{
+#ifdef MAP_SKIP_IP6_LOOKUP
+ map_main_t *mm = &map_main;
+ u32 adj_index0 = mm->adj4_index;
+ if (adj_index0 > 0) {
+ ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+ ip_adjacency_t *adj = ip_get_adjacency(lm4, mm->adj4_index);
+ if (adj->n_adj > 1) {
+ u32 hash_c0 = ip4_compute_flow_hash(ip, IP_FLOW_HASH_DEFAULT);
+ adj_index0 += (hash_c0 & (adj->n_adj - 1));
+ }
+ vnet_buffer(p0)->ip.adj_index[VLIB_TX] = adj_index0;
+ return (true);
+ }
+#endif
+ return (false);
+}
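+
+/* Editor's note: with MAP_SKIP_IP6_LOOKUP enabled and a pre-resolved
+ * adjacency configured, the bypass above skips the ip4-lookup node
+ * entirely. For multipath adjacencies (n_adj > 1) it selects a member with
+ * ip4_compute_flow_hash(); the "& (adj->n_adj - 1)" mask assumes n_adj is
+ * a power of two, which appears to hold for the adjacency groups used here.
+ */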
+
+
+/*
+ * ip6_map
+ */
+static uword
+ip6_map (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_map_node.index);
+ map_main_t *mm = &map_main;
+ vlib_combined_counter_main_t *cm = mm->domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ /* Dual loop */
+ while (n_left_from > 4 && n_left_to_next > 2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ u8 error0 = MAP_ERROR_NONE;
+ u8 error1 = MAP_ERROR_NONE;
+ map_domain_t *d0 = 0, *d1 = 0;
+ ip4_header_t *ip40, *ip41;
+ ip6_header_t *ip60, *ip61;
+ u16 port0 = 0, port1 = 0;
+ u32 map_domain_index0 = ~0, map_domain_index1 = ~0;
+ u32 next0 = IP6_MAP_NEXT_IP4_LOOKUP;
+ u32 next1 = IP6_MAP_NEXT_IP4_LOOKUP;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t *p2, *p3;
+
+ p2 = vlib_get_buffer(vm, from[2]);
+ p3 = vlib_get_buffer(vm, from[3]);
+
+ vlib_prefetch_buffer_header(p2, LOAD);
+ vlib_prefetch_buffer_header(p3, LOAD);
+
+ /* IPv6 + IPv4 header + 8 bytes of ULP */
+ CLIB_PREFETCH(p2->data, 68, LOAD);
+ CLIB_PREFETCH(p3->data, 68, LOAD);
+ }
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next +=2;
+ n_left_to_next -= 2;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+ ip60 = vlib_buffer_get_current(p0);
+ ip61 = vlib_buffer_get_current(p1);
+ vlib_buffer_advance(p0, sizeof(ip6_header_t));
+ vlib_buffer_advance(p1, sizeof(ip6_header_t));
+ ip40 = vlib_buffer_get_current(p0);
+ ip41 = vlib_buffer_get_current(p1);
+
+ /*
+ * Encapsulated IPv4 packet
+ * - IPv4 fragmented -> Pass to virtual reassembly unless security check disabled
+ * - Lookup/Rewrite or Fragment node in case of packet > MTU
+ * Fragmented IPv6 packet
+ * ICMP IPv6 packet
+ * - Error -> Pass to ICMPv6/ICMPv4 relay
+ * - Info -> Pass to IPv6 local
+ * Anything else -> drop
+ */
+ if (PREDICT_TRUE(ip60->protocol == IP_PROTOCOL_IP_IN_IP && clib_net_to_host_u16(ip60->payload_length) > 20)) {
+ d0 = ip6_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], (ip4_address_t *)&ip40->src_address.as_u32,
+ &map_domain_index0, &error0);
+ } else if (ip60->protocol == IP_PROTOCOL_ICMP6 &&
+ clib_net_to_host_u16(ip60->payload_length) > sizeof(icmp46_header_t)) {
+ icmp46_header_t *icmp = (void *)(ip60 + 1);
+ next0 = (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply) ?
+ IP6_MAP_NEXT_IP6_LOCAL : IP6_MAP_NEXT_IP6_ICMP_RELAY;
+ } else if (ip60->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION) {
+ next0 = IP6_MAP_NEXT_IP6_REASS;
+ } else {
+ error0 = MAP_ERROR_BAD_PROTOCOL;
+ next0 = IP6_MAP_NEXT_DROP;
+ }
+ if (PREDICT_TRUE(ip61->protocol == IP_PROTOCOL_IP_IN_IP && clib_net_to_host_u16(ip61->payload_length) > 20)) {
+ d1 = ip6_map_get_domain(vnet_buffer(p1)->ip.adj_index[VLIB_TX], (ip4_address_t *)&ip41->src_address.as_u32,
+ &map_domain_index1, &error1);
+ } else if (ip61->protocol == IP_PROTOCOL_ICMP6 &&
+ clib_net_to_host_u16(ip61->payload_length) > sizeof(icmp46_header_t)) {
+ icmp46_header_t *icmp = (void *)(ip61 + 1);
+ next1 = (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply) ?
+ IP6_MAP_NEXT_IP6_LOCAL : IP6_MAP_NEXT_IP6_ICMP_RELAY;
+ } else if (ip61->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION) {
+ next1 = IP6_MAP_NEXT_IP6_REASS;
+ } else {
+ error1 = MAP_ERROR_BAD_PROTOCOL;
+ next1 = IP6_MAP_NEXT_DROP;
+ }
+
+ if (d0) {
+ /* MAP inbound security check */
+ ip6_map_security_check(d0, ip40, ip60, &next0, &error0);
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE &&
+ next0 == IP6_MAP_NEXT_IP4_LOOKUP)) {
+ if (PREDICT_FALSE(d0->mtu && (clib_host_to_net_u16(ip40->length) > d0->mtu))) {
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.flags = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
+ next0 = IP6_MAP_NEXT_IP4_FRAGMENT;
+ } else {
+ next0 = ip6_map_ip4_lookup_bypass(p0, ip40) ? IP6_MAP_NEXT_IP4_REWRITE : next0;
+ }
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index, map_domain_index0, 1,
+ clib_net_to_host_u16(ip40->length));
+ }
+ }
+ if (d1) {
+ /* MAP inbound security check */
+ ip6_map_security_check(d1, ip41, ip61, &next1, &error1);
+
+ if (PREDICT_TRUE(error1 == MAP_ERROR_NONE &&
+ next1 == IP6_MAP_NEXT_IP4_LOOKUP)) {
+ if (PREDICT_FALSE(d1->mtu && (clib_host_to_net_u16(ip41->length) > d1->mtu))) {
+ vnet_buffer(p1)->ip_frag.header_offset = 0;
+ vnet_buffer(p1)->ip_frag.flags = 0;
+ vnet_buffer(p1)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ vnet_buffer(p1)->ip_frag.mtu = d1->mtu;
+ next1 = IP6_MAP_NEXT_IP4_FRAGMENT;
+ } else {
+ next1 = ip6_map_ip4_lookup_bypass(p1, ip41) ? IP6_MAP_NEXT_IP4_REWRITE : next1;
+ }
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index, map_domain_index1, 1,
+ clib_net_to_host_u16(ip41->length));
+ }
+ }
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = map_domain_index0;
+ tr->port = port0;
+ }
+
+ if (PREDICT_FALSE(p1->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p1, sizeof(*tr));
+ tr->map_domain_index = map_domain_index1;
+ tr->port = port1;
+ }
+
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, n_left_to_next, pi0, pi1, next0, next1);
+ }
+
+ /* Single loop */
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ u8 error0 = MAP_ERROR_NONE;
+ map_domain_t *d0 = 0;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ i32 port0 = 0;
+ u32 map_domain_index0 = ~0;
+ u32 next0 = IP6_MAP_NEXT_IP4_LOOKUP;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ vlib_buffer_advance(p0, sizeof(ip6_header_t));
+ ip40 = vlib_buffer_get_current(p0);
+
+ /*
+ * Encapsulated IPv4 packet
+ * - IPv4 fragmented -> Pass to virtual reassembly unless security check disabled
+ * - Lookup/Rewrite or Fragment node in case of packet > MTU
+ * Fragmented IPv6 packet
+ * ICMP IPv6 packet
+ * - Error -> Pass to ICMPv6/ICMPv4 relay
+ * - Info -> Pass to IPv6 local
+ * Anything else -> drop
+ */
+ if (PREDICT_TRUE(ip60->protocol == IP_PROTOCOL_IP_IN_IP && clib_net_to_host_u16(ip60->payload_length) > 20)) {
+ d0 = ip6_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], (ip4_address_t *)&ip40->src_address.as_u32,
+ &map_domain_index0, &error0);
+ } else if (ip60->protocol == IP_PROTOCOL_ICMP6 &&
+ clib_net_to_host_u16(ip60->payload_length) > sizeof(icmp46_header_t)) {
+ icmp46_header_t *icmp = (void *)(ip60 + 1);
+ next0 = (icmp->type == ICMP6_echo_request || icmp->type == ICMP6_echo_reply) ?
+ IP6_MAP_NEXT_IP6_LOCAL : IP6_MAP_NEXT_IP6_ICMP_RELAY;
+ } else if (ip60->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION &&
+ (((ip6_frag_hdr_t *)(ip60+1))->next_hdr == IP_PROTOCOL_IP_IN_IP)) {
+ next0 = IP6_MAP_NEXT_IP6_REASS;
+ } else {
+ error0 = MAP_ERROR_BAD_PROTOCOL;
+ }
+
+ if (d0) {
+ /* MAP inbound security check */
+ ip6_map_security_check(d0, ip40, ip60, &next0, &error0);
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE &&
+ next0 == IP6_MAP_NEXT_IP4_LOOKUP)) {
+ if (PREDICT_FALSE(d0->mtu && (clib_host_to_net_u16(ip40->length) > d0->mtu))) {
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.flags = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
+ next0 = IP6_MAP_NEXT_IP4_FRAGMENT;
+ } else {
+ next0 = ip6_map_ip4_lookup_bypass(p0, ip40) ? IP6_MAP_NEXT_IP4_REWRITE : next0;
+ }
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index, map_domain_index0, 1,
+ clib_net_to_host_u16(ip40->length));
+ }
+ }
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = map_domain_index0;
+ tr->port = (u16)port0;
+ }
+
+ next0 = (error0 == MAP_ERROR_NONE) ? next0 : IP6_MAP_NEXT_DROP;
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+
+static_always_inline void
+ip6_map_ip6_reass_prepare(vlib_main_t *vm, vlib_node_runtime_t *node, map_ip6_reass_t *r,
+ u32 **fragments_ready, u32 **fragments_to_drop)
+{
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ ip6_frag_hdr_t *frag0;
+ vlib_buffer_t *p0;
+
+ if(!r->ip4_header.ip_version_and_header_length)
+ return;
+
+ //The IPv4 header is available; check for fragments
+ //that can now be forwarded
+ int i;
+ for (i=0; i<MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++) {
+ if (r->fragments[i].pi == ~0 ||
+ ((!r->fragments[i].next_data_len) && (r->fragments[i].next_data_offset != (0xffff))))
+ continue;
+
+ p0 = vlib_get_buffer(vm, r->fragments[i].pi);
+ ip60 = vlib_buffer_get_current(p0);
+ frag0 = (ip6_frag_hdr_t *)(ip60 + 1);
+ ip40 = (ip4_header_t *)(frag0 + 1);
+
+ if (ip6_frag_hdr_offset(frag0)) {
+ //Not first fragment, add the IPv4 header
+ memcpy(ip40, &r->ip4_header, 20);
+ }
+
+#ifdef MAP_IP6_REASS_COUNT_BYTES
+ r->forwarded += clib_net_to_host_u16(ip60->payload_length) - sizeof(*frag0);
+#endif
+
+ if (ip6_frag_hdr_more(frag0)) {
+ //Not the last fragment: append the saved first 20 bytes of the next fragment
+ memcpy(u8_ptr_add(ip60, p0->current_length), r->fragments[i].next_data, 20);
+ p0->current_length += 20;
+ ip60->payload_length = u16_net_add(ip60->payload_length, 20);
+ }
+
+ if (!ip4_is_fragment(ip40)) {
+ ip40->fragment_id = frag_id_6to4(frag0->identification);
+ ip40->flags_and_fragment_offset = clib_host_to_net_u16(ip6_frag_hdr_offset(frag0));
+ } else {
+ ip40->flags_and_fragment_offset = clib_host_to_net_u16(ip4_get_fragment_offset(ip40) + ip6_frag_hdr_offset(frag0));
+ }
+
+ if (ip6_frag_hdr_more(frag0))
+ ip40->flags_and_fragment_offset |= clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS);
+
+ ip40->length = clib_host_to_net_u16(p0->current_length - sizeof(*ip60) - sizeof(*frag0));
+ ip40->checksum = ip4_header_checksum(ip40);
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_ip6_map_ip6_reass_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->offset = ip4_get_fragment_offset(ip40);
+ tr->frag_len = clib_net_to_host_u16(ip40->length) - sizeof(*ip40);
+ tr->out = 1;
+ }
+
+ vec_add1(*fragments_ready, r->fragments[i].pi);
+ r->fragments[i].pi = ~0;
+ r->fragments[i].next_data_len = 0;
+ r->fragments[i].next_data_offset = 0;
+ map_main.ip6_reass_buffered_counter--;
+
+ //TODO: The best solution would be for ip6_map to handle extension headers
+ // and ignore atomic fragments. In the meantime, just copy the header.
+
+ u8 protocol = frag0->next_hdr;
+ memmove(u8_ptr_add(ip40, - sizeof(*ip60)), ip60, sizeof(*ip60));
+ ((ip6_header_t *)u8_ptr_add(ip40, - sizeof(*ip60)))->protocol = protocol;
+ vlib_buffer_advance(p0, sizeof(*frag0));
+ }
+}
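+
+/* Editor's note on the 20-byte moves above: each IPv6 fragment carries a
+ * slice of the encapsulated IPv4 packet (header + payload). To forward
+ * every fragment as a standalone IPv4 fragment, a 20-byte IPv4 header is
+ * written over the first 20 bytes of each non-first fragment's data, and
+ * the bytes lost that way are recovered by appending the saved first 20
+ * bytes of the *next* fragment (next_data). Offsets and payload boundaries
+ * then line up without ever reassembling the whole datagram.
+ */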
+
+void
+map_ip6_drop_pi(u32 pi)
+{
+ vlib_main_t *vm = vlib_get_main();
+ vlib_node_runtime_t *n = vlib_node_get_runtime(vm, ip6_map_ip6_reass_node.index);
+ vlib_set_next_frame_buffer(vm, n, IP6_MAP_IP6_REASS_NEXT_DROP, pi);
+}
+
+void
+map_ip4_drop_pi(u32 pi)
+{
+ vlib_main_t *vm = vlib_get_main();
+ vlib_node_runtime_t *n = vlib_node_get_runtime(vm, ip6_map_ip4_reass_node.index);
+ vlib_set_next_frame_buffer(vm, n, IP6_MAP_IP4_REASS_NEXT_DROP, pi);
+}
+
+/*
+ * ip6_reass
+ * TODO: We should count the number of successfully
+ * transmitted fragment bytes and compare that to the last fragment
+ * offset such that we can free the reassembly structure when all fragments
+ * have been forwarded.
+ */
+static uword
+ip6_map_ip6_reass (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_map_ip6_reass_node.index);
+ u32 *fragments_to_drop = NULL;
+ u32 *fragments_ready = NULL;
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ /* Single loop */
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ u8 error0 = MAP_ERROR_NONE;
+ ip6_header_t *ip60;
+ ip6_frag_hdr_t *frag0;
+ u16 offset;
+ u16 next_offset;
+ u16 frag_len;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ frag0 = (ip6_frag_hdr_t *)(ip60 + 1);
+ offset = clib_host_to_net_u16(frag0->fragment_offset_and_more) & (~7);
+ frag_len = clib_net_to_host_u16(ip60->payload_length) - sizeof(*frag0);
+ next_offset = ip6_frag_hdr_more(frag0) ? (offset + frag_len) : (0xffff);
+
+ //FIXME: Support other extension headers, maybe
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_ip6_map_ip6_reass_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->offset = offset;
+ tr->frag_len = frag_len;
+ tr->out = 0;
+ }
+
+ map_ip6_reass_lock();
+ map_ip6_reass_t *r = map_ip6_reass_get(&ip60->src_address, &ip60->dst_address,
+ frag0->identification, frag0->next_hdr, &fragments_to_drop);
+ //FIXME: Use better error codes
+ if (PREDICT_FALSE(!r)) {
+ // Could not create a caching entry
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ } else if (PREDICT_FALSE((frag_len <= 20 &&
+ (ip6_frag_hdr_more(frag0) || (!offset))))) {
+ //Very small fragments are only allowed in the last position and
+ //can't be the first one
+ error0 = MAP_ERROR_FRAGMENT_MALFORMED;
+ } else if (map_ip6_reass_add_fragment(r, pi0, offset, next_offset, (u8 *)(frag0 + 1), frag_len)) {
+ map_ip6_reass_free(r, &fragments_to_drop);
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ } else {
+#ifdef MAP_IP6_REASS_COUNT_BYTES
+ if (!ip6_frag_hdr_more(frag0))
+ r->expected_total = offset + frag_len;
+#endif
+ ip6_map_ip6_reass_prepare(vm, node, r, &fragments_ready, &fragments_to_drop);
+#ifdef MAP_IP6_REASS_COUNT_BYTES
+ if(r->forwarded >= r->expected_total)
+ map_ip6_reass_free(r, &fragments_to_drop);
+#endif
+ }
+ map_ip6_reass_unlock();
+
+ if (error0 == MAP_ERROR_NONE) {
+ if (frag_len > 20) {
+ //Dequeue the packet
+ n_left_to_next++;
+ to_next--;
+ } else {
+ //All data from this packet was copied; no need to keep it. This is not an error
+ p0->error = error_node->errors[MAP_ERROR_NONE];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, IP6_MAP_IP6_REASS_NEXT_DROP);
+ }
+ } else {
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, IP6_MAP_IP6_REASS_NEXT_DROP);
+ }
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ map_send_all_to_node(vm, fragments_ready, node,
+ &error_node->errors[MAP_ERROR_NONE],
+ IP6_MAP_IP6_REASS_NEXT_IP6_MAP);
+ map_send_all_to_node(vm, fragments_to_drop, node,
+ &error_node->errors[MAP_ERROR_FRAGMENT_DROPPED],
+ IP6_MAP_IP6_REASS_NEXT_DROP);
+
+ vec_free(fragments_to_drop);
+ vec_free(fragments_ready);
+ return frame->n_vectors;
+}
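+
+/* Editor's note: the "n_left_to_next++; to_next--;" sequence above is the
+ * usual vlib idiom for un-enqueueing a buffer that was speculatively copied
+ * into the next frame: a cached fragment stays owned by the reassembly
+ * structure and is re-injected later through fragments_ready or
+ * fragments_to_drop instead of following the normal next-node path.
+ */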
+
+/*
+ * ip6_ip4_virt_reass
+ */
+static uword
+ip6_map_ip4_reass (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_map_ip4_reass_node.index);
+ map_main_t *mm = &map_main;
+ vlib_combined_counter_main_t *cm = mm->domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+ u32 *fragments_to_drop = NULL;
+ u32 *fragments_to_loopback = NULL;
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ /* Single loop */
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ u8 error0 = MAP_ERROR_NONE;
+ map_domain_t *d0;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ i32 port0 = 0;
+ u32 map_domain_index0;
+ u32 next0 = IP6_MAP_IP4_REASS_NEXT_IP4_LOOKUP;
+ u8 cached = 0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip40 = vlib_buffer_get_current(p0);
+ ip60 = ((ip6_header_t *)ip40) - 1;
+
+ d0 = ip6_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], (ip4_address_t *)&ip40->src_address.as_u32,
+ &map_domain_index0, &error0);
+
+ map_ip4_reass_lock();
+ //This node only deals with fragmented ip4
+ map_ip4_reass_t *r = map_ip4_reass_get(ip40->src_address.as_u32, ip40->dst_address.as_u32,
+ ip40->fragment_id, ip40->protocol, &fragments_to_drop);
+ if (PREDICT_FALSE(!r)) {
+ // Could not create a caching entry
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ } else if (PREDICT_TRUE(ip4_get_fragment_offset(ip40))) {
+ // This is a fragment
+ if (r->port >= 0) {
+ // We know the port already
+ port0 = r->port;
+ } else if (map_ip4_reass_add_fragment(r, pi0)) {
+ // Not enough space for caching
+ error0 = MAP_ERROR_FRAGMENT_MEMORY;
+ map_ip4_reass_free(r, &fragments_to_drop);
+ } else {
+ cached = 1;
+ }
+ } else if ((port0 = ip4_get_port(ip40, MAP_SENDER, p0->current_length)) < 0) {
+ // Could not find port from first fragment. Stop reassembling.
+ error0 = MAP_ERROR_BAD_PROTOCOL;
+ port0 = 0;
+ map_ip4_reass_free(r, &fragments_to_drop);
+ } else {
+ // Found port. Remember it and loopback saved fragments
+ r->port = port0;
+ map_ip4_reass_get_fragments(r, &fragments_to_loopback);
+ }
+
+#ifdef MAP_IP4_REASS_COUNT_BYTES
+ if (!cached && r) {
+ r->forwarded += clib_host_to_net_u16(ip40->length) - 20;
+ if (!ip4_get_fragment_more(ip40))
+ r->expected_total = ip4_get_fragment_offset(ip40) * 8 + clib_host_to_net_u16(ip40->length) - 20;
+ if(r->forwarded >= r->expected_total)
+ map_ip4_reass_free(r, &fragments_to_drop);
+ }
+#endif
+
+ map_ip4_reass_unlock();
+
+ if(PREDICT_TRUE(error0 == MAP_ERROR_NONE))
+ error0 = ip6_map_sec_check(d0, port0, ip40, ip60) ? MAP_ERROR_NONE : MAP_ERROR_DECAP_SEC_CHECK;
+
+ if (PREDICT_FALSE(d0->mtu && (clib_host_to_net_u16(ip40->length) > d0->mtu) &&
+ error0 == MAP_ERROR_NONE && !cached)) {
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.flags = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ vnet_buffer(p0)->ip_frag.mtu = d0->mtu;
+ next0 = IP6_MAP_IP4_REASS_NEXT_IP4_FRAGMENT;
+ }
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_ip6_map_ip4_reass_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = map_domain_index0;
+ tr->port = port0;
+ tr->cached = cached;
+ }
+
+ if (cached) {
+ //Dequeue the packet
+ n_left_to_next++;
+ to_next--;
+ } else {
+ if (error0 == MAP_ERROR_NONE)
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index, map_domain_index0, 1,
+ clib_net_to_host_u16(ip40->length));
+ next0 = (error0 == MAP_ERROR_NONE) ? next0 : IP6_MAP_IP4_REASS_NEXT_DROP;
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+ }
+
+ //Loop back when we reach the end of the input vector
+ if(n_left_from == 0 && vec_len(fragments_to_loopback)) {
+ from = vlib_frame_vector_args(frame);
+ u32 len = vec_len(fragments_to_loopback);
+ if(len <= VLIB_FRAME_SIZE) {
+ memcpy(from, fragments_to_loopback, sizeof(u32)*len);
+ n_left_from = len;
+ vec_reset_length(fragments_to_loopback);
+ } else {
+ memcpy(from, fragments_to_loopback + (len - VLIB_FRAME_SIZE), sizeof(u32)*VLIB_FRAME_SIZE);
+ n_left_from = VLIB_FRAME_SIZE;
+ _vec_len(fragments_to_loopback) = len - VLIB_FRAME_SIZE;
+ }
+ }
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ map_send_all_to_node(vm, fragments_to_drop, node,
+ &error_node->errors[MAP_ERROR_FRAGMENT_DROPPED],
+ IP6_MAP_IP4_REASS_NEXT_DROP);
+
+ vec_free(fragments_to_drop);
+ vec_free(fragments_to_loopback);
+ return frame->n_vectors;
+}
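+
+/* Editor's note: the loopback block above re-feeds cached fragments through
+ * this same processing loop once the port is known, by rewriting the
+ * frame's input vector in place. Since at most VLIB_FRAME_SIZE entries fit,
+ * any excess stays in fragments_to_loopback for a later pass; this is also
+ * what gives the node its limited out-of-order fragment support.
+ */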
+
+/*
+ * ip6_icmp_relay
+ */
+static uword
+ip6_map_icmp_relay (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_map_icmp_relay_node.index);
+ map_main_t *mm = &map_main;
+ u32 cpu_index = os_get_cpu_number();
+ u16 *fragment_ids, *fid;
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ /* Get random fragment IDs for replies. */
+ fid = fragment_ids = clib_random_buffer_get_data (&vm->random_buffer, n_left_from * sizeof (fragment_ids[0]));
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ /* Single loop */
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ u8 error0 = MAP_ERROR_NONE;
+ ip6_header_t *ip60;
+ u32 next0 = IP6_ICMP_RELAY_NEXT_IP4_LOOKUP;
+ u32 mtu;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next +=1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ u16 tlen = clib_net_to_host_u16(ip60->payload_length);
+
+ /*
+ * In:
+ * IPv6 header (40)
+ * ICMPv6 header (8)
+ * IPv6 header (40)
+ * Original IPv4 header / packet
+ * Out:
+ * New IPv4 header
+ * New ICMP header
+ * Original IPv4 header / packet
+ */
+
+ /* Need at least ICMP(8) + IPv6(40) + IPv4(20) + L4 header(8) */
+ if (tlen < 76) {
+ error0 = MAP_ERROR_ICMP_RELAY;
+ goto error;
+ }
+
+ icmp46_header_t *icmp60 = (icmp46_header_t *)(ip60 + 1);
+ ip6_header_t *inner_ip60 = (ip6_header_t *)(icmp60 + 2);
+
+ if (inner_ip60->protocol != IP_PROTOCOL_IP_IN_IP) {
+ error0 = MAP_ERROR_ICMP_RELAY;
+ goto error;
+ }
+
+ ip4_header_t *inner_ip40 = (ip4_header_t *)(inner_ip60 + 1);
+ vlib_buffer_advance(p0, 60); /* sizeof ( IPv6 + ICMP + IPv6 - IPv4 - ICMP ) */
+ ip4_header_t *new_ip40 = vlib_buffer_get_current(p0);
+ icmp46_header_t *new_icmp40 = (icmp46_header_t *)(new_ip40 + 1);
+
+ /*
+ * Relay according to RFC2473, section 8.3
+ */
+ switch (icmp60->type) {
+ case ICMP6_destination_unreachable:
+ case ICMP6_time_exceeded:
+ case ICMP6_parameter_problem:
+ /* Type 3 - destination unreachable, Code 1 - host unreachable */
+ new_icmp40->type = ICMP4_destination_unreachable;
+ new_icmp40->code = ICMP4_destination_unreachable_destination_unreachable_host;
+ break;
+
+ case ICMP6_packet_too_big:
+ /* Type 3 - destination unreachable, Code 4 - packet too big */
+ /* Potential TODO: Adjust domain tunnel MTU based on the value received here */
+ mtu = clib_net_to_host_u32(*((u32 *)(icmp60 + 1)));
+
+ /* Check DF flag */
+ if (!(inner_ip40->flags_and_fragment_offset & clib_host_to_net_u16(IP4_HEADER_FLAG_DONT_FRAGMENT))) {
+ error0 = MAP_ERROR_ICMP_RELAY;
+ goto error;
+ }
+
+ new_icmp40->type = ICMP4_destination_unreachable;
+ new_icmp40->code = ICMP4_destination_unreachable_fragmentation_needed_and_dont_fragment_set;
+ *((u32 *)(new_icmp40 + 1)) = clib_host_to_net_u32(mtu < 1280 ? 1280 : mtu);
+ break;
+
+ default:
+ error0 = MAP_ERROR_ICMP_RELAY;
+ break;
+ }
+
+ /*
+ * Ensure the total ICMP packet is no longer than 576 bytes (RFC1812)
+ */
+ new_ip40->ip_version_and_header_length = 0x45;
+ new_ip40->tos = 0;
+ u16 nlen = (tlen - 20) > 576 ? 576 : tlen - 20;
+ new_ip40->length = clib_host_to_net_u16(nlen);
+ new_ip40->fragment_id = fid[0]; fid++;
+ new_ip40->ttl = 64;
+ new_ip40->protocol = IP_PROTOCOL_ICMP;
+ new_ip40->src_address = mm->icmp_src_address;
+ new_ip40->dst_address = inner_ip40->src_address;
+ new_ip40->checksum = ip4_header_checksum(new_ip40);
+
+ new_icmp40->checksum = 0;
+ ip_csum_t sum = ip_incremental_checksum(0, new_icmp40, nlen - 20);
+ new_icmp40->checksum = ~ip_csum_fold(sum);
+
+ vlib_increment_simple_counter(&mm->icmp_relayed, cpu_index, 0, 1);
+
+ error:
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ map_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->map_domain_index = 0;
+ tr->port = 0;
+ }
+
+ next0 = (error0 == MAP_ERROR_NONE) ? next0 : IP6_ICMP_RELAY_NEXT_DROP;
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+
+}
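+
+/* Editor's note (derivation of the literals above): the relay rewrites the
+ * packet in place, so the buffer is advanced by the difference between the
+ * headers consumed and the headers produced:
+ *
+ *   in:  IPv6 (40) + ICMPv6 (8) + inner IPv6 (40) = 88 bytes
+ *   out: IPv4 (20) + ICMPv4 (8)                   = 28 bytes
+ *   advance = 88 - 28 = 60
+ *
+ * Similarly, the new IPv4 total length is 20 + 8 + (tlen - 8 - 40) =
+ * tlen - 20, capped at 576 bytes per RFC 1812.
+ */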
+
+static char *map_error_strings[] = {
+#define _(sym,string) string,
+ foreach_map_error
+#undef _
+};
+
+VLIB_REGISTER_NODE(ip6_map_node) = {
+ .function = ip6_map,
+ .name = "ip6-map",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_error_strings,
+
+ .n_next_nodes = IP6_MAP_N_NEXT,
+ .next_nodes = {
+ [IP6_MAP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+#ifdef MAP_SKIP_IP6_LOOKUP
+ [IP6_MAP_NEXT_IP4_REWRITE] = "ip4-rewrite-transit",
+#endif
+ [IP6_MAP_NEXT_IP6_REASS] = "ip6-map-ip6-reass",
+ [IP6_MAP_NEXT_IP4_REASS] = "ip6-map-ip4-reass",
+ [IP6_MAP_NEXT_IP4_FRAGMENT] = "ip4-frag",
+ [IP6_MAP_NEXT_IP6_ICMP_RELAY] = "ip6-map-icmp-relay",
+ [IP6_MAP_NEXT_IP6_LOCAL] = "ip6-local",
+ [IP6_MAP_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip6_map_ip6_reass_node) = {
+ .function = ip6_map_ip6_reass,
+ .name = "ip6-map-ip6-reass",
+ .vector_size = sizeof(u32),
+ .format_trace = format_ip6_map_ip6_reass_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_error_strings,
+ .n_next_nodes = IP6_MAP_IP6_REASS_N_NEXT,
+ .next_nodes = {
+ [IP6_MAP_IP6_REASS_NEXT_IP6_MAP] = "ip6-map",
+ [IP6_MAP_IP6_REASS_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip6_map_ip4_reass_node) = {
+ .function = ip6_map_ip4_reass,
+ .name = "ip6-map-ip4-reass",
+ .vector_size = sizeof(u32),
+ .format_trace = format_ip6_map_ip4_reass_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_error_strings,
+ .n_next_nodes = IP6_MAP_IP4_REASS_N_NEXT,
+ .next_nodes = {
+ [IP6_MAP_IP4_REASS_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_MAP_IP4_REASS_NEXT_IP4_FRAGMENT] = "ip4-frag",
+ [IP6_MAP_IP4_REASS_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip6_map_icmp_relay_node, static) = {
+ .function = ip6_map_icmp_relay,
+ .name = "ip6-map-icmp-relay",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace, //FIXME
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_error_strings,
+ .n_next_nodes = IP6_ICMP_RELAY_N_NEXT,
+ .next_nodes = {
+ [IP6_ICMP_RELAY_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_ICMP_RELAY_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/map/ip6_map_t.c b/vnet/vnet/map/ip6_map_t.c
new file mode 100644
index 00000000000..7720e06fba4
--- /dev/null
+++ b/vnet/vnet/map/ip6_map_t.c
@@ -0,0 +1,1141 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "map.h"
+
+#include "../ip/ip_frag.h"
+
+#define IP6_MAP_T_DUAL_LOOP
+
+typedef enum {
+ IP6_MAPT_NEXT_MAPT_TCP_UDP,
+ IP6_MAPT_NEXT_MAPT_ICMP,
+ IP6_MAPT_NEXT_MAPT_FRAGMENTED,
+ IP6_MAPT_NEXT_DROP,
+ IP6_MAPT_N_NEXT
+} ip6_mapt_next_t;
+
+typedef enum {
+ IP6_MAPT_ICMP_NEXT_IP4_LOOKUP,
+ IP6_MAPT_ICMP_NEXT_IP4_FRAG,
+ IP6_MAPT_ICMP_NEXT_DROP,
+ IP6_MAPT_ICMP_N_NEXT
+} ip6_mapt_icmp_next_t;
+
+typedef enum {
+ IP6_MAPT_TCP_UDP_NEXT_IP4_LOOKUP,
+ IP6_MAPT_TCP_UDP_NEXT_IP4_FRAG,
+ IP6_MAPT_TCP_UDP_NEXT_DROP,
+ IP6_MAPT_TCP_UDP_N_NEXT
+} ip6_mapt_tcp_udp_next_t;
+
+typedef enum {
+ IP6_MAPT_FRAGMENTED_NEXT_IP4_LOOKUP,
+ IP6_MAPT_FRAGMENTED_NEXT_IP4_FRAG,
+ IP6_MAPT_FRAGMENTED_NEXT_DROP,
+ IP6_MAPT_FRAGMENTED_N_NEXT
+} ip6_mapt_fragmented_next_t;
+
+static_always_inline int
+ip6_map_fragment_cache (ip6_header_t *ip6, ip6_frag_hdr_t *frag, map_domain_t *d, u16 port)
+{
+ u32 *ignore = NULL;
+ map_ip4_reass_lock();
+ map_ip4_reass_t *r = map_ip4_reass_get(map_get_ip4(&ip6->src_address), ip6_map_t_embedded_address(d, &ip6->dst_address),
+ frag_id_6to4(frag->identification),
+ (ip6->protocol == IP_PROTOCOL_ICMP6) ? IP_PROTOCOL_ICMP : ip6->protocol,
+ &ignore);
+ if (r)
+ r->port = port;
+
+ map_ip4_reass_unlock();
+ return !r;
+}
+
+/* Returns the associated port or -1 */
+static_always_inline i32
+ip6_map_fragment_get(ip6_header_t *ip6, ip6_frag_hdr_t *frag, map_domain_t *d)
+{
+ u32 *ignore = NULL;
+ map_ip4_reass_lock();
+ map_ip4_reass_t *r = map_ip4_reass_get(map_get_ip4(&ip6->src_address), ip6_map_t_embedded_address(d, &ip6->dst_address),
+ frag_id_6to4(frag->identification),
+ (ip6->protocol == IP_PROTOCOL_ICMP6) ? IP_PROTOCOL_ICMP : ip6->protocol,
+ &ignore);
+ i32 ret = r?r->port:-1;
+ map_ip4_reass_unlock();
+ return ret;
+}
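+
+/* Editor's note: the two helpers above reuse the IPv4 reassembly table,
+ * keyed by the *translated* IPv4 addresses and the 6-to-4 fragment id. The
+ * first fragment stores the L4 port; later fragments, which carry no L4
+ * header, look it up so they can still be classified into the right MAP
+ * domain and PSID.
+ */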
+
+static_always_inline u8
+ip6_translate_tos(const ip6_header_t *ip6)
+{
+#ifdef IP6_MAP_T_OVERRIDE_TOS
+ return IP6_MAP_T_OVERRIDE_TOS;
+#else
+ return (clib_net_to_host_u32(ip6->ip_version_traffic_class_and_flow_label) & 0x0ff00000) >> 20;
+#endif
+}
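+
+/* Editor's note: the first 32 bits of an IPv6 header are
+ * version(4) | traffic class(8) | flow label(20), so masking with
+ * 0x0ff00000 and shifting right by 20 extracts the 8-bit traffic class,
+ * which becomes the IPv4 TOS byte. E.g. a first word of 0x6e000000 yields
+ * TOS 0xe0.
+ */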
+
+ //TODO: Find the right place in memory for this table
+static u8 icmp6_to_icmp_updater_pointer_table[] =
+ { 0, 1,~0,~0,
+ 2, 2, 9, 8,
+ 12,12,12,12,
+ 12,12,12,12,
+ 12,12,12,12,
+ 12,12,12,12,
+ 16,16,16,16,
+ 16,16,16,16,
+ 16,16,16,16,
+ 16,16,16,16
+ };
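+
+/* Editor's note: this table translates the ICMPv6 parameter problem
+ * pointer into the corresponding IPv4 header offset, per the mapping in
+ * RFC 6145 section 5.2: version/class -> 0/1, payload length -> 2,
+ * next header -> 9 (protocol), hop limit -> 8 (TTL), source address
+ * (bytes 8..23) -> 12, destination address (bytes 24..39) -> 16, and ~0
+ * for fields with no IPv4 equivalent.
+ */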
+
+static_always_inline int
+ip6_icmp_to_icmp6_in_place (icmp46_header_t *icmp, u32 icmp_len,
+ i32 *sender_port, ip6_header_t **inner_ip6)
+{
+ *inner_ip6 = NULL;
+ switch (icmp->type) {
+ case ICMP6_echo_request:
+ *sender_port = ((u16 *)icmp)[2];
+ icmp->type = ICMP4_echo_request;
+ break;
+ case ICMP6_echo_reply:
+ *sender_port = ((u16 *)icmp)[2];
+ icmp->type = ICMP4_echo_reply;
+ break;
+ case ICMP6_destination_unreachable:
+ *inner_ip6 = (ip6_header_t *) u8_ptr_add(icmp, 8);
+ *sender_port = ip6_get_port(*inner_ip6, MAP_RECEIVER, icmp_len);
+
+ switch (icmp->code) {
+ case ICMP6_destination_unreachable_no_route_to_destination: //0
+ case ICMP6_destination_unreachable_beyond_scope_of_source_address: //2
+ case ICMP6_destination_unreachable_address_unreachable: //3
+ icmp->type = ICMP4_destination_unreachable;
+ icmp->code = ICMP4_destination_unreachable_destination_unreachable_host;
+ break;
+ case ICMP6_destination_unreachable_destination_administratively_prohibited: //1
+ icmp->type = ICMP4_destination_unreachable;
+ icmp->code = ICMP4_destination_unreachable_communication_administratively_prohibited;
+ break;
+ case ICMP6_destination_unreachable_port_unreachable:
+ icmp->type = ICMP4_destination_unreachable;
+ icmp->code = ICMP4_destination_unreachable_port_unreachable;
+ break;
+ default:
+ return -1;
+ }
+ break;
+ case ICMP6_packet_too_big:
+ *inner_ip6 = (ip6_header_t *) u8_ptr_add(icmp, 8);
+ *sender_port = ip6_get_port(*inner_ip6, MAP_RECEIVER, icmp_len);
+
+ icmp->type = ICMP4_destination_unreachable;
+ icmp->code = 4;
+ {
+ u32 advertised_mtu = clib_net_to_host_u32(*((u32 *)(icmp + 1)));
+ advertised_mtu -= 20;
+ //FIXME: = minimum(advertised MTU-20, MTU_of_IPv4_nexthop, (MTU_of_IPv6_nexthop)-20)
+ ((u16 *)(icmp))[3] = clib_host_to_net_u16(advertised_mtu);
+ }
+ break;
+
+ case ICMP6_time_exceeded:
+ *inner_ip6 = (ip6_header_t *) u8_ptr_add(icmp, 8);
+ *sender_port = ip6_get_port(*inner_ip6, MAP_RECEIVER, icmp_len);
+
+ icmp->type = ICMP4_time_exceeded;
+ break;
+
+ case ICMP6_parameter_problem:
+ *inner_ip6 = (ip6_header_t *) u8_ptr_add(icmp, 8);
+ *sender_port = ip6_get_port(*inner_ip6, MAP_RECEIVER, icmp_len);
+
+ switch (icmp->code) {
+ case ICMP6_parameter_problem_erroneous_header_field:
+ icmp->type = ICMP4_parameter_problem;
+ icmp->code = ICMP4_parameter_problem_pointer_indicates_error;
+ u32 pointer = clib_net_to_host_u32(*((u32*)(icmp + 1)));
+ if (pointer >= 40)
+ return -1;
+
+ ((u8*)(icmp + 1))[0] = icmp6_to_icmp_updater_pointer_table[pointer];
+ break;
+ case ICMP6_parameter_problem_unrecognized_next_header:
+ icmp->type = ICMP4_destination_unreachable;
+ icmp->code = ICMP4_destination_unreachable_port_unreachable;
+ break;
+ case ICMP6_parameter_problem_unrecognized_option:
+ default:
+ return -1;
+ }
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+static_always_inline void
+_ip6_map_t_icmp (map_domain_t *d, vlib_buffer_t *p, u8 *error)
+{
+ ip6_header_t *ip6, *inner_ip6;
+ ip4_header_t *ip4, *inner_ip4;
+ u32 ip6_pay_len;
+ icmp46_header_t *icmp;
+ i32 sender_port;
+ ip_csum_t csum;
+ u32 ip4_sadr, inner_ip4_dadr;
+
+ ip6 = vlib_buffer_get_current(p);
+ ip6_pay_len = clib_net_to_host_u16(ip6->payload_length);
+ icmp = (icmp46_header_t *)(ip6 + 1);
+ ASSERT(ip6_pay_len + sizeof(*ip6) <= p->current_length);
+
+ if (ip6->protocol != IP_PROTOCOL_ICMP6) {
+ //No extension headers allowed here
+ //TODO: SR header
+ *error = MAP_ERROR_MALFORMED;
+ return;
+ }
+
+ //ICMP messages are never fragmented, so no extension headers are expected here for now
+
+ if (ip6_icmp_to_icmp6_in_place(icmp, ip6_pay_len, &sender_port, &inner_ip6)) {
+ //TODO: In case of 1:1 mapping it is not necessary to have the sender port
+ *error = MAP_ERROR_ICMP;
+ return;
+ }
+
+ if (sender_port < 0) {
+ // In case of 1:1 mapping, we don't care about the port
+ if(d->ea_bits_len == 0 && d->rules) {
+ sender_port = 0;
+ } else {
+ *error = MAP_ERROR_ICMP;
+ return;
+ }
+ }
+
+ //Security check
+ //Note that this prevents an intermediate IPv6 router from answering the request
+ ip4_sadr = map_get_ip4(&ip6->src_address);
+ if (ip6->src_address.as_u64[0] != map_get_pfx_net(d, ip4_sadr, sender_port) ||
+ ip6->src_address.as_u64[1] != map_get_sfx_net(d, ip4_sadr, sender_port)) {
+ *error = MAP_ERROR_SEC_CHECK;
+ return;
+ }
+
+ if (inner_ip6) {
+ u16 *inner_L4_checksum, inner_l4_offset, inner_frag_offset, inner_frag_id;
+ u16 inner_frag_ip4_offset = 0;
+ u8 *inner_l4, inner_protocol, inner_frag_more = 0;
+
+ //We have two headers to translate
+ // FROM
+ // [ IPv6 ]<- ext ->[IC][ IPv6 ]<- ext ->[L4 header ...
+ // Handled cases:
+ // [ IPv6 ][IC][ IPv6 ][L4 header ...
+ // [ IPv6 ][IC][ IPv6 ][Fr][L4 header ...
+ // TO
+ // [ IPv4][IC][ IPv4][L4 header ...
+
+ //TODO: This was already done deep in ip6_icmp_to_icmp6_in_place
+ //We shouldn't have to do it again
+ if (ip6_parse(inner_ip6, ip6_pay_len - 8,
+ &inner_protocol, &inner_l4_offset, &inner_frag_offset)) {
+ *error = MAP_ERROR_MALFORMED;
+ return;
+ }
+
+ inner_l4 = u8_ptr_add(inner_ip6, inner_l4_offset);
+ inner_ip4 = (ip4_header_t *) u8_ptr_add(inner_l4, - sizeof(*inner_ip4));
+ if (inner_frag_offset) {
+ ip6_frag_hdr_t *inner_frag = (ip6_frag_hdr_t *) u8_ptr_add(inner_ip6, inner_frag_offset);
+ inner_frag_id = frag_id_6to4(inner_frag->identification);
+ inner_frag_ip4_offset = ip6_frag_hdr_offset(inner_frag);
+ inner_frag_more = ip6_frag_hdr_more(inner_frag);
+ } else {
+ inner_frag_id = 0;
+ }
+
+ //Do the translation of the inner packet
+ if (inner_protocol == IP_PROTOCOL_TCP) {
+ inner_L4_checksum = (u16 *) u8_ptr_add(inner_l4, 16);
+ } else if (inner_protocol == IP_PROTOCOL_UDP) {
+ inner_L4_checksum = (u16 *) u8_ptr_add(inner_l4, 6);
+ } else if (inner_protocol == IP_PROTOCOL_ICMP6) {
+ icmp46_header_t *inner_icmp = (icmp46_header_t *) inner_l4;
+ csum = inner_icmp->checksum;
+ csum = ip_csum_sub_even(csum, *((u16 *)inner_icmp));
+ //The type can only be echo request or reply here, since ip6_icmp_to_icmp6_in_place succeeded
+ inner_icmp->type = (inner_icmp->type == ICMP6_echo_request) ?
+ ICMP4_echo_request : ICMP4_echo_reply;
+ csum = ip_csum_add_even(csum, *((u16 *)inner_icmp));
+ inner_icmp->checksum = ip_csum_fold(csum);
+ inner_protocol = IP_PROTOCOL_ICMP; //Will be copied to ip6 later
+ inner_L4_checksum = &inner_icmp->checksum;
+ } else {
+ *error = MAP_ERROR_BAD_PROTOCOL;
+ return;
+ }
+
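+ //Incrementally patch the inner L4 checksum (one's-complement arithmetic,
+ //cf. RFC 1624): subtract the IPv6 pseudo-header address words here; the
+ //IPv4 addresses are added back further down once they are known.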
+ csum = *inner_L4_checksum;
+ csum = ip_csum_sub_even(csum, inner_ip6->src_address.as_u64[0]);
+ csum = ip_csum_sub_even(csum, inner_ip6->src_address.as_u64[1]);
+ csum = ip_csum_sub_even(csum, inner_ip6->dst_address.as_u64[0]);
+ csum = ip_csum_sub_even(csum, inner_ip6->dst_address.as_u64[1]);
+
+ //Sanity check of the outer destination address
+ if (ip6->dst_address.as_u64[0] != inner_ip6->src_address.as_u64[0] ||
+ ip6->dst_address.as_u64[1] != inner_ip6->src_address.as_u64[1]) {
+ *error = MAP_ERROR_SEC_CHECK;
+ return;
+ }
+
+ //Security check of inner packet
+ inner_ip4_dadr = map_get_ip4(&inner_ip6->dst_address);
+ if (inner_ip6->dst_address.as_u64[0] != map_get_pfx_net(d, inner_ip4_dadr, sender_port) ||
+ inner_ip6->dst_address.as_u64[1] != map_get_sfx_net(d, inner_ip4_dadr, sender_port)) {
+ *error = MAP_ERROR_SEC_CHECK;
+ return;
+ }
+
+ inner_ip4->dst_address.as_u32 = inner_ip4_dadr;
+ inner_ip4->src_address.as_u32 = ip6_map_t_embedded_address(d, &inner_ip6->src_address);
+ inner_ip4->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ inner_ip4->tos = ip6_translate_tos(inner_ip6);
+ inner_ip4->length = u16_net_add(inner_ip6->payload_length, sizeof(*ip4) + sizeof(*ip6) -
+ inner_l4_offset);
+ inner_ip4->fragment_id = inner_frag_id;
+ inner_ip4->flags_and_fragment_offset =
+ clib_host_to_net_u16(inner_frag_ip4_offset | (inner_frag_more?IP4_HEADER_FLAG_MORE_FRAGMENTS:0));
+ inner_ip4->ttl = inner_ip6->hop_limit;
+ inner_ip4->protocol = inner_protocol;
+ inner_ip4->checksum = ip4_header_checksum(inner_ip4);
+
+ if (inner_ip4->protocol == IP_PROTOCOL_ICMP) {
+ //Remove the remainder of the pseudo-header from the checksum
+ csum = ip_csum_sub_even(csum, clib_host_to_net_u16(IP_PROTOCOL_ICMP6));
+ csum = ip_csum_sub_even(csum, inner_ip4->length - sizeof(*inner_ip4));
+ } else {
+ //Update to new pseudo-header
+ csum = ip_csum_add_even(csum, inner_ip4->src_address.as_u32);
+ csum = ip_csum_add_even(csum, inner_ip4->dst_address.as_u32);
+ }
+ *inner_L4_checksum = ip_csum_fold(csum);
+
+ //Move up icmp header
+ ip4 = (ip4_header_t *) u8_ptr_add(inner_l4, - 2 * sizeof(*ip4) - 8);
+ memcpy(u8_ptr_add(inner_l4, - sizeof(*ip4) - 8), icmp, 8);
+ icmp = (icmp46_header_t *) u8_ptr_add(inner_l4, - sizeof(*ip4) - 8);
+ } else {
+ //Only one header to translate
+ ip4 = (ip4_header_t *) u8_ptr_add(ip6, sizeof(*ip6) - sizeof(*ip4));
+ }
+ vlib_buffer_advance(p, (u32) (((u8 *)ip4) - ((u8 *)ip6)));
+
+ ip4->dst_address.as_u32 = ip6_map_t_embedded_address(d, &ip6->dst_address);
+ ip4->src_address.as_u32 = ip4_sadr;
+ ip4->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip4->tos = ip6_translate_tos(ip6);
+ ip4->fragment_id = 0;
+ ip4->flags_and_fragment_offset = 0;
+ ip4->ttl = ip6->hop_limit;
+ ip4->protocol = IP_PROTOCOL_ICMP;
+ //TODO: Fix the length depending on the offset length
+ ip4->length = u16_net_add(ip6->payload_length,
+ (inner_ip6 == NULL)?sizeof(*ip4):(2*sizeof(*ip4) - sizeof(*ip6)));
+ ip4->checksum = ip4_header_checksum(ip4);
+
+ //TODO: We could do an easy diff-checksum for echo requests/replies
+ //Recompute ICMP checksum
+ icmp->checksum = 0;
+ csum = ip_incremental_checksum(0, icmp, clib_net_to_host_u16(ip4->length) - sizeof(*ip4));
+ icmp->checksum = ~ip_csum_fold (csum);
+}
+
+static uword
+ip6_map_t_icmp (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_t_icmp_node.index);
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ vlib_combined_counter_main_t *cm = map_main.domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ u8 error0;
+ ip6_mapt_icmp_next_t next0;
+ map_domain_t *d0;
+ u16 len0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ error0 = MAP_ERROR_NONE;
+ next0 = IP6_MAPT_ICMP_NEXT_IP4_LOOKUP;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ len0 = clib_net_to_host_u16(((ip6_header_t *)vlib_buffer_get_current(p0))->payload_length);
+ d0 = pool_elt_at_index(map_main.domains, vnet_buffer(p0)->map_t.map_domain_index);
+ _ip6_map_t_icmp(d0, p0, &error0);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ //Send to fragmentation node if necessary
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next0 = IP6_MAPT_ICMP_NEXT_IP4_FRAG;
+ }
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index,
+ vnet_buffer(p0)->map_t.map_domain_index, 1,
+ len0);
+ } else {
+ next0 = IP6_MAPT_ICMP_NEXT_DROP;
+ }
+
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+static uword
+ip6_map_t_fragmented (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+#ifdef IP6_MAP_T_DUAL_LOOP
+ while(n_left_from >= 4 && n_left_to_next >= 2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ ip6_header_t *ip60, *ip61;
+ ip6_frag_hdr_t *frag0, *frag1;
+ ip4_header_t *ip40, *ip41;
+ u16 frag_id0, frag_offset0,
+ frag_id1, frag_offset1;
+ u8 frag_more0, frag_more1;
+ ip6_mapt_fragmented_next_t next0, next1;
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ next0 = IP6_MAPT_FRAGMENTED_NEXT_IP4_LOOKUP;
+ next1 = IP6_MAPT_FRAGMENTED_NEXT_IP4_LOOKUP;
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+ ip60 = vlib_buffer_get_current(p0);
+ ip61 = vlib_buffer_get_current(p1);
+ frag0 = (ip6_frag_hdr_t *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset);
+ frag1 = (ip6_frag_hdr_t *)u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.frag_offset);
+ ip40 = (ip4_header_t *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ ip41 = (ip4_header_t *)u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.l4_offset - sizeof(*ip40));
+ vlib_buffer_advance(p0, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ vlib_buffer_advance(p1, vnet_buffer(p1)->map_t.v6.l4_offset - sizeof(*ip40));
+
+ frag_id0 = frag_id_6to4(frag0->identification);
+ frag_id1 = frag_id_6to4(frag1->identification);
+ frag_more0 = ip6_frag_hdr_more(frag0);
+ frag_more1 = ip6_frag_hdr_more(frag1);
+ frag_offset0 = ip6_frag_hdr_offset(frag0);
+ frag_offset1 = ip6_frag_hdr_offset(frag1);
+
+ ip40->dst_address.as_u32 = vnet_buffer(p0)->map_t.v6.daddr;
+ ip41->dst_address.as_u32 = vnet_buffer(p1)->map_t.v6.daddr;
+ ip40->src_address.as_u32 = vnet_buffer(p0)->map_t.v6.saddr;
+ ip41->src_address.as_u32 = vnet_buffer(p1)->map_t.v6.saddr;
+ ip40->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip41->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip40->tos = ip6_translate_tos(ip60);
+ ip41->tos = ip6_translate_tos(ip61);
+ ip40->length = u16_net_add(ip60->payload_length,
+ sizeof(*ip40) - vnet_buffer(p0)->map_t.v6.l4_offset + sizeof(*ip60));
+ ip41->length = u16_net_add(ip61->payload_length,
+ sizeof(*ip40) - vnet_buffer(p1)->map_t.v6.l4_offset + sizeof(*ip60));
+ ip40->fragment_id = frag_id0;
+ ip41->fragment_id = frag_id1;
+ ip40->flags_and_fragment_offset =
+ clib_host_to_net_u16(frag_offset0 | (frag_more0?IP4_HEADER_FLAG_MORE_FRAGMENTS:0));
+ ip41->flags_and_fragment_offset =
+ clib_host_to_net_u16(frag_offset1 | (frag_more1?IP4_HEADER_FLAG_MORE_FRAGMENTS:0));
+ ip40->ttl = ip60->hop_limit;
+ ip41->ttl = ip61->hop_limit;
+ ip40->protocol = (vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_ICMP6)?
+ IP_PROTOCOL_ICMP:vnet_buffer(p0)->map_t.v6.l4_protocol;
+ ip41->protocol = (vnet_buffer(p1)->map_t.v6.l4_protocol == IP_PROTOCOL_ICMP6)?
+ IP_PROTOCOL_ICMP:vnet_buffer(p1)->map_t.v6.l4_protocol;
+ ip40->checksum = ip4_header_checksum(ip40);
+ ip41->checksum = ip4_header_checksum(ip41);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next0 = IP6_MAPT_FRAGMENTED_NEXT_IP4_FRAG;
+ }
+
+ if(vnet_buffer(p1)->map_t.mtu < p1->current_length) {
+ vnet_buffer(p1)->ip_frag.mtu = vnet_buffer(p1)->map_t.mtu;
+ vnet_buffer(p1)->ip_frag.header_offset = 0;
+ vnet_buffer(p1)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next1 = IP6_MAPT_FRAGMENTED_NEXT_IP4_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index,
+ to_next, n_left_to_next, pi0, pi1,
+ next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip6_header_t *ip60;
+ ip6_frag_hdr_t *frag0;
+ ip4_header_t *ip40;
+ u16 frag_id0;
+ u8 frag_more0;
+ u16 frag_offset0;
+ ip6_mapt_fragmented_next_t next0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ next0 = IP6_MAPT_FRAGMENTED_NEXT_IP4_LOOKUP;
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ frag0 = (ip6_frag_hdr_t *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset);
+ ip40 = (ip4_header_t *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ vlib_buffer_advance(p0, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+
+ frag_id0 = frag_id_6to4(frag0->identification);
+ frag_more0 = ip6_frag_hdr_more(frag0);
+ frag_offset0 = ip6_frag_hdr_offset(frag0);
+
+ ip40->dst_address.as_u32 = vnet_buffer(p0)->map_t.v6.daddr;
+ ip40->src_address.as_u32 = vnet_buffer(p0)->map_t.v6.saddr;
+ ip40->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip40->tos = ip6_translate_tos(ip60);
+ ip40->length = u16_net_add(ip60->payload_length,
+ sizeof(*ip40) - vnet_buffer(p0)->map_t.v6.l4_offset + sizeof(*ip60));
+ ip40->fragment_id = frag_id0;
+ ip40->flags_and_fragment_offset =
+ clib_host_to_net_u16(frag_offset0 | (frag_more0?IP4_HEADER_FLAG_MORE_FRAGMENTS:0));
+ ip40->ttl = ip60->hop_limit;
+ ip40->protocol = (vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_ICMP6)?
+ IP_PROTOCOL_ICMP:vnet_buffer(p0)->map_t.v6.l4_protocol;
+ ip40->checksum = ip4_header_checksum(ip40);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ //Send to fragmentation node if necessary
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next0 = IP6_MAPT_FRAGMENTED_NEXT_IP4_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+static uword
+ip6_map_t_tcp_udp (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+#ifdef IP6_MAP_T_DUAL_LOOP
+ while(n_left_from >= 4 && n_left_to_next >= 2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ ip6_header_t *ip60, *ip61;
+ ip_csum_t csum0, csum1;
+ ip4_header_t *ip40, *ip41;
+ u16 fragment_id0, flags0, *checksum0,
+ fragment_id1, flags1, *checksum1;
+ ip6_mapt_tcp_udp_next_t next0, next1;
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ next0 = IP6_MAPT_TCP_UDP_NEXT_IP4_LOOKUP;
+ next1 = IP6_MAPT_TCP_UDP_NEXT_IP4_LOOKUP;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+ ip60 = vlib_buffer_get_current(p0);
+ ip61 = vlib_buffer_get_current(p1);
+ ip40 = (ip4_header_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ ip41 = (ip4_header_t *) u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.l4_offset - sizeof(*ip40));
+ vlib_buffer_advance(p0, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ vlib_buffer_advance(p1, vnet_buffer(p1)->map_t.v6.l4_offset - sizeof(*ip40));
+ checksum0 = (u16 *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.checksum_offset);
+ checksum1 = (u16 *) u8_ptr_add(ip61, vnet_buffer(p1)->map_t.checksum_offset);
+
+ csum0 = ip_csum_sub_even(*checksum0, ip60->src_address.as_u64[0]);
+ csum1 = ip_csum_sub_even(*checksum1, ip61->src_address.as_u64[0]);
+ csum0 = ip_csum_sub_even(csum0, ip60->src_address.as_u64[1]);
+ csum1 = ip_csum_sub_even(csum1, ip61->src_address.as_u64[1]);
+ csum0 = ip_csum_sub_even(csum0, ip60->dst_address.as_u64[0]);
+ csum1 = ip_csum_sub_even(csum1, ip61->dst_address.as_u64[0]);
+ csum0 = ip_csum_sub_even(csum0, ip60->dst_address.as_u64[1]);
+ csum1 = ip_csum_sub_even(csum1, ip61->dst_address.as_u64[1]);
+ csum0 = ip_csum_add_even(csum0, vnet_buffer(p0)->map_t.v6.daddr);
+ csum1 = ip_csum_add_even(csum1, vnet_buffer(p1)->map_t.v6.daddr);
+ csum0 = ip_csum_add_even(csum0, vnet_buffer(p0)->map_t.v6.saddr);
+ csum1 = ip_csum_add_even(csum1, vnet_buffer(p1)->map_t.v6.saddr);
+ *checksum0 = ip_csum_fold(csum0);
+ *checksum1 = ip_csum_fold(csum1);
+
+ if (PREDICT_FALSE(vnet_buffer(p0)->map_t.v6.frag_offset)) {
+ ip6_frag_hdr_t *hdr = (ip6_frag_hdr_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset);
+ fragment_id0 = frag_id_6to4(hdr->identification);
+ flags0 = clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS);
+ } else {
+ fragment_id0 = 0;
+ flags0 = 0;
+ }
+
+ if (PREDICT_FALSE(vnet_buffer(p1)->map_t.v6.frag_offset)) {
+ ip6_frag_hdr_t *hdr = (ip6_frag_hdr_t *) u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.frag_offset);
+ fragment_id1 = frag_id_6to4(hdr->identification);
+ flags1 = clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS);
+ } else {
+ fragment_id1 = 0;
+ flags1 = 0;
+ }
+
+ ip40->dst_address.as_u32 = vnet_buffer(p0)->map_t.v6.daddr;
+ ip41->dst_address.as_u32 = vnet_buffer(p1)->map_t.v6.daddr;
+ ip40->src_address.as_u32 = vnet_buffer(p0)->map_t.v6.saddr;
+ ip41->src_address.as_u32 = vnet_buffer(p1)->map_t.v6.saddr;
+ ip40->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip41->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip40->tos = ip6_translate_tos(ip60);
+ ip41->tos = ip6_translate_tos(ip61);
+ ip40->length = u16_net_add(ip60->payload_length,
+ sizeof(*ip40) + sizeof(*ip60) - vnet_buffer(p0)->map_t.v6.l4_offset);
+ ip41->length = u16_net_add(ip61->payload_length,
+ sizeof(*ip40) + sizeof(*ip60) - vnet_buffer(p1)->map_t.v6.l4_offset);
+ ip40->fragment_id = fragment_id0;
+ ip41->fragment_id = fragment_id1;
+ ip40->flags_and_fragment_offset = flags0;
+ ip41->flags_and_fragment_offset = flags1;
+ ip40->ttl = ip60->hop_limit;
+ ip41->ttl = ip61->hop_limit;
+ ip40->protocol = vnet_buffer(p0)->map_t.v6.l4_protocol;
+ ip41->protocol = vnet_buffer(p1)->map_t.v6.l4_protocol;
+ ip40->checksum = ip4_header_checksum(ip40);
+ ip41->checksum = ip4_header_checksum(ip41);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next0 = IP6_MAPT_TCP_UDP_NEXT_IP4_FRAG;
+ }
+
+ if(vnet_buffer(p1)->map_t.mtu < p1->current_length) {
+ vnet_buffer(p1)->ip_frag.mtu = vnet_buffer(p1)->map_t.mtu;
+ vnet_buffer(p1)->ip_frag.header_offset = 0;
+ vnet_buffer(p1)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next1 = IP6_MAPT_TCP_UDP_NEXT_IP4_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next,
+ n_left_to_next, pi0, pi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip6_header_t *ip60;
+ u16 *checksum0;
+ ip_csum_t csum0;
+ ip4_header_t *ip40;
+ u16 fragment_id0;
+ u16 flags0;
+ ip6_mapt_tcp_udp_next_t next0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ next0 = IP6_MAPT_TCP_UDP_NEXT_IP4_LOOKUP;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ ip40 = (ip4_header_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ vlib_buffer_advance(p0, vnet_buffer(p0)->map_t.v6.l4_offset - sizeof(*ip40));
+ checksum0 = (u16 *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.checksum_offset);
+
+ //TODO: This can probably be optimized
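+ //Translation is checksum-neutral except for the addresses: the L4 payload,
+ //ports and pseudo-header length are unchanged, and TCP/UDP keep the same
+ //protocol number, so only the address words enter the adjustment.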
+ csum0 = ip_csum_sub_even(*checksum0, ip60->src_address.as_u64[0]);
+ csum0 = ip_csum_sub_even(csum0, ip60->src_address.as_u64[1]);
+ csum0 = ip_csum_sub_even(csum0, ip60->dst_address.as_u64[0]);
+ csum0 = ip_csum_sub_even(csum0, ip60->dst_address.as_u64[1]);
+ csum0 = ip_csum_add_even(csum0, vnet_buffer(p0)->map_t.v6.daddr);
+ csum0 = ip_csum_add_even(csum0, vnet_buffer(p0)->map_t.v6.saddr);
+ *checksum0 = ip_csum_fold(csum0);
+
+ if (PREDICT_FALSE(vnet_buffer(p0)->map_t.v6.frag_offset)) {
+ //Only the first fragment
+ ip6_frag_hdr_t *hdr = (ip6_frag_hdr_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset);
+ fragment_id0 = frag_id_6to4(hdr->identification);
+ flags0 = clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS);
+ } else {
+ fragment_id0 = 0;
+ flags0 = 0;
+ }
+
+ ip40->dst_address.as_u32 = vnet_buffer(p0)->map_t.v6.daddr;
+ ip40->src_address.as_u32 = vnet_buffer(p0)->map_t.v6.saddr;
+ ip40->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS;
+ ip40->tos = ip6_translate_tos(ip60);
+ ip40->length = u16_net_add(ip60->payload_length,
+ sizeof(*ip40) + sizeof(*ip60) - vnet_buffer(p0)->map_t.v6.l4_offset);
+ ip40->fragment_id = fragment_id0;
+ ip40->flags_and_fragment_offset = flags0;
+ ip40->ttl = ip60->hop_limit;
+ ip40->protocol = vnet_buffer(p0)->map_t.v6.l4_protocol;
+ ip40->checksum = ip4_header_checksum(ip40);
+
+ if(vnet_buffer(p0)->map_t.mtu < p0->current_length) {
+ //Send to fragmentation node if necessary
+ vnet_buffer(p0)->ip_frag.mtu = vnet_buffer(p0)->map_t.mtu;
+ vnet_buffer(p0)->ip_frag.header_offset = 0;
+ vnet_buffer(p0)->ip_frag.next_index = IP4_FRAG_NEXT_IP4_LOOKUP;
+ next0 = IP6_MAPT_TCP_UDP_NEXT_IP4_FRAG;
+ }
+
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
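+//Classify an incoming IPv6 packet: choose the next node (tcp-udp, icmp or
+//fragmented) and extract the source port used by the MAP security check.
+//*src_port0 is left at -1 when no port can be determined.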
+static_always_inline void
+ip6_map_t_classify(vlib_buffer_t *p0, ip6_header_t *ip60,
+ map_domain_t *d0, i32 *src_port0,
+ u8 *error0, ip6_mapt_next_t *next0,
+ u32 l4_len0, ip6_frag_hdr_t *frag0)
+{
+ if (PREDICT_FALSE(vnet_buffer(p0)->map_t.v6.frag_offset &&
+ ip6_frag_hdr_offset(frag0))) {
+ *next0 = IP6_MAPT_NEXT_MAPT_FRAGMENTED;
+ if(d0->ea_bits_len == 0 && d0->rules) {
+ *src_port0 = 0;
+ } else {
+ *src_port0 = ip6_map_fragment_get(ip60, frag0, d0);
+ *error0 = (*src_port0 != -1) ? *error0 : MAP_ERROR_FRAGMENT_DROPPED;
+ }
+ } else if (PREDICT_TRUE(vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_TCP)) {
+ *error0 = l4_len0 < sizeof(tcp_header_t) ? MAP_ERROR_MALFORMED : *error0;
+ vnet_buffer(p0)->map_t.checksum_offset = vnet_buffer(p0)->map_t.v6.l4_offset + 16;
+ *next0 = IP6_MAPT_NEXT_MAPT_TCP_UDP;
+ *src_port0 = (i32) *((u16*)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset));
+ } else if (PREDICT_TRUE(vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_UDP)) {
+ *error0 = l4_len0 < sizeof(udp_header_t) ? MAP_ERROR_MALFORMED : *error0;
+ vnet_buffer(p0)->map_t.checksum_offset = vnet_buffer(p0)->map_t.v6.l4_offset + 6;
+ *next0 = IP6_MAPT_NEXT_MAPT_TCP_UDP;
+ *src_port0 = (i32) *((u16*)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset));
+ } else if (vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_ICMP6) {
+ *error0 = l4_len0 < sizeof(icmp46_header_t) ? MAP_ERROR_MALFORMED : *error0;
+ *next0 = IP6_MAPT_NEXT_MAPT_ICMP;
+ if(d0->ea_bits_len == 0 && d0->rules) {
+ *src_port0 = 0;
+ } else if (((icmp46_header_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset))->type == ICMP6_echo_reply ||
+ ((icmp46_header_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset))->type == ICMP6_echo_request) {
+ *src_port0 = (i32) *((u16 *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset + 6));
+ }
+ } else {
+ //TODO: In case of 1:1 mapping, it might be possible to do something with those packets.
+ *error0 = MAP_ERROR_BAD_PROTOCOL;
+ }
+}
+
+static uword
+ip6_map_t (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_map_t_node.index);
+ vlib_combined_counter_main_t *cm = map_main.domain_counters;
+ u32 cpu_index = os_get_cpu_number();
+
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+#ifdef IP6_MAP_T_DUAL_LOOP
+ while (n_left_from >= 4 && n_left_to_next >=2) {
+ u32 pi0, pi1;
+ vlib_buffer_t *p0, *p1;
+ ip6_header_t *ip60, *ip61;
+ u8 error0, error1;
+ ip6_mapt_next_t next0, next1;
+ u32 l4_len0, l4_len1;
+ i32 src_port0, src_port1;
+ map_domain_t *d0, *d1;
+ ip6_frag_hdr_t *frag0, *frag1;
+ u32 saddr0, saddr1;
+ next0 = next1 = 0; //Initialized to silence "may be used uninitialized" compiler warnings
+
+ pi0 = to_next[0] = from[0];
+ pi1 = to_next[1] = from[1];
+ from += 2;
+ n_left_from -= 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+
+ error0 = MAP_ERROR_NONE;
+ error1 = MAP_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ p1 = vlib_get_buffer(vm, pi1);
+ ip60 = vlib_buffer_get_current(p0);
+ ip61 = vlib_buffer_get_current(p1);
+
+ saddr0 = map_get_ip4(&ip60->src_address);
+ saddr1 = map_get_ip4(&ip61->src_address);
+ d0 = ip6_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX],
+ (ip4_address_t *)&saddr0,
+ &vnet_buffer(p0)->map_t.map_domain_index, &error0);
+ d1 = ip6_map_get_domain(vnet_buffer(p1)->ip.adj_index[VLIB_TX],
+ (ip4_address_t *)&saddr1,
+ &vnet_buffer(p1)->map_t.map_domain_index, &error1);
+
+ vnet_buffer(p0)->map_t.v6.saddr = saddr0;
+ vnet_buffer(p1)->map_t.v6.saddr = saddr1;
+ vnet_buffer(p0)->map_t.v6.daddr = ip6_map_t_embedded_address(d0, &ip60->dst_address);
+ vnet_buffer(p1)->map_t.v6.daddr = ip6_map_t_embedded_address(d1, &ip61->dst_address);
+ vnet_buffer(p0)->map_t.mtu = d0->mtu ? d0->mtu : ~0;
+ vnet_buffer(p1)->map_t.mtu = d1->mtu ? d1->mtu : ~0;
+
+ if (PREDICT_FALSE(ip6_parse(ip60, p0->current_length,
+ &(vnet_buffer(p0)->map_t.v6.l4_protocol),
+ &(vnet_buffer(p0)->map_t.v6.l4_offset),
+ &(vnet_buffer(p0)->map_t.v6.frag_offset)))) {
+ error0 = MAP_ERROR_MALFORMED;
+ next0 = IP6_MAPT_NEXT_DROP;
+ }
+
+ if (PREDICT_FALSE(ip6_parse(ip61, p1->current_length,
+ &(vnet_buffer(p1)->map_t.v6.l4_protocol),
+ &(vnet_buffer(p1)->map_t.v6.l4_offset),
+ &(vnet_buffer(p1)->map_t.v6.frag_offset)))) {
+ error1 = MAP_ERROR_MALFORMED;
+ next1 = IP6_MAPT_NEXT_DROP;
+ }
+
+ src_port0 = src_port1 = -1;
+ l4_len0 = (u32)clib_net_to_host_u16(ip60->payload_length) +
+ sizeof(*ip60) - vnet_buffer(p0)->map_t.v6.l4_offset;
+ l4_len1 = (u32)clib_net_to_host_u16(ip61->payload_length) +
+ sizeof(*ip60) - vnet_buffer(p1)->map_t.v6.l4_offset;
+ frag0 = (ip6_frag_hdr_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset);
+ frag1 = (ip6_frag_hdr_t *) u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.frag_offset);
+
+ ip6_map_t_classify(p0, ip60, d0, &src_port0, &error0, &next0, l4_len0, frag0);
+ ip6_map_t_classify(p1, ip61, d1, &src_port1, &error1, &next1, l4_len1, frag1);
+
+ if (PREDICT_FALSE((src_port0 != -1) && (
+ ip60->src_address.as_u64[0] != map_get_pfx_net(d0, vnet_buffer(p0)->map_t.v6.saddr, src_port0) ||
+ ip60->src_address.as_u64[1] != map_get_sfx_net(d0, vnet_buffer(p0)->map_t.v6.saddr, src_port0)))) {
+ error0 = MAP_ERROR_SEC_CHECK;
+ }
+
+ if (PREDICT_FALSE((src_port1 != -1) && (
+ ip61->src_address.as_u64[0] != map_get_pfx_net(d1, vnet_buffer(p1)->map_t.v6.saddr, src_port1) ||
+ ip61->src_address.as_u64[1] != map_get_sfx_net(d1, vnet_buffer(p1)->map_t.v6.saddr, src_port1)))) {
+ error1 = MAP_ERROR_SEC_CHECK;
+ }
+
+ if (PREDICT_FALSE(vnet_buffer(p0)->map_t.v6.frag_offset &&
+ !ip6_frag_hdr_offset((ip6_frag_hdr_t *)
+ u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset))) &&
+ (src_port0 != -1) && (d0->ea_bits_len != 0 || !d0->rules) && (error0 == MAP_ERROR_NONE)) {
+ ip6_map_fragment_cache(ip60,
+ (ip6_frag_hdr_t *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset),
+ d0, src_port0);
+ }
+
+ if (PREDICT_FALSE(vnet_buffer(p1)->map_t.v6.frag_offset &&
+ !ip6_frag_hdr_offset((ip6_frag_hdr_t *)
+ u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.frag_offset))) &&
+ (src_port1 != -1) && (d1->ea_bits_len != 0 || !d1->rules) && (error1 == MAP_ERROR_NONE)) {
+ ip6_map_fragment_cache(ip61,
+ (ip6_frag_hdr_t *)u8_ptr_add(ip61, vnet_buffer(p1)->map_t.v6.frag_offset),
+ d1, src_port1);
+ }
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index,
+ vnet_buffer(p0)->map_t.map_domain_index, 1,
+ clib_net_to_host_u16(ip60->payload_length));
+ }
+
+ if (PREDICT_TRUE(error1 == MAP_ERROR_NONE && next1 != IP6_MAPT_NEXT_MAPT_ICMP)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index,
+ vnet_buffer(p1)->map_t.map_domain_index, 1,
+ clib_net_to_host_u16(ip61->payload_length));
+ }
+
+ next0 = (error0 != MAP_ERROR_NONE) ? IP6_MAPT_NEXT_DROP : next0;
+ next1 = (error1 != MAP_ERROR_NONE) ? IP6_MAPT_NEXT_DROP : next1;
+ p0->error = error_node->errors[error0];
+ p1->error = error_node->errors[error1];
+ vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, n_left_to_next, pi0, pi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ ip6_header_t *ip60;
+ u8 error0;
+ u32 l4_len0;
+ i32 src_port0;
+ map_domain_t *d0;
+ ip6_frag_hdr_t *frag0;
+ ip6_mapt_next_t next0 = 0;
+ u32 saddr;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ error0 = MAP_ERROR_NONE;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ //Save saddr in a different variable to not overwrite ip.adj_index
+ saddr = map_get_ip4(&ip60->src_address);
+ d0 = ip6_map_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX],
+ (ip4_address_t *)&saddr,
+ &vnet_buffer(p0)->map_t.map_domain_index, &error0);
+
+ //FIXME: Handle the case where d0 is NULL (no matching domain)
+ vnet_buffer(p0)->map_t.v6.saddr = saddr;
+ vnet_buffer(p0)->map_t.v6.daddr = ip6_map_t_embedded_address(d0, &ip60->dst_address);
+ vnet_buffer(p0)->map_t.mtu = d0->mtu ? d0->mtu : ~0;
+
+ if (PREDICT_FALSE(ip6_parse(ip60, p0->current_length,
+ &(vnet_buffer(p0)->map_t.v6.l4_protocol),
+ &(vnet_buffer(p0)->map_t.v6.l4_offset),
+ &(vnet_buffer(p0)->map_t.v6.frag_offset)))) {
+ error0 = MAP_ERROR_MALFORMED;
+ next0 = IP6_MAPT_NEXT_DROP;
+ }
+
+ src_port0 = -1;
+ l4_len0 = (u32)clib_net_to_host_u16(ip60->payload_length) +
+ sizeof(*ip60) - vnet_buffer(p0)->map_t.v6.l4_offset;
+ frag0 = (ip6_frag_hdr_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset);
+
+
+ if (PREDICT_FALSE(vnet_buffer(p0)->map_t.v6.frag_offset &&
+ ip6_frag_hdr_offset(frag0))) {
+ src_port0 = ip6_map_fragment_get(ip60, frag0, d0);
+ error0 = (src_port0 != -1) ? error0 : MAP_ERROR_FRAGMENT_MEMORY;
+ next0 = IP6_MAPT_NEXT_MAPT_FRAGMENTED;
+ } else if (PREDICT_TRUE(vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_TCP)) {
+ error0 = l4_len0 < sizeof(tcp_header_t) ? MAP_ERROR_MALFORMED : error0;
+ vnet_buffer(p0)->map_t.checksum_offset = vnet_buffer(p0)->map_t.v6.l4_offset + 16;
+ next0 = IP6_MAPT_NEXT_MAPT_TCP_UDP;
+ src_port0 = (i32) *((u16*)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset));
+ } else if (PREDICT_TRUE(vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_UDP)) {
+ error0 = l4_len0 < sizeof(udp_header_t) ? MAP_ERROR_MALFORMED : error0;
+ vnet_buffer(p0)->map_t.checksum_offset = vnet_buffer(p0)->map_t.v6.l4_offset + 6;
+ next0 = IP6_MAPT_NEXT_MAPT_TCP_UDP;
+ src_port0 = (i32) *((u16*)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset));
+ } else if (vnet_buffer(p0)->map_t.v6.l4_protocol == IP_PROTOCOL_ICMP6) {
+ error0 = l4_len0 < sizeof(icmp46_header_t) ? MAP_ERROR_MALFORMED : error0;
+ next0 = IP6_MAPT_NEXT_MAPT_ICMP;
+ if (((icmp46_header_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset))->type == ICMP6_echo_reply ||
+ ((icmp46_header_t *) u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset))->type == ICMP6_echo_request)
+ src_port0 = (i32) *((u16 *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.l4_offset + 6));
+ } else {
+ //TODO: In case of 1:1 mapping, it might be possible to do something with those packets.
+ error0 = MAP_ERROR_BAD_PROTOCOL;
+ }
+
+ //Security check
+ if (PREDICT_FALSE((src_port0 != -1) && (
+ ip60->src_address.as_u64[0] != map_get_pfx_net(d0, vnet_buffer(p0)->map_t.v6.saddr, src_port0) ||
+ ip60->src_address.as_u64[1] != map_get_sfx_net(d0, vnet_buffer(p0)->map_t.v6.saddr, src_port0)))) {
+ //Security check when src_port0 is known (non-first fragment, UDP or TCP)
+ error0 = MAP_ERROR_SEC_CHECK;
+ }
+
+ //The first fragment of a fragmented packet needs to be cached for the following fragments
+ if (PREDICT_FALSE(vnet_buffer(p0)->map_t.v6.frag_offset &&
+ !ip6_frag_hdr_offset((ip6_frag_hdr_t *)
+ u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset))) &&
+ (src_port0 != -1) && (d0->ea_bits_len != 0 || !d0->rules) && (error0 == MAP_ERROR_NONE)) {
+ ip6_map_fragment_cache(ip60,
+ (ip6_frag_hdr_t *)u8_ptr_add(ip60, vnet_buffer(p0)->map_t.v6.frag_offset),
+ d0, src_port0);
+ }
+
+ if (PREDICT_TRUE(error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) {
+ vlib_increment_combined_counter(cm + MAP_DOMAIN_COUNTER_RX, cpu_index,
+ vnet_buffer(p0)->map_t.map_domain_index, 1,
+ clib_net_to_host_u16(ip60->payload_length));
+ }
+
+ next0 = (error0 != MAP_ERROR_NONE) ? IP6_MAPT_NEXT_DROP : next0;
+ p0->error = error_node->errors[error0];
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index,
+ to_next, n_left_to_next, pi0,
+ next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ return frame->n_vectors;
+}
+
+static char *map_t_error_strings[] = {
+#define _(sym,string) string,
+ foreach_map_error
+#undef _
+};
+
+VLIB_REGISTER_NODE(ip6_map_t_fragmented_node) = {
+ .function = ip6_map_t_fragmented,
+ .name = "ip6-map-t-fragmented",
+ .vector_size = sizeof (u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP6_MAPT_FRAGMENTED_N_NEXT,
+ .next_nodes = {
+ [IP6_MAPT_FRAGMENTED_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_MAPT_FRAGMENTED_NEXT_IP4_FRAG] = IP4_FRAG_NODE_NAME,
+ [IP6_MAPT_FRAGMENTED_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip6_map_t_icmp_node) = {
+ .function = ip6_map_t_icmp,
+ .name = "ip6-map-t-icmp",
+ .vector_size = sizeof (u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP6_MAPT_ICMP_N_NEXT,
+ .next_nodes = {
+ [IP6_MAPT_ICMP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_MAPT_ICMP_NEXT_IP4_FRAG] = IP4_FRAG_NODE_NAME,
+ [IP6_MAPT_ICMP_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip6_map_t_tcp_udp_node) = {
+ .function = ip6_map_t_tcp_udp,
+ .name = "ip6-map-t-tcp-udp",
+ .vector_size = sizeof (u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP6_MAPT_TCP_UDP_N_NEXT,
+ .next_nodes = {
+ [IP6_MAPT_TCP_UDP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_MAPT_TCP_UDP_NEXT_IP4_FRAG] = IP4_FRAG_NODE_NAME,
+ [IP6_MAPT_TCP_UDP_NEXT_DROP] = "error-drop",
+ },
+};
+
+VLIB_REGISTER_NODE(ip6_map_t_node) = {
+ .function = ip6_map_t,
+ .name = "ip6-map-t",
+ .vector_size = sizeof(u32),
+ .format_trace = format_map_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = MAP_N_ERROR,
+ .error_strings = map_t_error_strings,
+
+ .n_next_nodes = IP6_MAPT_N_NEXT,
+ .next_nodes = {
+ [IP6_MAPT_NEXT_MAPT_TCP_UDP] = "ip6-map-t-tcp-udp",
+ [IP6_MAPT_NEXT_MAPT_ICMP] = "ip6-map-t-icmp",
+ [IP6_MAPT_NEXT_MAPT_FRAGMENTED] = "ip6-map-t-fragmented",
+ [IP6_MAPT_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/map/ip6_sixrd.c b/vnet/vnet/map/ip6_sixrd.c
new file mode 100644
index 00000000000..0bd0cf3a303
--- /dev/null
+++ b/vnet/vnet/map/ip6_sixrd.c
@@ -0,0 +1,129 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+/*
+ * Defines used for testing various optimisation schemes
+ */
+#define SIXRD_ENCAP_DUAL 0
+
+#include "sixrd.h"
+
+vlib_node_registration_t ip6_sixrd_node;
+
+typedef enum {
+ IP6_SIXRD_NEXT_IP4_LOOKUP,
+ IP6_SIXRD_NEXT_DROP,
+ IP6_SIXRD_N_NEXT,
+} ip6_sixrd_next_t;
+
+/*
+ * ip6_sixrd
+ */
+static uword
+ip6_sixrd (vlib_main_t *vm,
+ vlib_node_runtime_t *node,
+ vlib_frame_t *frame)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, ip6_sixrd_node.index);
+ u32 encap = 0;
+ from = vlib_frame_vector_args(frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0;
+ vlib_buffer_t *p0;
+ sixrd_domain_t *d0;
+ u8 error0 = SIXRD_ERROR_NONE;
+ ip6_header_t *ip60;
+ ip4_header_t *ip4h0;
+ u32 next0 = IP6_SIXRD_NEXT_IP4_LOOKUP;
+ u32 sixrd_domain_index0 = ~0;
+
+ pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+
+ p0 = vlib_get_buffer(vm, pi0);
+ ip60 = vlib_buffer_get_current(p0);
+ // p0->current_length = clib_net_to_host_u16(ip40->length);
+ d0 = ip6_sixrd_get_domain(vnet_buffer(p0)->ip.adj_index[VLIB_TX], &sixrd_domain_index0);
+ ASSERT(d0);
+
+ /* SIXRD calc */
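+ /* sixrd_get_addr() recovers the IPv4 destination embedded in the IPv6
+ destination per the 6rd mapping (RFC 5969); the new total length is the
+ IPv6 payload plus the 40-byte IPv6 header (now payload) plus the
+ 20-byte IPv4 header, hence the +60 below. */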
+ u64 dal60 = clib_net_to_host_u64(ip60->dst_address.as_u64[0]);
+ u32 da40 = sixrd_get_addr(d0, dal60);
+ u16 len = clib_net_to_host_u16(ip60->payload_length) + 60;
+ if (da40 == 0) error0 = SIXRD_ERROR_UNKNOWN;
+
+ /* construct ipv4 header */
+ vlib_buffer_advance(p0, - (sizeof(ip4_header_t)));
+ ip4h0 = vlib_buffer_get_current(p0);
+ vnet_buffer(p0)->sw_if_index[VLIB_TX] = (u32)~0;
+ ip4h0->ip_version_and_header_length = 0x45;
+ ip4h0->tos = 0;
+ ip4h0->length = clib_host_to_net_u16(len);
+ ip4h0->fragment_id = 0;
+ ip4h0->flags_and_fragment_offset = 0;
+ ip4h0->ttl = 0x40;
+ ip4h0->protocol = IP_PROTOCOL_IPV6;
+ ip4h0->src_address = d0->ip4_src;
+ ip4h0->dst_address.as_u32 = clib_host_to_net_u32(da40);
+ ip4h0->checksum = ip4_header_checksum(ip4h0);
+
+ next0 = error0 == SIXRD_ERROR_NONE ? IP6_SIXRD_NEXT_IP4_LOOKUP : IP6_SIXRD_NEXT_DROP;
+
+ if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) {
+ sixrd_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof(*tr));
+ tr->sixrd_domain_index = sixrd_domain_index0;
+ }
+
+ p0->error = error_node->errors[error0];
+ if (PREDICT_TRUE(error0 == SIXRD_ERROR_NONE)) encap++;
+
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next0);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter(vm, ip6_sixrd_node.index, SIXRD_ERROR_ENCAPSULATED, encap);
+
+ return frame->n_vectors;
+}
+
+static char *sixrd_error_strings[] = {
+#define _(sym,string) string,
+ foreach_sixrd_error
+#undef _
+};
+
+VLIB_REGISTER_NODE(ip6_sixrd_node) = {
+ .function = ip6_sixrd,
+ .name = "ip6-sixrd",
+ .vector_size = sizeof(u32),
+ .format_trace = format_sixrd_trace,
+ .n_errors = SIXRD_N_ERROR,
+ .error_strings = sixrd_error_strings,
+ .n_next_nodes = IP6_SIXRD_N_NEXT,
+ .next_nodes = {
+ [IP6_SIXRD_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [IP6_SIXRD_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/map/map.c b/vnet/vnet/map/map.c
new file mode 100644
index 00000000000..b0cab660876
--- /dev/null
+++ b/vnet/vnet/map/map.c
@@ -0,0 +1,1634 @@
+/*
+ * map.c : MAP support
+ *
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "map.h"
+
+/*
+ * This code supports the following MAP modes:
+ *
+ * Algorithmic Shared IPv4 address (ea_bits_len > 0):
+ * ea_bits_len + ip4_prefix > 32
+ * psid_length > 0, ip6_prefix < 64, ip4_prefix <= 32
+ * Algorithmic Full IPv4 address (ea_bits_len > 0):
+ * ea_bits_len + ip4_prefix = 32
+ * psid_length = 0, ip6_prefix < 64, ip4_prefix <= 32
+ * Algorithmic IPv4 prefix (ea_bits_len > 0):
+ * ea_bits_len + ip4_prefix < 32
+ * psid_length = 0, ip6_prefix < 64, ip4_prefix <= 32
+ *
+ * Independent Shared IPv4 address (ea_bits_len = 0):
+ * ip4_prefix = 32
+ * psid_length > 0
+ * Rule IPv6 address = 128, Rule PSID Set
+ * Independent Full IPv4 address (ea_bits_len = 0):
+ * ip4_prefix = 32
+ * psid_length = 0, ip6_prefix = 128
+ * Independent IPv4 prefix (ea_bits_len = 0):
+ * ip4_prefix < 32
+ * psid_length = 0, ip6_prefix = 128
+ *
+ */
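+/*
+ * Illustrative example of the shared-address algorithmic mode (numbers
+ * follow RFC 7597 Appendix A, not any configuration in this file):
+ * Basic Mapping Rule: ip6-pfx 2001:db8::/40, ip4-pfx 192.0.2.0/24,
+ * ea_bits_len 16, psid_offset 6, psid_length 8.
+ * The 16 EA bits split into 8 IPv4 suffix bits and an 8-bit PSID, so an
+ * end user delegated 2001:db8:0012:3400::/56 (EA bits 0x1234) derives
+ * IPv4 address 192.0.2.18 and PSID 0x34; port 1232 (0x04d0) belongs to
+ * that port set.
+ */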
+
+/*
+ * This code supports MAP-T:
+ *
+ * With DMR prefix length equal to 96.
+ *
+ */
+
+
+i32
+ip4_get_port (ip4_header_t *ip, map_dir_e dir, u16 buffer_len)
+{
+ //TODO: use buffer length
+ if (ip->ip_version_and_header_length != 0x45 ||
+ ip4_get_fragment_offset(ip))
+ return -1;
+
+ if (PREDICT_TRUE((ip->protocol == IP_PROTOCOL_TCP) ||
+ (ip->protocol == IP_PROTOCOL_UDP))) {
+ udp_header_t *udp = (void *)(ip + 1);
+ return (dir == MAP_SENDER) ? udp->src_port : udp->dst_port;
+ } else if (ip->protocol == IP_PROTOCOL_ICMP) {
+ icmp46_header_t *icmp = (void *)(ip + 1);
+ if (icmp->type == ICMP4_echo_request ||
+ icmp->type == ICMP4_echo_reply) {
+ return *((u16 *)(icmp + 1));
+ } else if (clib_net_to_host_u16(ip->length) >= 64) {
+ ip = (ip4_header_t *)(icmp + 2);
+ if (PREDICT_TRUE((ip->protocol == IP_PROTOCOL_TCP) ||
+ (ip->protocol == IP_PROTOCOL_UDP))) {
+ udp_header_t *udp = (void *)(ip + 1);
+ return (dir == MAP_SENDER) ? udp->dst_port : udp->src_port;
+ } else if (ip->protocol == IP_PROTOCOL_ICMP) {
+ icmp46_header_t *icmp = (void *)(ip + 1);
+ if (icmp->type == ICMP4_echo_request ||
+ icmp->type == ICMP4_echo_reply) {
+ return *((u16 *)(icmp + 1));
+ }
+ }
+ }
+ }
+ return -1;
+}
+
+i32
+ip6_get_port (ip6_header_t *ip6, map_dir_e dir, u16 buffer_len)
+{
+ u8 l4_protocol;
+ u16 l4_offset;
+ u16 frag_offset;
+ u8 *l4;
+
+ if (ip6_parse(ip6, buffer_len, &l4_protocol, &l4_offset, &frag_offset))
+ return -1;
+
+ //TODO: Use buffer length
+
+ if (frag_offset &&
+ ip6_frag_hdr_offset(((ip6_frag_hdr_t *)u8_ptr_add(ip6, frag_offset))))
+ return -1; //Can't deal with non-first fragment for now
+
+ l4 = u8_ptr_add(ip6, l4_offset);
+ if (l4_protocol == IP_PROTOCOL_TCP ||
+ l4_protocol == IP_PROTOCOL_UDP) {
+ return (dir == MAP_SENDER) ? ((udp_header_t *)(l4))->src_port : ((udp_header_t *)(l4))->dst_port;
+ } else if (l4_protocol == IP_PROTOCOL_ICMP6) {
+ icmp46_header_t *icmp = (icmp46_header_t *)(l4);
+ if (icmp->type == ICMP6_echo_request) {
+ return (dir == MAP_SENDER) ? ((u16*)(icmp))[2] : -1;
+ } else if (icmp->type == ICMP6_echo_reply) {
+ return (dir == MAP_SENDER) ? -1 : ((u16*)(icmp))[2];
+ }
+ }
+ return -1;
+}
+
+
+int
+map_create_domain (ip4_address_t *ip4_prefix,
+ u8 ip4_prefix_len,
+ ip6_address_t *ip6_prefix,
+ u8 ip6_prefix_len,
+ ip6_address_t *ip6_src,
+ u8 ip6_src_len,
+ u8 ea_bits_len,
+ u8 psid_offset,
+ u8 psid_length,
+ u32 *map_domain_index,
+ u16 mtu,
+ u8 flags)
+{
+ map_main_t *mm = &map_main;
+ ip4_main_t *im4 = &ip4_main;
+ ip6_main_t *im6 = &ip6_main;
+ map_domain_t *d;
+ ip_adjacency_t adj;
+ ip4_add_del_route_args_t args4;
+ ip6_add_del_route_args_t args6;
+ u8 suffix_len;
+ uword *p;
+
+ /* EA bits must be within the first 64 bits */
+ if (ea_bits_len > 0 && (ip6_prefix_len + ea_bits_len) > 64)
+ return -1;
+
+ /* Sanity check on the src prefix length */
+ if (flags & MAP_DOMAIN_TRANSLATION) {
+ if (ip6_src_len != 96) {
+ clib_warning("MAP-T only supports ip6_src_len = 96 for now.");
+ return -1;
+ }
+ } else {
+ if (ip6_src_len != 128) {
+ clib_warning("MAP-E requires a BR address, not a prefix (ip6_src_len should be 128).");
+ return -1;
+ }
+ }
+
+ /* Get domain index */
+ pool_get_aligned(mm->domains, d, CLIB_CACHE_LINE_BYTES);
+ memset(d, 0, sizeof (*d));
+ *map_domain_index = d - mm->domains;
+
+ /* Init domain struct */
+ d->ip4_prefix.as_u32 = ip4_prefix->as_u32;
+ d->ip4_prefix_len = ip4_prefix_len;
+ d->ip6_prefix = *ip6_prefix;
+ d->ip6_prefix_len = ip6_prefix_len;
+ d->ip6_src = *ip6_src;
+ d->ip6_src_len = ip6_src_len;
+ d->ea_bits_len = ea_bits_len;
+ d->psid_offset = psid_offset;
+ d->psid_length = psid_length;
+ d->mtu = mtu;
+ d->flags = flags;
+
+ /* How many, and which bits to grab from the IPv4 DA */
+ if (ip4_prefix_len + ea_bits_len < 32) {
+ d->flags |= MAP_DOMAIN_PREFIX;
+ suffix_len = d->suffix_shift = 32 - ip4_prefix_len - ea_bits_len;
+ } else {
+ d->suffix_shift = 0;
+ suffix_len = 32 - ip4_prefix_len;
+ }
+ d->suffix_mask = (1<<suffix_len) - 1;
+
+ d->psid_shift = 16 - psid_length - psid_offset;
+ d->psid_mask = (1 << d->psid_length) - 1;
+ d->ea_shift = 64 - ip6_prefix_len - suffix_len - d->psid_length;
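+ /* With these fields the PSID of a port (in host byte order) is
+ (port >> psid_shift) & psid_mask; e.g. psid_offset 6 and psid_length 8
+ give psid_shift 2, so port 1232 (0x04d0) yields PSID 0x34. */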
+
+ /* Init IP adjacency */
+ memset(&adj, 0, sizeof(adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = (d->flags & MAP_DOMAIN_TRANSLATION) ? IP_LOOKUP_NEXT_MAP_T : IP_LOOKUP_NEXT_MAP;
+ p = (uword *)&adj.rewrite_data[0];
+ *p = (uword) (*map_domain_index);
+
+ if (ip4_get_route(im4, 0, 0, (u8 *)ip4_prefix, ip4_prefix_len)) {
+ clib_warning("IPv4 route already defined: %U/%d", format_ip4_address, ip4_prefix, ip4_prefix_len);
+ pool_put(mm->domains, d);
+ return -1;
+ }
+
+ /* Create ip4 adjacency */
+ memset(&args4, 0, sizeof(args4));
+ args4.table_index_or_table_id = 0;
+ args4.flags = IP4_ROUTE_FLAG_ADD;
+ args4.dst_address.as_u32 = ip4_prefix->as_u32;
+ args4.dst_address_length = ip4_prefix_len;
+
+ args4.adj_index = ~0;
+ args4.add_adj = &adj;
+ args4.n_add_adj = 1;
+ ip4_add_del_route(im4, &args4);
+
+ /* Multiple MAP domains may share the same source IPv6 TEP */
+ u32 ai = ip6_get_route(im6, 0, 0, ip6_src, ip6_src_len);
+ if (ai > 0) {
+ ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+ ip_adjacency_t *adj6 = ip_get_adjacency(lm6, ai);
+ if (adj6->lookup_next_index != IP_LOOKUP_NEXT_MAP &&
+ adj6->lookup_next_index != IP_LOOKUP_NEXT_MAP_T) {
+ clib_warning("BR source address already assigned: %U", format_ip6_address, ip6_src);
+ pool_put(mm->domains, d);
+ return -1;
+ }
+ /* Shared source */
+ p = (uword *)&adj6->rewrite_data[0];
+ p[0] = ~0;
+
+ /* Add refcount, so we don't accidentally delete the route underneath someone */
+ p[1]++;
+ } else {
+ /* Create ip6 adjacency. */
+ memset(&args6, 0, sizeof(args6));
+ args6.table_index_or_table_id = 0;
+ args6.flags = IP6_ROUTE_FLAG_ADD;
+ args6.dst_address.as_u64[0] = ip6_src->as_u64[0];
+ args6.dst_address.as_u64[1] = ip6_src->as_u64[1];
+ args6.dst_address_length = ip6_src_len;
+ args6.adj_index = ~0;
+ args6.add_adj = &adj;
+ args6.n_add_adj = 1;
+ ip6_add_del_route(im6, &args6);
+ }
+
+ /* Validate packet/byte counters */
+ map_domain_counter_lock(mm);
+ int i;
+ for (i = 0; i < vec_len(mm->simple_domain_counters); i++) {
+ vlib_validate_simple_counter(&mm->simple_domain_counters[i], *map_domain_index);
+ vlib_zero_simple_counter(&mm->simple_domain_counters[i], *map_domain_index);
+ }
+ for (i = 0; i < vec_len(mm->domain_counters); i++) {
+ vlib_validate_combined_counter(&mm->domain_counters[i], *map_domain_index);
+ vlib_zero_combined_counter(&mm->domain_counters[i], *map_domain_index);
+ }
+ map_domain_counter_unlock(mm);
+
+ return 0;
+}
+
+/*
+ * map_delete_domain
+ */
+int
+map_delete_domain (u32 map_domain_index)
+{
+ map_main_t *mm = &map_main;
+ ip4_main_t *im4 = &ip4_main;
+ ip6_main_t *im6 = &ip6_main;
+ map_domain_t *d;
+ ip_adjacency_t adj;
+ ip4_add_del_route_args_t args4;
+ ip6_add_del_route_args_t args6;
+
+ if (pool_is_free_index(mm->domains, map_domain_index)) {
+ clib_warning("MAP domain delete: domain does not exist: %d", map_domain_index);
+ return -1;
+ }
+
+ d = pool_elt_at_index(mm->domains, map_domain_index);
+
+ memset(&adj, 0, sizeof(adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = (d->flags & MAP_DOMAIN_TRANSLATION) ? IP_LOOKUP_NEXT_MAP_T : IP_LOOKUP_NEXT_MAP;
+
+ /* Delete ip4 adjacency */
+ memset(&args4, 0, sizeof(args4));
+ args4.table_index_or_table_id = 0;
+ args4.flags = IP4_ROUTE_FLAG_DEL;
+ args4.dst_address.as_u32 = d->ip4_prefix.as_u32;
+ args4.dst_address_length = d->ip4_prefix_len;
+ args4.adj_index = 0;
+ args4.add_adj = &adj;
+ args4.n_add_adj = 0;
+ ip4_add_del_route(im4, &args4);
+
+ /* Delete ip6 adjacency */
+ u32 ai = ip6_get_route(im6, 0, 0, &d->ip6_src, d->ip6_src_len);
+ if (ai > 0) {
+ ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+ ip_adjacency_t *adj6 = ip_get_adjacency(lm6, ai);
+
+ uword *p = (uword *)&adj6->rewrite_data[0];
+ /* Delete route when no other domains use this source */
+ if (p[1] == 0) {
+ memset(&args6, 0, sizeof (args6));
+ args6.table_index_or_table_id = 0;
+ args6.flags = IP6_ROUTE_FLAG_DEL;
+ args6.dst_address.as_u64[0] = d->ip6_src.as_u64[0];
+ args6.dst_address.as_u64[1] = d->ip6_src.as_u64[1];
+ args6.dst_address_length = d->ip6_src_len;
+ args6.adj_index = 0;
+ args6.add_adj = &adj;
+ args6.n_add_adj = 0;
+ ip6_add_del_route(im6, &args6);
+ }
+ p[1]--;
+ }
+ /* Deleting rules */
+ if (d->rules)
+ clib_mem_free(d->rules);
+
+ pool_put(mm->domains, d);
+
+ return 0;
+}
+
+int
+map_add_del_psid (u32 map_domain_index, u16 psid, ip6_address_t *tep,
+ u8 is_add)
+{
+ map_domain_t *d;
+ map_main_t *mm = &map_main;
+
+ if (pool_is_free_index(mm->domains, map_domain_index)) {
+ clib_warning("MAP rule: domain does not exist: %d", map_domain_index);
+ return -1;
+ }
+ d = pool_elt_at_index(mm->domains, map_domain_index);
+
+ /* Rules are only used in 1:1 independent case */
+ if (d->ea_bits_len > 0)
+ return (-1);
+
+ if (!d->rules) {
+ u32 l = (0x1 << d->psid_length) * sizeof(ip6_address_t);
+ d->rules = clib_mem_alloc_aligned(l, CLIB_CACHE_LINE_BYTES);
+ if (!d->rules) return -1;
+ memset(d->rules, 0, l);
+ }
+
+ if (psid >= (0x1 << d->psid_length)) {
+ clib_warning("MAP rule: PSID outside bounds: %d [%d]", psid, 0x1 << d->psid_length);
+ return -1;
+ }
+
+ if (is_add) {
+ d->rules[psid] = *tep;
+ } else {
+ memset(&d->rules[psid], 0, sizeof(ip6_address_t));
+ }
+ return 0;
+}
+
+#ifdef MAP_SKIP_IP6_LOOKUP
+static void
+map_pre_resolve (ip4_address_t *ip4, ip6_address_t *ip6)
+{
+ map_main_t *mm = &map_main;
+ ip4_main_t *im4 = &ip4_main;
+ ip6_main_t *im6 = &ip6_main;
+
+ if (ip6->as_u64[0] != 0 || ip6->as_u64[1] != 0) {
+ mm->adj6_index = ip6_fib_lookup_with_table(im6, 0, ip6);
+ clib_warning("FIB lookup results in: %u", mm->adj6_index);
+ }
+ if (ip4->as_u32 != 0) {
+ mm->adj4_index = ip4_fib_lookup_with_table(im4, 0, ip4, 0);
+ clib_warning("FIB lookup results in: %u", mm->adj4_index);
+ }
+}
+#endif
+
+static clib_error_t *
+map_security_check_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ map_main_t *mm = &map_main;
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "off"))
+ mm->sec_check = false;
+ else if (unformat(line_input, "on"))
+ mm->sec_check = true;
+ else
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free(line_input);
+ return 0;
+}
+
+static clib_error_t *
+map_security_check_frag_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ map_main_t *mm = &map_main;
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "off"))
+ mm->sec_check_frag = false;
+ else if (unformat(line_input, "on"))
+ mm->sec_check_frag = true;
+ else
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free(line_input);
+ return 0;
+}
+
+static clib_error_t *
+map_add_domain_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ ip4_address_t ip4_prefix;
+ ip6_address_t ip6_prefix;
+ ip6_address_t ip6_src;
+ u32 ip6_prefix_len, ip4_prefix_len, map_domain_index, ip6_src_len;
+ u32 num_m_args = 0;
+ /* Optional arguments */
+ u32 ea_bits_len = 0, psid_offset = 0, psid_length = 0;
+ u32 mtu = 0;
+ u8 flags = 0;
+ ip6_src_len = 128;
+
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "ip4-pfx %U/%d", unformat_ip4_address, &ip4_prefix, &ip4_prefix_len))
+ num_m_args++;
+ else if (unformat(line_input, "ip6-pfx %U/%d", unformat_ip6_address, &ip6_prefix, &ip6_prefix_len))
+ num_m_args++;
+ else if (unformat(line_input, "ip6-src %U/%d", unformat_ip6_address, &ip6_src, &ip6_src_len))
+ num_m_args++;
+ else if (unformat(line_input, "ip6-src %U", unformat_ip6_address, &ip6_src))
+ num_m_args++;
+ else if (unformat(line_input, "ea-bits-len %d", &ea_bits_len))
+ num_m_args++;
+ else if (unformat(line_input, "psid-offset %d", &psid_offset))
+ num_m_args++;
+ else if (unformat(line_input, "psid-len %d", &psid_length))
+ num_m_args++;
+ else if (unformat(line_input, "mtu %d", &mtu))
+ num_m_args++;
+ else if (unformat(line_input, "map-t"))
+ flags |= MAP_DOMAIN_TRANSLATION;
+ else
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free(line_input);
+
+ if (num_m_args < 3)
+ return clib_error_return(0, "mandatory argument(s) missing");
+
+ map_create_domain(&ip4_prefix, ip4_prefix_len,
+ &ip6_prefix, ip6_prefix_len, &ip6_src, ip6_src_len,
+ ea_bits_len, psid_offset, psid_length, &map_domain_index,
+ mtu, flags);
+
+ return 0;
+}
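+/*
+ * Example invocation (assuming the command is registered as
+ * "map add domain"; parameters mirror the unformat strings above):
+ * map add domain ip4-pfx 192.0.2.0/24 ip6-pfx 2001:db8::/40
+ * ip6-src 2001:db8:ffff::1 ea-bits-len 16 psid-offset 6 psid-len 8
+ */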
+
+static clib_error_t *
+map_del_domain_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 num_m_args = 0;
+ u32 map_domain_index;
+
+ /* Get a line of input. */
+ if (! unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "index %d", &map_domain_index))
+ num_m_args++;
+ else
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free(line_input);
+
+ if (num_m_args != 1)
+ return clib_error_return(0, "mandatory argument(s) missing");
+
+ map_delete_domain(map_domain_index);
+
+ return 0;
+}
+
+static clib_error_t *
+map_add_rule_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ ip6_address_t tep;
+ u32 num_m_args = 0;
+ u32 psid, map_domain_index;
+
+ /* Get a line of input. */
+ if (! unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "index %d", &map_domain_index))
+ num_m_args++;
+ else if (unformat(line_input, "psid %d", &psid))
+ num_m_args++;
+ else if (unformat(line_input, "ip6-dst %U", unformat_ip6_address, &tep))
+ num_m_args++;
+ else
+      return clib_error_return(0, "unknown input `%U'",
+                               format_unformat_error, line_input);
+ }
+ unformat_free(line_input);
+
+ if (num_m_args != 3)
+ return clib_error_return(0, "mandatory argument(s) missing");
+
+ if (map_add_del_psid(map_domain_index, psid, &tep, 1) != 0) {
+ return clib_error_return(0, "Failing to add Mapping Rule");
+ }
+ return 0;
+}
+
+#if MAP_SKIP_IP6_LOOKUP
+static clib_error_t *
+map_pre_resolve_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ ip4_address_t ip4nh;
+ ip6_address_t ip6nh;
+ map_main_t *mm = &map_main;
+
+ memset(&ip4nh, 0, sizeof(ip4nh));
+ memset(&ip6nh, 0, sizeof(ip6nh));
+
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "ip4-nh %U", unformat_ip4_address, &ip4nh))
+ mm->preresolve_ip4 = ip4nh;
+ else if (unformat(line_input, "ip6-nh %U", unformat_ip6_address, &ip6nh))
+ mm->preresolve_ip6 = ip6nh;
+ else
+      return clib_error_return(0, "unknown input `%U'",
+                               format_unformat_error, line_input);
+ }
+ unformat_free(line_input);
+
+ map_pre_resolve(&ip4nh, &ip6nh);
+
+ return 0;
+}
+#endif
+
+static clib_error_t *
+map_icmp_relay_source_address_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ ip4_address_t icmp_src_address;
+ map_main_t *mm = &map_main;
+
+ memset(&icmp_src_address, 0, sizeof(icmp_src_address));
+
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "%U", unformat_ip4_address, &icmp_src_address))
+ mm->icmp_src_address = icmp_src_address;
+ else
+      return clib_error_return(0, "unknown input `%U'",
+                               format_unformat_error, line_input);
+ }
+ unformat_free(line_input);
+
+ return 0;
+}
+
+static clib_error_t *
+map_traffic_class_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ map_main_t *mm = &map_main;
+ u32 tc = 0;
+
+ mm->tc_copy = false;
+
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "copy"))
+ mm->tc_copy = true;
+ else if (unformat(line_input, "%x", &tc))
+ mm->tc = tc & 0xff;
+ else
+      return clib_error_return(0, "unknown input `%U'",
+                               format_unformat_error, line_input);
+ }
+ unformat_free(line_input);
+
+ return 0;
+}
+
+static u8 *
+format_map_domain (u8 *s, va_list *args)
+{
+ map_domain_t *d = va_arg(*args, map_domain_t *);
+ bool counters = va_arg(*args, int);
+ map_main_t *mm = &map_main;
+ ip6_address_t ip6_prefix;
+
+ if (d->rules)
+ memset(&ip6_prefix, 0, sizeof(ip6_prefix));
+ else
+ ip6_prefix = d->ip6_prefix;
+
+ s = format(s,
+ "[%d] ip4-pfx %U/%d ip6-pfx %U/%d ip6-src %U/%d ea_bits_len %d psid-offset %d psid-len %d mtu %d %s",
+ d - mm->domains,
+ format_ip4_address, &d->ip4_prefix, d->ip4_prefix_len,
+ format_ip6_address, &ip6_prefix, d->ip6_prefix_len,
+ format_ip6_address, &d->ip6_src, d->ip6_src_len,
+ d->ea_bits_len, d->psid_offset, d->psid_length, d->mtu,
+ (d->flags & MAP_DOMAIN_TRANSLATION) ? "map-t" : "");
+
+ if (counters) {
+ map_domain_counter_lock(mm);
+ vlib_counter_t v;
+ vlib_get_combined_counter(&mm->domain_counters[MAP_DOMAIN_COUNTER_TX], d - mm->domains, &v);
+ s = format(s, " TX: %d/%d", v.packets, v.bytes);
+ vlib_get_combined_counter(&mm->domain_counters[MAP_DOMAIN_COUNTER_RX], d - mm->domains, &v);
+ s = format(s, " RX: %d/%d", v.packets, v.bytes);
+ map_domain_counter_unlock(mm);
+ }
+
+ if (d->rules) {
+ int i;
+ ip6_address_t dst;
+ for (i = 0; i < (0x1 << d->psid_length); i++) {
+ dst = d->rules[i];
+ if (dst.as_u64[0] == 0 && dst.as_u64[1] == 0 )
+ continue;
+ s = format(s,
+ " rule psid: %d ip6-dst %U\n", i, format_ip6_address, &dst);
+ }
+ }
+ return s;
+}
+
+static u8 *
+format_map_ip4_reass (u8 *s, va_list *args)
+{
+ map_main_t *mm = &map_main;
+ map_ip4_reass_t *r = va_arg(*args, map_ip4_reass_t *);
+ map_ip4_reass_key_t *k = &r->key;
+ f64 now = vlib_time_now(mm->vlib_main);
+ f64 lifetime = (((f64)mm->ip4_reass_conf_lifetime_ms) / 1000);
+ f64 dt = (r->ts + lifetime > now) ? (r->ts + lifetime - now) : -1;
+ s = format(s,
+ "ip4-reass src=%U dst=%U protocol=%d identifier=%d port=%d lifetime=%.3lf\n",
+ format_ip4_address, &k->src.as_u8, format_ip4_address, &k->dst.as_u8,
+ k->protocol, clib_net_to_host_u16(k->fragment_id), (r->port >= 0)?clib_net_to_host_u16(r->port):-1, dt);
+ return s;
+}
+
+static u8 *
+format_map_ip6_reass (u8 *s, va_list *args)
+{
+ map_main_t *mm = &map_main;
+ map_ip6_reass_t *r = va_arg(*args, map_ip6_reass_t *);
+ map_ip6_reass_key_t *k = &r->key;
+ f64 now = vlib_time_now(mm->vlib_main);
+ f64 lifetime = (((f64)mm->ip6_reass_conf_lifetime_ms) / 1000);
+ f64 dt = (r->ts + lifetime > now) ? (r->ts + lifetime - now) : -1;
+ s = format(s,
+ "ip6-reass src=%U dst=%U protocol=%d identifier=%d lifetime=%.3lf\n",
+ format_ip6_address, &k->src.as_u8, format_ip6_address, &k->dst.as_u8,
+ k->protocol, clib_net_to_host_u32(k->fragment_id), dt);
+ return s;
+}
+
+static clib_error_t *
+show_map_domain_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ map_main_t *mm = &map_main;
+ map_domain_t *d;
+ bool counters = false;
+ u32 map_domain_index = ~0;
+
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "counters"))
+ counters = true;
+ else if (unformat(line_input, "index %d", &map_domain_index))
+ ;
+ else
+      return clib_error_return(0, "unknown input `%U'",
+                               format_unformat_error, line_input);
+ }
+ unformat_free(line_input);
+
+ if (pool_elts(mm->domains) == 0)
+ vlib_cli_output(vm, "No MAP domains are configured...");
+
+ if (map_domain_index == ~0) {
+ pool_foreach(d, mm->domains, ({vlib_cli_output(vm, "%U", format_map_domain, d, counters);}));
+ } else {
+ if (pool_is_free_index(mm->domains, map_domain_index)) {
+ return clib_error_return(0, "MAP domain does not exists %d", map_domain_index);
+ }
+
+ d = pool_elt_at_index(mm->domains, map_domain_index);
+ vlib_cli_output(vm, "%U", format_map_domain, d, counters);
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+show_map_fragments_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
+{
+ map_main_t *mm = &map_main;
+ map_ip4_reass_t *f4;
+ map_ip6_reass_t *f6;
+
+ pool_foreach(f4, mm->ip4_reass_pool, ({vlib_cli_output (vm, "%U", format_map_ip4_reass, f4);}));
+ pool_foreach(f6, mm->ip6_reass_pool, ({vlib_cli_output (vm, "%U", format_map_ip6_reass, f6);}));
+ return (0);
+}
+
+u64
+map_error_counter_get (u32 node_index, map_error_t map_error)
+{
+ vlib_main_t *vm = vlib_get_main();
+ vlib_node_runtime_t *error_node = vlib_node_get_runtime(vm, node_index);
+ vlib_error_main_t *em = &vm->error_main;
+ vlib_error_t e = error_node->errors[map_error];
+ vlib_node_t *n = vlib_get_node(vm, node_index);
+ u32 ci;
+
+ ci = vlib_error_get_code(e);
+ ASSERT (ci < n->n_errors);
+ ci += n->error_heap_index;
+
+ return (em->counters[ci]);
+}
+
+static clib_error_t *
+show_map_stats_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
+{
+ map_main_t *mm = &map_main;
+ map_domain_t *d;
+ int domains = 0, rules = 0, domaincount = 0, rulecount = 0;
+ if (pool_elts (mm->domains) == 0)
+ vlib_cli_output(vm, "No MAP domains are configured...");
+
+ pool_foreach(d, mm->domains, ({
+ if (d->rules) {
+      rulecount += 0x1 << d->psid_length;
+      rules += sizeof(ip6_address_t) * (0x1 << d->psid_length);
+ }
+ domains += sizeof(*d);
+ domaincount++;
+ }));
+
+ vlib_cli_output(vm, "MAP domains structure: %d\n", sizeof (map_domain_t));
+ vlib_cli_output(vm, "MAP domains: %d (%d bytes)\n", domaincount, domains);
+ vlib_cli_output(vm, "MAP rules: %d (%d bytes)\n", rulecount, rules);
+ vlib_cli_output(vm, "Total: %d bytes)\n", rules + domains);
+
+#if MAP_SKIP_IP6_LOOKUP
+ vlib_cli_output(vm, "MAP pre-resolve: IP6 next-hop: %U (%u), IP4 next-hop: %U (%u)\n",
+ format_ip6_address, &mm->preresolve_ip6, mm->adj6_index,
+ format_ip4_address, &mm->preresolve_ip4, mm->adj4_index);
+#endif
+
+ if (mm->tc_copy)
+ vlib_cli_output(vm, "MAP traffic-class: copy");
+ else
+ vlib_cli_output(vm, "MAP traffic-class: %x", mm->tc);
+
+ vlib_cli_output(vm, "MAP IPv6 inbound security check: %s Fragments: %s", mm->sec_check ? "enabled" : "disabled",
+ mm->sec_check_frag ? "enabled" : "disabled");
+
+
+ /*
+ * Counters
+ */
+ vlib_combined_counter_main_t *cm = mm->domain_counters;
+ u64 total_pkts[MAP_N_DOMAIN_COUNTER];
+ u64 total_bytes[MAP_N_DOMAIN_COUNTER];
+ int which, i;
+ vlib_counter_t v;
+
+ memset (total_pkts, 0, sizeof (total_pkts));
+ memset (total_bytes, 0, sizeof (total_bytes));
+
+ map_domain_counter_lock (mm);
+ vec_foreach (cm, mm->domain_counters) {
+ which = cm - mm->domain_counters;
+
+ for (i = 0; i < vec_len (cm->maxi); i++) {
+ vlib_get_combined_counter (cm, i, &v);
+ total_pkts[which] += v.packets;
+ total_bytes[which] += v.bytes;
+ }
+ }
+ map_domain_counter_unlock (mm);
+
+ vlib_cli_output(vm, "Encapsulated packets: %d bytes: %d\n", total_pkts[MAP_DOMAIN_COUNTER_TX],
+ total_bytes[MAP_DOMAIN_COUNTER_TX]);
+ vlib_cli_output(vm, "Decapsulated packets: %d bytes: %d\n", total_pkts[MAP_DOMAIN_COUNTER_RX],
+ total_bytes[MAP_DOMAIN_COUNTER_RX]);
+
+ vlib_cli_output(vm, "ICMP relayed packets: %d\n", vlib_get_simple_counter(&mm->icmp_relayed, 0));
+
+ return 0;
+}
+
+static clib_error_t *
+map_params_reass_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 lifetime = ~0;
+ f64 ht_ratio = (MAP_IP4_REASS_CONF_HT_RATIO_MAX+1);
+ u32 pool_size = ~0;
+ u64 buffers = ~(0ull);
+ u8 ip4 = 0, ip6 = 0;
+
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (!unformat(line_input, "lifetime %u", &lifetime) &&
+ !unformat(line_input, "ht-ratio %lf", &ht_ratio) &&
+ !unformat(line_input, "pool-size %u", &pool_size) &&
+ !unformat(line_input, "buffers %llu", &buffers) &&
+ !((unformat(line_input, "ip4")) && (ip4 = 1)) &&
+ !((unformat(line_input, "ip6")) && (ip6 = 1))) {
+ unformat_free(line_input);
+ return clib_error_return(0, "invalid input");
+ }
+ }
+ unformat_free(line_input);
+
+ if (!ip4 && !ip6)
+ return clib_error_return(0, "must specify ip4 and/or ip6");
+
+ if (ip4) {
+ if (pool_size != ~0 && pool_size > MAP_IP4_REASS_CONF_POOL_SIZE_MAX)
+ return clib_error_return(0, "invalid ip4-reass pool-size ( > %d)", MAP_IP4_REASS_CONF_POOL_SIZE_MAX);
+ if (ht_ratio != (MAP_IP4_REASS_CONF_HT_RATIO_MAX+1) && ht_ratio > MAP_IP4_REASS_CONF_HT_RATIO_MAX)
+ return clib_error_return(0, "invalid ip4-reass ht-ratio ( > %d)", MAP_IP4_REASS_CONF_HT_RATIO_MAX);
+ if (lifetime != ~0 && lifetime > MAP_IP4_REASS_CONF_LIFETIME_MAX)
+ return clib_error_return(0, "invalid ip4-reass lifetime ( > %d)", MAP_IP4_REASS_CONF_LIFETIME_MAX);
+ if (buffers != ~(0ull) && buffers > MAP_IP4_REASS_CONF_BUFFERS_MAX)
+ return clib_error_return(0, "invalid ip4-reass buffers ( > %ld)", MAP_IP4_REASS_CONF_BUFFERS_MAX);
+ }
+
+ if (ip6) {
+ if (pool_size != ~0 && pool_size > MAP_IP6_REASS_CONF_POOL_SIZE_MAX)
+ return clib_error_return(0, "invalid ip6-reass pool-size ( > %d)", MAP_IP6_REASS_CONF_POOL_SIZE_MAX);
+ if (ht_ratio != (MAP_IP4_REASS_CONF_HT_RATIO_MAX+1) && ht_ratio > MAP_IP6_REASS_CONF_HT_RATIO_MAX)
+ return clib_error_return(0, "invalid ip6-reass ht-log2len ( > %d)", MAP_IP6_REASS_CONF_HT_RATIO_MAX);
+ if (lifetime != ~0 && lifetime > MAP_IP6_REASS_CONF_LIFETIME_MAX)
+ return clib_error_return(0, "invalid ip6-reass lifetime ( > %d)", MAP_IP6_REASS_CONF_LIFETIME_MAX);
+ if (buffers != ~(0ull) && buffers > MAP_IP6_REASS_CONF_BUFFERS_MAX)
+ return clib_error_return(0, "invalid ip6-reass buffers ( > %ld)", MAP_IP6_REASS_CONF_BUFFERS_MAX);
+ }
+
+ if (ip4) {
+ u32 reass = 0, packets = 0;
+ if (pool_size != ~0) {
+ if (map_ip4_reass_conf_pool_size(pool_size, &reass, &packets)) {
+ vlib_cli_output(vm, "Could not set ip4-reass pool-size");
+ } else {
+ vlib_cli_output(vm, "Setting ip4-reass pool-size (destroyed-reassembly=%u , dropped-fragments=%u)", reass, packets);
+ }
+ }
+ if (ht_ratio != (MAP_IP4_REASS_CONF_HT_RATIO_MAX+1)) {
+ if (map_ip4_reass_conf_ht_ratio(ht_ratio, &reass, &packets)) {
+ vlib_cli_output(vm, "Could not set ip4-reass ht-log2len");
+ } else {
+ vlib_cli_output(vm, "Setting ip4-reass ht-log2len (destroyed-reassembly=%u , dropped-fragments=%u)", reass, packets);
+ }
+ }
+ if (lifetime != ~0) {
+ if (map_ip4_reass_conf_lifetime(lifetime))
+ vlib_cli_output(vm, "Could not set ip4-reass lifetime");
+ else
+ vlib_cli_output(vm, "Setting ip4-reass lifetime");
+ }
+ if (buffers != ~(0ull)) {
+ if (map_ip4_reass_conf_buffers(buffers))
+ vlib_cli_output(vm, "Could not set ip4-reass buffers");
+ else
+ vlib_cli_output(vm, "Setting ip4-reass buffers");
+ }
+
+ if (map_main.ip4_reass_conf_buffers >
+ map_main.ip4_reass_conf_pool_size * MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY) {
+ vlib_cli_output(vm, "Note: 'ip4-reass buffers' > pool-size * max-fragments-per-reassembly.");
+ }
+ }
+
+ if (ip6) {
+ u32 reass = 0, packets = 0;
+ if (pool_size != ~0) {
+ if (map_ip6_reass_conf_pool_size(pool_size, &reass, &packets)) {
+ vlib_cli_output(vm, "Could not set ip6-reass pool-size");
+ } else {
+ vlib_cli_output(vm, "Setting ip6-reass pool-size (destroyed-reassembly=%u , dropped-fragments=%u)", reass, packets);
+ }
+ }
+ if (ht_ratio != (MAP_IP4_REASS_CONF_HT_RATIO_MAX+1)) {
+ if (map_ip6_reass_conf_ht_ratio(ht_ratio, &reass, &packets)) {
+ vlib_cli_output(vm, "Could not set ip6-reass ht-log2len");
+ } else {
+ vlib_cli_output(vm, "Setting ip6-reass ht-log2len (destroyed-reassembly=%u , dropped-fragments=%u)", reass, packets);
+ }
+ }
+ if (lifetime != ~0) {
+ if (map_ip6_reass_conf_lifetime(lifetime))
+ vlib_cli_output(vm, "Could not set ip6-reass lifetime");
+ else
+ vlib_cli_output(vm, "Setting ip6-reass lifetime");
+ }
+ if (buffers != ~(0ull)) {
+ if (map_ip6_reass_conf_buffers(buffers))
+ vlib_cli_output(vm, "Could not set ip6-reass buffers");
+ else
+ vlib_cli_output(vm, "Setting ip6-reass buffers");
+ }
+
+ if (map_main.ip6_reass_conf_buffers >
+ map_main.ip6_reass_conf_pool_size * MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY) {
+ vlib_cli_output(vm, "Note: 'ip6-reass buffers' > pool-size * max-fragments-per-reassembly.");
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * packet trace format function
+ */
+u8 *
+format_map_trace (u8 *s, va_list *args)
+{
+ CLIB_UNUSED(vlib_main_t *vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED(vlib_node_t *node) = va_arg (*args, vlib_node_t *);
+ map_trace_t *t = va_arg (*args, map_trace_t *);
+ u32 map_domain_index = t->map_domain_index;
+ u16 port = t->port;
+
+ s = format(s, "MAP domain index: %d L4 port: %u", map_domain_index, clib_net_to_host_u16(port));
+
+ return s;
+}
+
+static_always_inline map_ip4_reass_t *
+map_ip4_reass_lookup(map_ip4_reass_key_t *k, u32 bucket, f64 now)
+{
+ map_main_t *mm = &map_main;
+ u32 ri = mm->ip4_reass_hash_table[bucket];
+ while(ri != MAP_REASS_INDEX_NONE) {
+ map_ip4_reass_t * r = pool_elt_at_index(mm->ip4_reass_pool, ri);
+ if (r->key.as_u64[0] == k->as_u64[0] &&
+ r->key.as_u64[1] == k->as_u64[1] &&
+ now < r->ts + (((f64)mm->ip4_reass_conf_lifetime_ms) / 1000)) {
+ return r;
+ }
+ ri = r->bucket_next;
+ }
+ return NULL;
+}
+
+#define map_ip4_reass_pool_index(r) (r - map_main.ip4_reass_pool)
+
+void
+map_ip4_reass_free(map_ip4_reass_t *r, u32 **pi_to_drop)
+{
+ map_main_t *mm = &map_main;
+ map_ip4_reass_get_fragments(r, pi_to_drop);
+
+ // Unlink in hash bucket
+ map_ip4_reass_t *r2 = NULL;
+ u32 r2i = mm->ip4_reass_hash_table[r->bucket];
+ while (r2i != map_ip4_reass_pool_index(r)) {
+ ASSERT(r2i != MAP_REASS_INDEX_NONE);
+ r2 = pool_elt_at_index(mm->ip4_reass_pool, r2i);
+ r2i = r2->bucket_next;
+ }
+ if (r2) {
+ r2->bucket_next = r->bucket_next;
+ } else {
+ mm->ip4_reass_hash_table[r->bucket] = r->bucket_next;
+ }
+
+ // Unlink in list
+ if (r->fifo_next == map_ip4_reass_pool_index(r)) {
+ mm->ip4_reass_fifo_last = MAP_REASS_INDEX_NONE;
+ } else {
+ if(mm->ip4_reass_fifo_last == map_ip4_reass_pool_index(r))
+ mm->ip4_reass_fifo_last = r->fifo_prev;
+ pool_elt_at_index(mm->ip4_reass_pool, r->fifo_prev)->fifo_next = r->fifo_next;
+ pool_elt_at_index(mm->ip4_reass_pool, r->fifo_next)->fifo_prev = r->fifo_prev;
+ }
+
+ pool_put(mm->ip4_reass_pool, r);
+ mm->ip4_reass_allocated--;
+}
+
+map_ip4_reass_t *
+map_ip4_reass_get(u32 src, u32 dst, u16 fragment_id,
+ u8 protocol, u32 **pi_to_drop)
+{
+ map_ip4_reass_t * r;
+ map_main_t *mm = &map_main;
+ map_ip4_reass_key_t k = {.src.data_u32 = src,
+ .dst.data_u32 = dst,
+ .fragment_id = fragment_id,
+ .protocol = protocol };
+
+ u32 h = 0;
+ h = crc_u32(k.as_u32[0], h);
+ h = crc_u32(k.as_u32[1], h);
+ h = crc_u32(k.as_u32[2], h);
+ h = crc_u32(k.as_u32[3], h);
+ h = h >> (32 - mm->ip4_reass_ht_log2len);
+
+ f64 now = vlib_time_now(mm->vlib_main);
+
+ //Cache garbage collection
+ while (mm->ip4_reass_fifo_last != MAP_REASS_INDEX_NONE) {
+ map_ip4_reass_t *last = pool_elt_at_index(mm->ip4_reass_pool, mm->ip4_reass_fifo_last);
+ if (last->ts + (((f64)mm->ip4_reass_conf_lifetime_ms) / 1000) < now)
+ map_ip4_reass_free(last, pi_to_drop);
+ else
+ break;
+ }
+
+ if ((r = map_ip4_reass_lookup(&k, h, now)))
+ return r;
+
+ if (mm->ip4_reass_allocated >= mm->ip4_reass_conf_pool_size)
+ return NULL;
+
+ pool_get(mm->ip4_reass_pool, r);
+ mm->ip4_reass_allocated++;
+ int i;
+ for (i=0; i<MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
+ r->fragments[i] = ~0;
+
+ u32 ri = map_ip4_reass_pool_index(r);
+
+ //Link in new bucket
+ r->bucket = h;
+ r->bucket_next = mm->ip4_reass_hash_table[h];
+ mm->ip4_reass_hash_table[h] = ri;
+
+ //Link in fifo
+ if(mm->ip4_reass_fifo_last != MAP_REASS_INDEX_NONE) {
+ r->fifo_next = pool_elt_at_index(mm->ip4_reass_pool, mm->ip4_reass_fifo_last)->fifo_next;
+ r->fifo_prev = mm->ip4_reass_fifo_last;
+ pool_elt_at_index(mm->ip4_reass_pool, r->fifo_prev)->fifo_next = ri;
+ pool_elt_at_index(mm->ip4_reass_pool, r->fifo_next)->fifo_prev = ri;
+ } else {
+ r->fifo_next = r->fifo_prev = ri;
+ mm->ip4_reass_fifo_last = ri;
+ }
+
+ //Set other fields
+ r->ts = now;
+ r->key = k;
+ r->port = -1;
+#ifdef MAP_IP4_REASS_COUNT_BYTES
+ r->expected_total = 0xffff;
+ r->forwarded = 0;
+#endif
+
+ return r;
+}
+
+int
+map_ip4_reass_add_fragment(map_ip4_reass_t *r, u32 pi)
+{
+ if (map_main.ip4_reass_buffered_counter >= map_main.ip4_reass_conf_buffers)
+ return -1;
+
+ int i;
+ for (i=0; i<MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
+ if(r->fragments[i] == ~0) {
+ r->fragments[i] = pi;
+ map_main.ip4_reass_buffered_counter++;
+ return 0;
+ }
+ return -1;
+}
+
+static_always_inline map_ip6_reass_t *
+map_ip6_reass_lookup(map_ip6_reass_key_t *k, u32 bucket, f64 now)
+{
+ map_main_t *mm = &map_main;
+ u32 ri = mm->ip6_reass_hash_table[bucket];
+ while(ri != MAP_REASS_INDEX_NONE) {
+ map_ip6_reass_t * r = pool_elt_at_index(mm->ip6_reass_pool, ri);
+ if(now < r->ts + (((f64)mm->ip6_reass_conf_lifetime_ms) / 1000) &&
+ r->key.as_u64[0] == k->as_u64[0] &&
+ r->key.as_u64[1] == k->as_u64[1] &&
+ r->key.as_u64[2] == k->as_u64[2] &&
+ r->key.as_u64[3] == k->as_u64[3] &&
+ r->key.as_u64[4] == k->as_u64[4])
+ return r;
+ ri = r->bucket_next;
+ }
+ return NULL;
+}
+
+#define map_ip6_reass_pool_index(r) (r - map_main.ip6_reass_pool)
+
+void
+map_ip6_reass_free(map_ip6_reass_t *r, u32 **pi_to_drop)
+{
+ map_main_t *mm = &map_main;
+ int i;
+ for (i=0; i<MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
+ if(r->fragments[i].pi != ~0) {
+ vec_add1(*pi_to_drop, r->fragments[i].pi);
+ r->fragments[i].pi = ~0;
+ map_main.ip6_reass_buffered_counter--;
+ }
+
+ // Unlink in hash bucket
+ map_ip6_reass_t *r2 = NULL;
+ u32 r2i = mm->ip6_reass_hash_table[r->bucket];
+ while (r2i != map_ip6_reass_pool_index(r)) {
+ ASSERT(r2i != MAP_REASS_INDEX_NONE);
+ r2 = pool_elt_at_index(mm->ip6_reass_pool, r2i);
+ r2i = r2->bucket_next;
+ }
+ if (r2) {
+ r2->bucket_next = r->bucket_next;
+ } else {
+ mm->ip6_reass_hash_table[r->bucket] = r->bucket_next;
+ }
+
+ // Unlink in list
+ if (r->fifo_next == map_ip6_reass_pool_index(r)) {
+ //Single element in the list, list is now empty
+ mm->ip6_reass_fifo_last = MAP_REASS_INDEX_NONE;
+ } else {
+ if (mm->ip6_reass_fifo_last == map_ip6_reass_pool_index(r)) //First element
+ mm->ip6_reass_fifo_last = r->fifo_prev;
+ pool_elt_at_index(mm->ip6_reass_pool, r->fifo_prev)->fifo_next = r->fifo_next;
+ pool_elt_at_index(mm->ip6_reass_pool, r->fifo_next)->fifo_prev = r->fifo_prev;
+ }
+
+  // Return the reassembly structure to the pool
+ pool_put(mm->ip6_reass_pool, r);
+ mm->ip6_reass_allocated--;
+}
+
+map_ip6_reass_t *
+map_ip6_reass_get(ip6_address_t *src, ip6_address_t *dst, u32 fragment_id,
+ u8 protocol, u32 **pi_to_drop)
+{
+ map_ip6_reass_t * r;
+ map_main_t *mm = &map_main;
+ map_ip6_reass_key_t k = {
+ .src = *src,
+ .dst = *dst,
+ .fragment_id = fragment_id,
+ .protocol = protocol };
+
+ u32 h = 0;
+ int i;
+ for (i=0; i<10; i++)
+ h = crc_u32(k.as_u32[i], h);
+ h = h >> (32 - mm->ip6_reass_ht_log2len);
+
+ f64 now = vlib_time_now(mm->vlib_main);
+
+ //Cache garbage collection
+ while (mm->ip6_reass_fifo_last != MAP_REASS_INDEX_NONE) {
+ map_ip6_reass_t *last = pool_elt_at_index(mm->ip6_reass_pool, mm->ip6_reass_fifo_last);
+ if (last->ts + (((f64)mm->ip6_reass_conf_lifetime_ms) / 1000) < now)
+ map_ip6_reass_free(last, pi_to_drop);
+ else
+ break;
+ }
+
+ if ((r = map_ip6_reass_lookup(&k, h, now)))
+ return r;
+
+ if (mm->ip6_reass_allocated >= mm->ip6_reass_conf_pool_size)
+ return NULL;
+
+ pool_get(mm->ip6_reass_pool, r);
+ mm->ip6_reass_allocated++;
+ for (i=0; i<MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++) {
+ r->fragments[i].pi = ~0;
+ r->fragments[i].next_data_len = 0;
+ r->fragments[i].next_data_offset = 0;
+ }
+
+ u32 ri = map_ip6_reass_pool_index(r);
+
+ //Link in new bucket
+ r->bucket = h;
+ r->bucket_next = mm->ip6_reass_hash_table[h];
+ mm->ip6_reass_hash_table[h] = ri;
+
+ //Link in fifo
+ if(mm->ip6_reass_fifo_last != MAP_REASS_INDEX_NONE) {
+ r->fifo_next = pool_elt_at_index(mm->ip6_reass_pool, mm->ip6_reass_fifo_last)->fifo_next;
+ r->fifo_prev = mm->ip6_reass_fifo_last;
+ pool_elt_at_index(mm->ip6_reass_pool, r->fifo_prev)->fifo_next = ri;
+ pool_elt_at_index(mm->ip6_reass_pool, r->fifo_next)->fifo_prev = ri;
+ } else {
+ r->fifo_next = r->fifo_prev = ri;
+ mm->ip6_reass_fifo_last = ri;
+ }
+
+ //Set other fields
+ r->ts = now;
+ r->key = k;
+ r->ip4_header.ip_version_and_header_length = 0;
+#ifdef MAP_IP6_REASS_COUNT_BYTES
+ r->expected_total = 0xffff;
+ r->forwarded = 0;
+#endif
+ return r;
+}
+
+int
+map_ip6_reass_add_fragment(map_ip6_reass_t *r, u32 pi,
+ u16 data_offset, u16 next_data_offset,
+ u8 *data_start, u16 data_len)
+{
+ map_ip6_fragment_t *f = NULL, *prev_f = NULL;
+ u16 copied_len = (data_len > 20) ? 20 : data_len;
+
+ if (map_main.ip6_reass_buffered_counter >= map_main.ip6_reass_conf_buffers)
+ return -1;
+
+  //Look up the fragment slot for the current buffer
+  //and the slot for the fragment that precedes it
+ int i;
+ for (i=0; i<MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++) {
+ if (data_offset && r->fragments[i].next_data_offset == data_offset) {
+ prev_f = &r->fragments[i]; // This is buffer for previous packet
+ } else if (r->fragments[i].next_data_offset == next_data_offset) {
+ f = &r->fragments[i]; // This is a buffer for the current packet
+ } else if (r->fragments[i].next_data_offset == 0) { //Available
+ if (f == NULL)
+ f = &r->fragments[i];
+ else if (prev_f == NULL)
+ prev_f = &r->fragments[i];
+ }
+ }
+
+ if (!f || f->pi != ~0)
+ return -1;
+
+ if (data_offset) {
+ if (!prev_f)
+ return -1;
+
+ memcpy(prev_f->next_data, data_start, copied_len);
+ prev_f->next_data_len = copied_len;
+ prev_f->next_data_offset = data_offset;
+ } else {
+ if (((ip4_header_t *)data_start)->ip_version_and_header_length != 0x45)
+ return -1;
+
+ if (r->ip4_header.ip_version_and_header_length == 0)
+ memcpy(&r->ip4_header, data_start, sizeof(ip4_header_t));
+ }
+
+ if(data_len > 20) {
+ f->next_data_offset = next_data_offset;
+ f->pi = pi;
+ map_main.ip6_reass_buffered_counter++;
+ }
+ return 0;
+}
+
+void map_ip4_reass_reinit(u32 *trashed_reass, u32 *dropped_packets)
+{
+ map_main_t *mm = &map_main;
+ int i;
+
+ if(dropped_packets)
+ *dropped_packets = mm->ip4_reass_buffered_counter;
+ if(trashed_reass)
+ *trashed_reass = mm->ip4_reass_allocated;
+ if (mm->ip4_reass_fifo_last != MAP_REASS_INDEX_NONE) {
+ u16 ri = mm->ip4_reass_fifo_last;
+ do {
+ map_ip4_reass_t *r = pool_elt_at_index(mm->ip4_reass_pool, ri);
+ for (i=0; i<MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
+ if (r->fragments[i] != ~0)
+ map_ip4_drop_pi(r->fragments[i]);
+
+ ri = r->fifo_next;
+ pool_put(mm->ip4_reass_pool, r);
+ } while (ri != mm->ip4_reass_fifo_last);
+ }
+
+ vec_free(mm->ip4_reass_hash_table);
+ vec_resize(mm->ip4_reass_hash_table, 1 << mm->ip4_reass_ht_log2len);
+ for (i=0; i<(1 << mm->ip4_reass_ht_log2len); i++)
+ mm->ip4_reass_hash_table[i] = MAP_REASS_INDEX_NONE;
+ pool_free(mm->ip4_reass_pool);
+ pool_alloc(mm->ip4_reass_pool, mm->ip4_reass_conf_pool_size);
+
+ mm->ip4_reass_allocated = 0;
+ mm->ip4_reass_fifo_last = MAP_REASS_INDEX_NONE;
+ mm->ip4_reass_buffered_counter = 0;
+}
+
+u8 map_get_ht_log2len(f32 ht_ratio, u16 pool_size)
+{
+ u32 desired_size = (u32)(pool_size * ht_ratio);
+ u8 i;
+ for (i=1; i<31; i++)
+ if ((1 << i) >= desired_size)
+ return i;
+ return 4;
+}
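+
+/* E.g. ht_ratio 1.0 with pool_size 1024 yields log2len 10, i.e. 1024 buckets. */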
+
+int map_ip4_reass_conf_ht_ratio(f32 ht_ratio, u32 *trashed_reass, u32 *dropped_packets)
+{
+ map_main_t *mm = &map_main;
+ if (ht_ratio > MAP_IP4_REASS_CONF_HT_RATIO_MAX)
+ return -1;
+
+ map_ip4_reass_lock();
+ mm->ip4_reass_conf_ht_ratio = ht_ratio;
+ mm->ip4_reass_ht_log2len = map_get_ht_log2len(ht_ratio, mm->ip4_reass_conf_pool_size);
+ map_ip4_reass_reinit(trashed_reass, dropped_packets);
+ map_ip4_reass_unlock();
+ return 0;
+}
+
+int map_ip4_reass_conf_pool_size(u16 pool_size, u32 *trashed_reass, u32 *dropped_packets)
+{
+ map_main_t *mm = &map_main;
+ if (pool_size > MAP_IP4_REASS_CONF_POOL_SIZE_MAX)
+ return -1;
+
+ map_ip4_reass_lock();
+ mm->ip4_reass_conf_pool_size = pool_size;
+ map_ip4_reass_reinit(trashed_reass, dropped_packets);
+ map_ip4_reass_unlock();
+ return 0;
+}
+
+int map_ip4_reass_conf_lifetime(u16 lifetime_ms)
+{
+ map_main.ip4_reass_conf_lifetime_ms = lifetime_ms;
+ return 0;
+}
+
+int map_ip4_reass_conf_buffers(u32 buffers)
+{
+ map_main.ip4_reass_conf_buffers = buffers;
+ return 0;
+}
+
+void map_ip6_reass_reinit(u32 *trashed_reass, u32 *dropped_packets)
+{
+ map_main_t *mm = &map_main;
+ if(dropped_packets)
+ *dropped_packets = mm->ip6_reass_buffered_counter;
+ if(trashed_reass)
+ *trashed_reass = mm->ip6_reass_allocated;
+ int i;
+ if (mm->ip6_reass_fifo_last != MAP_REASS_INDEX_NONE) {
+ u16 ri = mm->ip6_reass_fifo_last;
+ do {
+ map_ip6_reass_t *r = pool_elt_at_index(mm->ip6_reass_pool, ri);
+ for (i=0; i<MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
+ if (r->fragments[i].pi != ~0)
+ map_ip6_drop_pi(r->fragments[i].pi);
+
+ ri = r->fifo_next;
+ pool_put(mm->ip6_reass_pool, r);
+ } while (ri != mm->ip6_reass_fifo_last);
+ mm->ip6_reass_fifo_last = MAP_REASS_INDEX_NONE;
+ }
+
+ vec_free(mm->ip6_reass_hash_table);
+ vec_resize(mm->ip6_reass_hash_table, 1 << mm->ip6_reass_ht_log2len);
+ for(i=0; i<(1 << mm->ip6_reass_ht_log2len); i++)
+ mm->ip6_reass_hash_table[i] = MAP_REASS_INDEX_NONE;
+ pool_free(mm->ip6_reass_pool);
+  pool_alloc(mm->ip6_reass_pool, mm->ip6_reass_conf_pool_size);
+
+ mm->ip6_reass_allocated = 0;
+ mm->ip6_reass_buffered_counter = 0;
+}
+
+int map_ip6_reass_conf_ht_ratio(f32 ht_ratio, u32 *trashed_reass, u32 *dropped_packets)
+{
+ map_main_t *mm = &map_main;
+ if (ht_ratio > MAP_IP6_REASS_CONF_HT_RATIO_MAX)
+ return -1;
+
+ map_ip6_reass_lock();
+ mm->ip6_reass_conf_ht_ratio = ht_ratio;
+ mm->ip6_reass_ht_log2len = map_get_ht_log2len(ht_ratio, mm->ip6_reass_conf_pool_size);
+ map_ip6_reass_reinit(trashed_reass, dropped_packets);
+ map_ip6_reass_unlock();
+ return 0;
+}
+
+int map_ip6_reass_conf_pool_size(u16 pool_size, u32 *trashed_reass, u32 *dropped_packets)
+{
+ map_main_t *mm = &map_main;
+ if (pool_size > MAP_IP6_REASS_CONF_POOL_SIZE_MAX)
+ return -1;
+
+ map_ip6_reass_lock();
+ mm->ip6_reass_conf_pool_size = pool_size;
+ map_ip6_reass_reinit(trashed_reass, dropped_packets);
+ map_ip6_reass_unlock();
+ return 0;
+}
+
+int map_ip6_reass_conf_lifetime(u16 lifetime_ms)
+{
+ map_main.ip6_reass_conf_lifetime_ms = lifetime_ms;
+ return 0;
+}
+
+int map_ip6_reass_conf_buffers(u32 buffers)
+{
+ map_main.ip6_reass_conf_buffers = buffers;
+ return 0;
+}
+
+VLIB_CLI_COMMAND(map_ip4_reass_lifetime_command, static) = {
+ .path = "map params reassembly",
+ .short_help = "[ip4 | ip6] [lifetime <lifetime-ms>] [pool-size <pool-size>] [buffers <buffers>] [ht-ratio <ht-ratio>]",
+ .function = map_params_reass_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_traffic_class_command, static) = {
+ .path = "map params traffic-class",
+ .short_help =
+ "traffic-class {0x0-0xff | copy}",
+ .function = map_traffic_class_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_pre_resolve_command, static) = {
+ .path = "map params pre-resolve",
+ .short_help =
+ "pre-resolve {ip4-nh <address>} | {ip6-nh <address>}",
+ .function = map_pre_resolve_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_security_check_command, static) = {
+ .path = "map params security-check",
+ .short_help =
+ "security-check on|off",
+ .function = map_security_check_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_icmp_relay_source_address_command, static) = {
+ .path = "map params icmp-source-address",
+ .short_help =
+ "icmp-source-address <ip4-address>",
+ .function = map_icmp_relay_source_address_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_security_check_frag_command, static) = {
+ .path = "map params security-check fragments",
+ .short_help =
+ "fragments on|off",
+ .function = map_security_check_frag_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_add_domain_command, static) = {
+ .path = "map add domain",
+ .short_help =
+ "map add domain ip4-pfx <ip4-pfx> ip6-pfx <ip6-pfx> ip6-src <ip6-pfx> "
+ "ea-bits-len <n> psid-offset <n> psid-len <n> [map-t] [mtu <mtu>]",
+ .function = map_add_domain_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_add_rule_command, static) = {
+ .path = "map add rule",
+ .short_help =
+ "map add rule index <domain> psid <psid> ip6-dst <ip6-addr>",
+ .function = map_add_rule_command_fn,
+};
+
+VLIB_CLI_COMMAND(map_del_command, static) = {
+ .path = "map del domain",
+ .short_help =
+ "map del domain index <domain>",
+ .function = map_del_domain_command_fn,
+};
+
+VLIB_CLI_COMMAND(show_map_domain_command, static) = {
+ .path = "show map domain",
+ .function = show_map_domain_command_fn,
+};
+
+VLIB_CLI_COMMAND(show_map_stats_command, static) = {
+ .path = "show map stats",
+ .function = show_map_stats_command_fn,
+};
+
+VLIB_CLI_COMMAND(show_map_fragments_command, static) = {
+ .path = "show map fragments",
+ .function = show_map_fragments_command_fn,
+};
+
+/*
+ * map_init
+ */
+clib_error_t *map_init (vlib_main_t *vm)
+{
+ map_main_t *mm = &map_main;
+ mm->vnet_main = vnet_get_main();
+ mm->vlib_main = vm;
+
+#ifdef MAP_SKIP_IP6_LOOKUP
+ memset(&mm->preresolve_ip4, 0, sizeof(mm->preresolve_ip4));
+ memset(&mm->preresolve_ip6, 0, sizeof(mm->preresolve_ip6));
+ mm->adj4_index = 0;
+ mm->adj6_index = 0;
+#endif
+
+ /* traffic class */
+ mm->tc = 0;
+ mm->tc_copy = true;
+
+ /* Inbound security check */
+ mm->sec_check = true;
+ mm->sec_check_frag = false;
+
+ vec_validate(mm->domain_counters, MAP_N_DOMAIN_COUNTER - 1);
+ mm->domain_counters[MAP_DOMAIN_COUNTER_RX].name = "rx";
+ mm->domain_counters[MAP_DOMAIN_COUNTER_TX].name = "tx";
+
+ vlib_validate_simple_counter(&mm->icmp_relayed, 0);
+ vlib_zero_simple_counter(&mm->icmp_relayed, 0);
+
+ /* IP4 virtual reassembly */
+ mm->ip4_reass_hash_table = 0;
+ mm->ip4_reass_pool = 0;
+ mm->ip4_reass_lock = clib_mem_alloc_aligned(CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
+ mm->ip4_reass_conf_ht_ratio = MAP_IP4_REASS_HT_RATIO_DEFAULT;
+ mm->ip4_reass_conf_lifetime_ms = MAP_IP4_REASS_LIFETIME_DEFAULT;
+ mm->ip4_reass_conf_pool_size = MAP_IP4_REASS_POOL_SIZE_DEFAULT;
+ mm->ip4_reass_conf_buffers = MAP_IP4_REASS_BUFFERS_DEFAULT;
+ mm->ip4_reass_ht_log2len = map_get_ht_log2len(mm->ip4_reass_conf_ht_ratio, mm->ip4_reass_conf_pool_size);
+ mm->ip4_reass_fifo_last = MAP_REASS_INDEX_NONE;
+ map_ip4_reass_reinit(NULL, NULL);
+
+ /* IP6 virtual reassembly */
+ mm->ip6_reass_hash_table = 0;
+ mm->ip6_reass_pool = 0;
+ mm->ip6_reass_lock = clib_mem_alloc_aligned(CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
+ mm->ip6_reass_conf_ht_ratio = MAP_IP6_REASS_HT_RATIO_DEFAULT;
+ mm->ip6_reass_conf_lifetime_ms = MAP_IP6_REASS_LIFETIME_DEFAULT;
+ mm->ip6_reass_conf_pool_size = MAP_IP6_REASS_POOL_SIZE_DEFAULT;
+ mm->ip6_reass_conf_buffers = MAP_IP6_REASS_BUFFERS_DEFAULT;
+ mm->ip6_reass_ht_log2len = map_get_ht_log2len(mm->ip6_reass_conf_ht_ratio, mm->ip6_reass_conf_pool_size);
+ mm->ip6_reass_fifo_last = MAP_REASS_INDEX_NONE;
+ map_ip6_reass_reinit(NULL, NULL);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(map_init);
diff --git a/vnet/vnet/map/map.h b/vnet/vnet/map/map.h
new file mode 100644
index 00000000000..ae58cdb9120
--- /dev/null
+++ b/vnet/vnet/map/map.h
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdbool.h>
+#include <vppinfra/error.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vlib/vlib.h>
+
+#define MAP_SKIP_IP6_LOOKUP 1
+
+typedef enum {
+ MAP_SENDER,
+ MAP_RECEIVER
+} map_dir_e;
+
+int map_create_domain(ip4_address_t *ip4_prefix, u8 ip4_prefix_len,
+ ip6_address_t *ip6_prefix, u8 ip6_prefix_len,
+ ip6_address_t *ip6_src, u8 ip6_src_len,
+ u8 ea_bits_len, u8 psid_offset, u8 psid_length,
+ u32 *map_domain_index, u16 mtu, u8 flags);
+int map_delete_domain(u32 map_domain_index);
+int map_add_del_psid(u32 map_domain_index, u16 psid, ip6_address_t *tep, u8 is_add);
+u8 *format_map_trace(u8 *s, va_list *args);
+i32 ip4_get_port(ip4_header_t *ip, map_dir_e dir, u16 buffer_len);
+i32 ip6_get_port(ip6_header_t *ip6, map_dir_e dir, u16 buffer_len);
+u16 ip4_map_get_port (ip4_header_t *ip, map_dir_e dir);
+
+typedef enum __attribute__ ((__packed__)) {
+ MAP_DOMAIN_PREFIX = 1 << 0,
+ MAP_DOMAIN_TRANSLATION = 1 << 1, // The domain uses MAP-T
+} map_domain_flags_e;
+
+/**
+ * IP4 reassembly logic:
+ * One virtually reassembled flow requires a map_ip4_reass_t structure in order
+ * to keep the first-fragment port number and, optionally, cache out of sequence
+ * packets.
+ * There are up to 'pool-size' (ip4_reass_conf_pool_size) such structures.
+ * When in use, those structures are stored in a hash table of 2^ip4_reass_ht_log2len buckets.
+ * When a new structure needs to be used, it is allocated from available ones.
+ * If there is no structure available, the oldest in use is selected and reused if and
+ * only if it was first allocated more than 'lifetime' milliseconds ago.
+ * In case no structure can be allocated, the fragment is dropped.
+ */
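+
+/*
+ * A rough caller sketch of this lifecycle (illustrative only; the actual
+ * node code lives in the ip4-map-reass node):
+ *
+ *   u32 *pi_to_drop = NULL;
+ *   map_ip4_reass_lock();
+ *   map_ip4_reass_t *r = map_ip4_reass_get(src, dst, frag_id, proto, &pi_to_drop);
+ *   if (r)
+ *     map_ip4_reass_add_fragment(r, buffer_index);
+ *   map_ip4_reass_unlock();
+ *   // Buffers evicted by garbage collection must be dropped by the caller:
+ *   u32 *pi;
+ *   vec_foreach(pi, pi_to_drop) map_ip4_drop_pi(*pi);
+ *   vec_free(pi_to_drop);
+ */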
+
+#define MAP_IP4_REASS_LIFETIME_DEFAULT (100) /* ms */
+#define MAP_IP4_REASS_HT_RATIO_DEFAULT (1.0)
+#define MAP_IP4_REASS_POOL_SIZE_DEFAULT 1024 // Number of reassembly structures
+#define MAP_IP4_REASS_BUFFERS_DEFAULT 2048
+
+#define MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY 5 // Number of fragment per reassembly
+
+#define MAP_IP6_REASS_LIFETIME_DEFAULT (100) /* ms */
+#define MAP_IP6_REASS_HT_RATIO_DEFAULT (1.0)
+#define MAP_IP6_REASS_POOL_SIZE_DEFAULT 1024 // Number of reassembly structures
+#define MAP_IP6_REASS_BUFFERS_DEFAULT 2048
+
+#define MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY 5
+
+#define MAP_IP6_REASS_COUNT_BYTES
+#define MAP_IP4_REASS_COUNT_BYTES
+
+//#define IP6_MAP_T_OVERRIDE_TOS 0
+
+/*
+ * This structure _MUST_ be no larger than a single cache line (64 bytes).
+ * If more space is needed make a union of ip6_prefix and *rules, those are mutually exclusive.
+ */
+typedef struct {
+ ip6_address_t ip6_src;
+ ip6_address_t ip6_prefix;
+ ip6_address_t *rules;
+ u32 suffix_mask;
+ ip4_address_t ip4_prefix;
+ u16 psid_mask;
+ u16 mtu;
+ map_domain_flags_e flags;
+ u8 ip6_prefix_len;
+ u8 ip6_src_len;
+ u8 ea_bits_len;
+ u8 psid_offset;
+ u8 psid_length;
+
+ /* helpers */
+ u8 psid_shift;
+ u8 suffix_shift;
+ u8 ea_shift;
+
+ /* not used by forwarding */
+ u8 ip4_prefix_len;
+} map_domain_t;
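+
+/*
+ * A minimal compile-time guard for the cache-line constraint above (assumes
+ * a C11 compiler; VPP builds may prefer their own STATIC_ASSERT equivalent):
+ */
+_Static_assert(sizeof(map_domain_t) <= 64,
+               "map_domain_t must fit in a single 64-byte cache line");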
+
+#define MAP_REASS_INDEX_NONE ((u16)0xffff)
+
+/*
+ * Hash key, padded out to 16 bytes for fast compare
+ */
+typedef union {
+ CLIB_PACKED (struct {
+ ip4_address_t src;
+ ip4_address_t dst;
+ u16 fragment_id;
+ u8 protocol;
+ });
+ u64 as_u64[2];
+ u32 as_u32[4];
+} map_ip4_reass_key_t;
+
+typedef struct {
+ map_ip4_reass_key_t key;
+ f64 ts;
+#ifdef MAP_IP4_REASS_COUNT_BYTES
+ u16 expected_total;
+ u16 forwarded;
+#endif
+ i32 port;
+ u16 bucket;
+ u16 bucket_next;
+ u16 fifo_prev;
+ u16 fifo_next;
+ u32 fragments[MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY];
+} map_ip4_reass_t;
+
+/*
+ * MAP domain counters
+ */
+typedef enum {
+ /* Simple counters */
+ MAP_DOMAIN_IPV4_FRAGMENT = 0,
+ /* Combined counters */
+ MAP_DOMAIN_COUNTER_RX = 0,
+ MAP_DOMAIN_COUNTER_TX,
+ MAP_N_DOMAIN_COUNTER
+} map_domain_counter_t;
+
+/*
+ * IPv6 reassembly hash key (padded to 40 bytes for fast compare)
+ */
+typedef union {
+ CLIB_PACKED (struct {
+ ip6_address_t src;
+ ip6_address_t dst;
+ u32 fragment_id;
+ u8 protocol;
+ });
+ u64 as_u64[5];
+ u32 as_u32[10];
+} map_ip6_reass_key_t;
+
+typedef struct {
+ u32 pi; //Cached packet or ~0
+ u16 next_data_offset; //The data offset of the additional 20 bytes or ~0
+ u8 next_data_len; //Number of bytes ready to be copied (20 if not last fragment)
+ u8 next_data[20]; //The 20 additional bytes
+} map_ip6_fragment_t;
+
+typedef struct {
+ map_ip6_reass_key_t key;
+ f64 ts;
+#ifdef MAP_IP6_REASS_COUNT_BYTES
+ u16 expected_total;
+ u16 forwarded;
+#endif
+ u16 bucket; //What hash bucket this element is linked in
+ u16 bucket_next;
+ u16 fifo_prev;
+ u16 fifo_next;
+ ip4_header_t ip4_header;
+ map_ip6_fragment_t fragments[MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY];
+} map_ip6_reass_t;
+
+typedef struct {
+ /* pool of MAP domains */
+ map_domain_t *domains;
+
+ /* MAP Domain packet/byte counters indexed by map domain index */
+ vlib_simple_counter_main_t *simple_domain_counters;
+ vlib_combined_counter_main_t *domain_counters;
+ volatile u32 *counter_lock;
+
+ /* Global counters */
+ vlib_simple_counter_main_t icmp_relayed;
+
+#ifdef MAP_SKIP_IP6_LOOKUP
+  /* pre-resolve */
+ u32 adj6_index, adj4_index;
+ ip4_address_t preresolve_ip4;
+ ip6_address_t preresolve_ip6;
+#endif
+
+ /* Traffic class: zero, copy (~0) or fixed value */
+ u8 tc;
+ bool tc_copy;
+ bool sec_check;
+ bool sec_check_frag;
+
+ /* ICMPv6 -> ICMPv4 relay parameters */
+ ip4_address_t icmp_src_address;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+
+ /*
+ * IPv4 encap and decap reassembly
+ */
+ //Conf
+ f32 ip4_reass_conf_ht_ratio; //Size of ht is 2^ceil(log2(ratio*pool_size))
+ u16 ip4_reass_conf_pool_size; //Max number of allocated reass structures
+ u16 ip4_reass_conf_lifetime_ms; //Time a reassembly struct is considered valid in ms
+ u32 ip4_reass_conf_buffers; //Maximum number of buffers used by ip4 reassembly
+
+ //Runtime
+ map_ip4_reass_t *ip4_reass_pool;
+ u8 ip4_reass_ht_log2len; //Hash table size is 2^log2len
+ u16 ip4_reass_allocated;
+ u16 *ip4_reass_hash_table;
+ u16 ip4_reass_fifo_last;
+ volatile u32 *ip4_reass_lock;
+
+ //Counters
+ u32 ip4_reass_buffered_counter;
+
+ /*
+ * IPv6 decap reassembly
+ */
+ //Conf
+ f32 ip6_reass_conf_ht_ratio; //Size of ht is 2^ceil(log2(ratio*pool_size))
+ u16 ip6_reass_conf_pool_size; //Max number of allocated reass structures
+ u16 ip6_reass_conf_lifetime_ms; //Time a reassembly struct is considered valid in ms
+ u32 ip6_reass_conf_buffers; //Maximum number of buffers used by ip6 reassembly
+
+ //Runtime
+ map_ip6_reass_t *ip6_reass_pool;
+ u8 ip6_reass_ht_log2len; //Hash table size is 2^log2len
+ u16 ip6_reass_allocated;
+ u16 *ip6_reass_hash_table;
+ u16 ip6_reass_fifo_last;
+ volatile u32 *ip6_reass_lock;
+
+ //Counters
+ u32 ip6_reass_buffered_counter;
+
+} map_main_t;
+
+/*
+ * TODO: Remove SEC_CHECK / TRANSLATED_4TO6 / TRANSLATED_6TO4
+ */
+#define foreach_map_error \
+ /* Must be first. */ \
+ _(NONE, "valid MAP packets") \
+ _(BAD_PROTOCOL, "bad protocol") \
+ _(WRONG_ICMP_TYPE, "wrong icmp type") \
+ _(SEC_CHECK, "security check failed") \
+ _(ENCAP_SEC_CHECK, "encap security check failed") \
+ _(DECAP_SEC_CHECK, "decap security check failed") \
+ _(ICMP, "unable to translate ICMP") \
+ _(ICMP_RELAY, "unable to relay ICMP") \
+ _(UNKNOWN, "unknown") \
+ _(NO_DOMAIN, "no domain") \
+ _(FRAGMENTED, "packet is a fragment") \
+ _(FRAGMENT_MEMORY, "could not cache fragment") \
+ _(FRAGMENT_MALFORMED, "fragment has unexpected format")\
+ _(FRAGMENT_DROPPED, "dropped cached fragment") \
+ _(MALFORMED, "malformed packet")
+
+typedef enum {
+#define _(sym,str) MAP_ERROR_##sym,
+ foreach_map_error
+#undef _
+ MAP_N_ERROR,
+ } map_error_t;
+
+u64 map_error_counter_get(u32 node_index, map_error_t map_error);
+
+typedef struct {
+ u32 map_domain_index;
+ u16 port;
+} map_trace_t;
+
+map_main_t map_main;
+
+vlib_node_registration_t ip4_map_node;
+vlib_node_registration_t ip6_map_node;
+
+vlib_node_registration_t ip4_map_t_node;
+vlib_node_registration_t ip4_map_t_fragmented_node;
+vlib_node_registration_t ip4_map_t_tcp_udp_node;
+vlib_node_registration_t ip4_map_t_icmp_node;
+
+vlib_node_registration_t ip6_map_t_node;
+vlib_node_registration_t ip6_map_t_fragmented_node;
+vlib_node_registration_t ip6_map_t_tcp_udp_node;
+vlib_node_registration_t ip6_map_t_icmp_node;
+
+/*
+ * map_get_pfx
+ */
+static_always_inline u64
+map_get_pfx (map_domain_t *d, u32 addr, u16 port)
+{
+ u16 psid = (port >> d->psid_shift) & d->psid_mask;
+
+ if (d->ea_bits_len == 0 && d->rules)
+ return clib_net_to_host_u64(d->rules[psid].as_u64[0]);
+
+ u32 suffix = (addr >> d->suffix_shift) & d->suffix_mask;
+ u64 ea = d->ea_bits_len == 0 ? 0 : (((u64) suffix << d->psid_length)) | psid;
+
+ return clib_net_to_host_u64(d->ip6_prefix.as_u64[0]) | ea << d->ea_shift;
+}
+
+static_always_inline u64
+map_get_pfx_net (map_domain_t *d, u32 addr, u16 port)
+{
+ return clib_host_to_net_u64(map_get_pfx(d, clib_net_to_host_u32(addr),
+ clib_net_to_host_u16(port)));
+}
+
+/*
+ * map_get_sfx
+ */
+static_always_inline u64
+map_get_sfx (map_domain_t *d, u32 addr, u16 port)
+{
+ u16 psid = (port >> d->psid_shift) & d->psid_mask;
+
+ /* Shared 1:1 mode. */
+ if (d->ea_bits_len == 0 && d->rules)
+ return clib_net_to_host_u64(d->rules[psid].as_u64[1]);
+ if (d->ip6_prefix_len == 128)
+ return clib_net_to_host_u64(d->ip6_prefix.as_u64[1]);
+
+ /* IPv4 prefix */
+ if (d->flags & MAP_DOMAIN_PREFIX)
+ return (u64) (addr & ~d->suffix_mask) << 16;
+
+ /* Shared or full IPv4 address */
+ return ((u64) addr << 16) | psid;
+}
+
+static_always_inline u64
+map_get_sfx_net (map_domain_t *d, u32 addr, u16 port)
+{
+ return clib_host_to_net_u64(map_get_sfx(d, clib_net_to_host_u32(addr),
+ clib_net_to_host_u16(port)));
+}
+
+static_always_inline u32
+map_get_ip4 (ip6_address_t *addr)
+{
+ return clib_host_to_net_u32(clib_net_to_host_u64(addr->as_u64[1]) >> 16);
+}
+
+/*
+ * Get the MAP domain from an IPv4 lookup adjacency.
+ */
+static_always_inline map_domain_t *
+ip4_map_get_domain (u32 adj_index, u32 *map_domain_index)
+{
+ map_main_t *mm = &map_main;
+ ip_lookup_main_t *lm = &ip4_main.lookup_main;
+ ip_adjacency_t *adj = ip_get_adjacency(lm, adj_index);
+ ASSERT(adj);
+ uword *p = (uword *)adj->rewrite_data;
+ ASSERT(p);
+ *map_domain_index = p[0];
+ return pool_elt_at_index(mm->domains, p[0]);
+}
+
+/*
+ * Get the MAP domain from an IPv6 lookup adjacency.
+ * If the IPv6 address or prefix is not shared, no lookup is required.
+ * The IPv4 address is used otherwise.
+ */
+static_always_inline map_domain_t *
+ip6_map_get_domain (u32 adj_index, ip4_address_t *addr,
+ u32 *map_domain_index, u8 *error)
+{
+ map_main_t *mm = &map_main;
+ ip4_main_t *im4 = &ip4_main;
+ ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+ ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+ ip_adjacency_t *adj = ip_get_adjacency(lm6, adj_index);
+ ASSERT(adj);
+ uword *p = (uword *)adj->rewrite_data;
+ ASSERT(p);
+ *map_domain_index = p[0];
+ if (p[0] != ~0)
+ return pool_elt_at_index(mm->domains, p[0]);
+
+ u32 ai = ip4_fib_lookup_with_table(im4, 0, addr, 0);
+ ip_adjacency_t *adj4 = ip_get_adjacency (lm4, ai);
+ if (PREDICT_TRUE(adj4->lookup_next_index == IP_LOOKUP_NEXT_MAP ||
+ adj4->lookup_next_index == IP_LOOKUP_NEXT_MAP_T)) {
+ uword *p = (uword *)adj4->rewrite_data;
+ *map_domain_index = p[0];
+ return pool_elt_at_index(mm->domains, *map_domain_index);
+ }
+ *error = MAP_ERROR_NO_DOMAIN;
+ return NULL;
+}
+
+map_ip4_reass_t *
+map_ip4_reass_get(u32 src, u32 dst, u16 fragment_id,
+ u8 protocol, u32 **pi_to_drop);
+void
+map_ip4_reass_free(map_ip4_reass_t *r, u32 **pi_to_drop);
+
+#define map_ip4_reass_lock() while (__sync_lock_test_and_set(map_main.ip4_reass_lock, 1)) {}
+#define map_ip4_reass_unlock() do {CLIB_MEMORY_BARRIER(); *map_main.ip4_reass_lock = 0;} while(0)
+
+static_always_inline void
+map_ip4_reass_get_fragments(map_ip4_reass_t *r, u32 **pi)
+{
+ int i;
+ for (i=0; i<MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY; i++)
+ if(r->fragments[i] != ~0) {
+ vec_add1(*pi, r->fragments[i]);
+ r->fragments[i] = ~0;
+ map_main.ip4_reass_buffered_counter--;
+ }
+}
+
+int map_ip4_reass_add_fragment(map_ip4_reass_t *r, u32 pi);
+
+map_ip6_reass_t *
+map_ip6_reass_get(ip6_address_t *src, ip6_address_t *dst, u32 fragment_id,
+ u8 protocol, u32 **pi_to_drop);
+void
+map_ip6_reass_free(map_ip6_reass_t *r, u32 **pi_to_drop);
+
+#define map_ip6_reass_lock() while (__sync_lock_test_and_set(map_main.ip6_reass_lock, 1)) {}
+#define map_ip6_reass_unlock() do {CLIB_MEMORY_BARRIER(); *map_main.ip6_reass_lock = 0;} while(0)
+
+int
+map_ip6_reass_add_fragment(map_ip6_reass_t *r, u32 pi,
+ u16 data_offset, u16 next_data_offset,
+ u8 *data_start, u16 data_len);
+
+void map_ip4_drop_pi(u32 pi);
+
+int map_ip4_reass_conf_ht_ratio(f32 ht_ratio, u32 *trashed_reass, u32 *dropped_packets);
+#define MAP_IP4_REASS_CONF_HT_RATIO_MAX 100
+int map_ip4_reass_conf_pool_size(u16 pool_size, u32 *trashed_reass, u32 *dropped_packets);
+#define MAP_IP4_REASS_CONF_POOL_SIZE_MAX (0xfeff)
+int map_ip4_reass_conf_lifetime(u16 lifetime_ms);
+#define MAP_IP4_REASS_CONF_LIFETIME_MAX 0xffff
+int map_ip4_reass_conf_buffers(u32 buffers);
+#define MAP_IP4_REASS_CONF_BUFFERS_MAX (0xffffffff)
+
+void map_ip6_drop_pi(u32 pi);
+
+
+int map_ip6_reass_conf_ht_ratio(f32 ht_ratio, u32 *trashed_reass, u32 *dropped_packets);
+#define MAP_IP6_REASS_CONF_HT_RATIO_MAX 100
+int map_ip6_reass_conf_pool_size(u16 pool_size, u32 *trashed_reass, u32 *dropped_packets);
+#define MAP_IP6_REASS_CONF_POOL_SIZE_MAX (0xfeff)
+int map_ip6_reass_conf_lifetime(u16 lifetime_ms);
+#define MAP_IP6_REASS_CONF_LIFETIME_MAX 0xffff
+int map_ip6_reass_conf_buffers(u32 buffers);
+#define MAP_IP6_REASS_CONF_BUFFERS_MAX (0xffffffff)
+
+static_always_inline
+int ip6_parse(const ip6_header_t *ip6, u32 buff_len,
+ u8 *l4_protocol, u16 *l4_offset, u16 *frag_hdr_offset)
+{
+ if (ip6->protocol == IP_PROTOCOL_IPV6_FRAGMENTATION) {
+ *l4_protocol = ((ip6_frag_hdr_t *)(ip6 + 1))->next_hdr;
+ *frag_hdr_offset = sizeof(*ip6);
+ *l4_offset = sizeof(*ip6) + sizeof(ip6_frag_hdr_t);
+ } else {
+ *l4_protocol = ip6->protocol;
+ *frag_hdr_offset = 0;
+ *l4_offset = sizeof(*ip6);
+ }
+
+ return (buff_len < (*l4_offset + 4)) ||
+ (clib_net_to_host_u16(ip6->payload_length) < (*l4_offset + 4 - sizeof(*ip6)));
+}
+
+
+#define u8_ptr_add(ptr, index) (((u8 *)ptr) + index)
+#define u16_net_add(u, val) clib_host_to_net_u16(clib_net_to_host_u16(u) + (val))
+
+#define frag_id_6to4(id) ((id) ^ ((id) >> 16))
+
+static_always_inline void
+ip4_map_t_embedded_address (map_domain_t *d,
+ ip6_address_t *ip6, const ip4_address_t *ip4)
+{
+ ASSERT(d->ip6_src_len == 96); //No support for other lengths for now
+ ip6->as_u64[0] = d->ip6_src.as_u64[0];
+ ip6->as_u32[2] = d->ip6_src.as_u32[2];
+ ip6->as_u32[3] = ip4->as_u32;
+}
+
+static_always_inline u32
+ip6_map_t_embedded_address (map_domain_t *d, ip6_address_t *addr)
+{
+ ASSERT(d->ip6_src_len == 96); //No support for other lengths for now
+ return addr->as_u32[3];
+}
+
+static inline void
+map_domain_counter_lock (map_main_t *mm)
+{
+ if (mm->counter_lock)
+ while (__sync_lock_test_and_set(mm->counter_lock, 1))
+ /* zzzz */ ;
+}
+static inline void
+map_domain_counter_unlock (map_main_t *mm)
+{
+ if (mm->counter_lock)
+ *mm->counter_lock = 0;
+}
+
+
+static_always_inline void
+map_send_all_to_node(vlib_main_t *vm, u32 *pi_vector,
+ vlib_node_runtime_t *node, vlib_error_t *error,
+ u32 next)
+{
+ u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
+ //Deal with fragments that are ready
+ from = pi_vector;
+ n_left_from = vec_len(pi_vector);
+ next_index = node->cached_next_index;
+ while (n_left_from > 0) {
+ vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+ while (n_left_from > 0 && n_left_to_next > 0) {
+ u32 pi0 = to_next[0] = from[0];
+ from += 1;
+ n_left_from -= 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ vlib_buffer_t *p0 = vlib_get_buffer(vm, pi0);
+ p0->error = *error;
+ vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, pi0, next);
+ }
+ vlib_put_next_frame(vm, node, next_index, n_left_to_next);
+ }
+}
diff --git a/vnet/vnet/map/map_doc.md b/vnet/vnet/map/map_doc.md
new file mode 100644
index 00000000000..230c52dfafd
--- /dev/null
+++ b/vnet/vnet/map/map_doc.md
@@ -0,0 +1,69 @@
+# VPP MAP and Lw4o6 implementation
+
+This is a memo intended to contain documentation of the VPP MAP and Lw4o6 implementations.
+Anything that is not directly obvious should be documented here.
+
+
+
+## MAP-E Virtual Reassembly
+
+The MAP-E implementation supports handling of IPv4 fragments as well as IPv4-in-IPv6 inner and outer fragments. This is called virtual reassembly because the fragments are not actually reassembled. Instead, some metadata is kept about the first fragment and reused for subsequent fragments.
+
+Fragment caching and handling is not always necessary. It is performed when:
+* An IPv4 fragment is received and the destination IPv4 address is shared.
+* An IPv6 packet is received with an inner IPv4 fragment, the IPv4 source address is shared, and 'security-check fragments' is on.
+* An IPv6 fragment is received.
+
+There are 3 dedicated nodes:
+* ip4-map-reass
+* ip6-map-ip4-reass
+* ip6-map-ip6-reass
+
+ip4-map sends all fragments to ip4-map-reass.
+ip6-map sends all inner-fragments to ip6-map-ip4-reass.
+ip6-map sends all outer-fragments to ip6-map-ip6-reass.
+
+IPv4 (resp. IPv6) virtual reassembly makes use of a hash table in order to store IPv4 (resp. IPv6) reassembly structures. The hash-key is based on the IPv4-src:IPv4-dst:Frag-ID:Protocol tuple (resp. IPv6-src:IPv6-dst:Frag-ID tuple, as the protocol is IPv4-in-IPv6). Therefore, each packet reassembly makes use of exactly one reassembly structure. When such a structure is allocated, it is timestamped with the current time. Finally, those structures are capable of storing a limited number of buffer indexes.
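+
+For reference, here is a sketch of how the implementation derives the hash bucket from a key (mirroring the IPv6 loop in map.c; `crc_u32` and the `ht_log2len` fields come from the VPP sources):
+
+```c
+u32 h = 0;
+int i;
+for (i = 0; i < 10; i++)                   /* the 40-byte IPv6 key as 10 u32 words */
+  h = crc_u32(k.as_u32[i], h);
+h = h >> (32 - mm->ip6_reass_ht_log2len);  /* the top bits select the bucket */
+```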
+
+An IPv4 (resp. IPv6) reassembly structure can cache up to MAP_IP4_REASS_MAX_FRAGMENTS_PER_REASSEMBLY (resp. MAP_IP6_REASS_MAX_FRAGMENTS_PER_REASSEMBLY) buffers. Buffers are cached until the first fragment is received.
+
+#### Virtual Reassembly configuration
+
+IPv4 and IPv6 virtual reassembly support the following configuration:
+ map params reassembly [ip4 | ip6] [lifetime <lifetime-ms>] [pool-size <pool-size>] [buffers <buffers>] [ht-ratio <ht-ratio>]
+
+lifetime:
+  The time in milliseconds a reassembly structure is considered valid. The longer it is, the more reliable reassembly becomes, but the more likely the pool of reassembly structures is to be exhausted. The IPv4 standard suggests a lifetime of 15 seconds; IPv6 specifies 60 seconds. Those values are not realistic for high-throughput cases.
+
+buffers:
+ The upper limit of buffers that are allowed to be cached. It can be used to protect against fragmentation attacks which would aim to exhaust the global buffers pool.
+
+pool-size:
+  The number of reassembly structures that can be allocated. As each structure can store a small fixed number of fragments, it also sets an upper bound of 'pool-size * MAP_IPX_REASS_MAX_FRAGMENTS_PER_REASSEMBLY' buffers that can be cached in total. With the defaults (pool-size 1024, 5 fragments per reassembly), that bound is 5120 buffers.
+
+ht-ratio:
+  The number of buckets in the hash table is pool-size * ht-ratio, rounded up to the next power of two.
+
+
+Any time pool-size or ht-ratio is modified, the hash table is destroyed and recreated, which means all current reassembly state is lost.
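+
+For example (values are illustrative):
+
+    map params reassembly ip4 lifetime 100 pool-size 1024 buffers 2048 ht-ratio 1.0
+    map params reassembly ip6 lifetime 1000 buffers 4096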
+
+
+##### Additional considerations
+
+Reassembly at high rate is expensive in terms of buffers. There is a trade-off between the lifetime and the number of allocated buffers. Reducing the lifetime helps, but at the cost of losing state for fragments that arrive far apart in time.
+
+Let:
+* R be the rate at which fragments are received.
+* F be the number of fragments per packet.
+
+Assuming the first fragment is always received last, we should have:
+
+    buffers > lifetime * R / F * (F - 1)
+    pool-size > lifetime * R / F
+
+This is a worst case. Receiving the first fragment earlier helps reduce the number of required buffers. An optimization is also implemented (MAP_IP6_REASS_COUNT_BYTES and MAP_IP4_REASS_COUNT_BYTES) which counts the number of transmitted bytes and remembers, based on the last fragment, the total number of bytes that should be transmitted, and therefore helps reduce the required 'pool-size'.
+
+But the formula shows that it is challenging to forward a significant amount of fragmented traffic at high rates. For instance, with a lifetime of 1 second and F = 2, a 5 Mpps fragment rate would require buffering up to 2.5 million fragments.
+
+If you want to do that, be prepared to configure a lot of buffers.
+
+
diff --git a/vnet/vnet/map/sixrd.c b/vnet/vnet/map/sixrd.c
new file mode 100644
index 00000000000..26b4eea9a86
--- /dev/null
+++ b/vnet/vnet/map/sixrd.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sixrd.h"
+
+/*
+ * This code supports the following sixrd modes:
+ *
+ * 32 EA bits (Complete IPv4 address is embedded):
+ * ea_bits_len = 32
+ * IPv4 suffix is embedded:
+ * ea_bits_len = < 32
+ * No embedded address bits (1:1 mode):
+ * ea_bits_len = 0
+ */
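+
+/*
+ * Example (hypothetical): ip6-pfx 2001:db8::/32 with ip4-pfx 10.0.0.0/8
+ * gives ea_bits_len = 24, i.e. the low 24 bits of the IPv4 address are
+ * embedded in the delegated IPv6 prefix (as in RFC 5969).
+ */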
+
+int
+sixrd_create_domain (ip6_address_t *ip6_prefix,
+ u8 ip6_prefix_len,
+ ip4_address_t *ip4_prefix,
+ u8 ip4_prefix_len,
+ ip4_address_t *ip4_src,
+ u32 *sixrd_domain_index,
+ u16 mtu)
+{
+ sixrd_main_t *mm = &sixrd_main;
+ ip4_main_t *im4 = &ip4_main;
+ ip6_main_t *im6 = &ip6_main;
+ sixrd_domain_t *d;
+ ip_adjacency_t adj;
+ ip4_add_del_route_args_t args4;
+ ip6_add_del_route_args_t args6;
+ u32 *p;
+
+ /* Get domain index */
+ pool_get_aligned(mm->domains, d, CLIB_CACHE_LINE_BYTES);
+ memset(d, 0, sizeof (*d));
+ *sixrd_domain_index = d - mm->domains;
+
+ /* Init domain struct */
+ d->ip4_prefix.as_u32 = ip4_prefix->as_u32;
+ d->ip4_prefix_len = ip4_prefix_len;
+ d->ip6_prefix = *ip6_prefix;
+ d->ip6_prefix_len = ip6_prefix_len;
+ d->ip4_src = *ip4_src;
+ d->mtu = mtu;
+
+ if (ip4_prefix_len < 32)
+ d->shift = 64 - ip6_prefix_len + (32 - ip4_prefix_len);
+
+ /* Init IP adjacency */
+ memset(&adj, 0, sizeof(adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = IP_LOOKUP_NEXT_SIXRD;
+ p = (u32 *)&adj.rewrite_data[0];
+ *p = (u32) (*sixrd_domain_index);
+
+ /* Create ip6 adjacency */
+ memset(&args6, 0, sizeof(args6));
+ args6.table_index_or_table_id = 0;
+ args6.flags = IP6_ROUTE_FLAG_ADD;
+ args6.dst_address.as_u64[0] = ip6_prefix->as_u64[0];
+ args6.dst_address.as_u64[1] = ip6_prefix->as_u64[1];
+ args6.dst_address_length = ip6_prefix_len;
+ args6.adj_index = ~0;
+ args6.add_adj = &adj;
+ args6.n_add_adj = 1;
+ ip6_add_del_route(im6, &args6);
+
+ /* Multiple SIXRD domains may share same source IPv4 TEP */
+ uword *q = ip4_get_route(im4, 0, 0, (u8 *)ip4_src, 32);
+ if (q) {
+ u32 ai = q[0];
+ ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+ ip_adjacency_t *adj4 = ip_get_adjacency(lm4, ai);
+ if (adj4->lookup_next_index != IP_LOOKUP_NEXT_SIXRD) {
+ clib_warning("BR source address already assigned: %U", format_ip4_address, ip4_src);
+ pool_put(mm->domains, d);
+ return -1;
+ }
+ /* Shared source */
+ p = (u32 *)&adj4->rewrite_data[0];
+ p[0] = ~0;
+
+ /* Add refcount, so we don't accidentally delete the route underneath someone */
+ p[1]++;
+ } else {
+ /* Create ip4 adjacency. */
+ memset(&args4, 0, sizeof(args4));
+ args4.table_index_or_table_id = 0;
+ args4.flags = IP4_ROUTE_FLAG_ADD;
+ args4.dst_address.as_u32 = ip4_src->as_u32;
+ args4.dst_address_length = 32;
+ args4.adj_index = ~0;
+ args4.add_adj = &adj;
+ args4.n_add_adj = 1;
+ ip4_add_del_route(im4, &args4);
+ }
+
+ return 0;
+}
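+
+/*
+ * Note on the rewrite_data encoding used above: word 0 carries the owning
+ * domain index, or ~0 when the IPv4 source route is shared by several
+ * domains; word 1 is a share count, so the shared route is only deleted
+ * once the last domain referencing it is removed.
+ */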
+
+/*
+ * sixrd_delete_domain
+ */
+int
+sixrd_delete_domain (u32 sixrd_domain_index)
+{
+ sixrd_main_t *mm = &sixrd_main;
+ ip4_main_t *im4 = &ip4_main;
+ ip6_main_t *im6 = &ip6_main;
+ sixrd_domain_t *d;
+ ip_adjacency_t adj;
+ ip4_add_del_route_args_t args4;
+ ip6_add_del_route_args_t args6;
+
+ if (pool_is_free_index(mm->domains, sixrd_domain_index)) {
+ clib_warning("SIXRD domain delete: domain does not exist: %d", sixrd_domain_index);
+ return -1;
+ }
+
+ d = pool_elt_at_index(mm->domains, sixrd_domain_index);
+
+ memset(&adj, 0, sizeof(adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = IP_LOOKUP_NEXT_SIXRD;
+
+ /* Delete ip6 adjacency */
+ memset(&args6, 0, sizeof (args6));
+ args6.table_index_or_table_id = 0;
+ args6.flags = IP6_ROUTE_FLAG_DEL;
+ args6.dst_address.as_u64[0] = d->ip6_prefix.as_u64[0];
+ args6.dst_address.as_u64[1] = d->ip6_prefix.as_u64[1];
+ args6.dst_address_length = d->ip6_prefix_len;
+ args6.adj_index = 0;
+ args6.add_adj = &adj;
+ args6.n_add_adj = 0;
+ ip6_add_del_route(im6, &args6);
+
+ /* Delete ip4 adjacency */
+ uword *q = ip4_get_route(im4, 0, 0, (u8 *)&d->ip4_src, 32);
+ if (q) {
+ u32 ai = q[0];
+ ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+ ip_adjacency_t *adj4 = ip_get_adjacency(lm4, ai);
+
+ u32 *p = (u32 *)&adj4->rewrite_data[0];
+ /* Delete route when no other domains use this source */
+ if (p[1] == 0) {
+ memset(&args4, 0, sizeof(args4));
+ args4.table_index_or_table_id = 0;
+ args4.flags = IP4_ROUTE_FLAG_DEL;
+ args4.dst_address.as_u32 = d->ip4_src.as_u32;
+ args4.dst_address_length = 32;
+ args4.adj_index = 0;
+ args4.add_adj = &adj;
+ args4.n_add_adj = 0;
+ ip4_add_del_route(im4, &args4);
+ }
+ p[1]--;
+ }
+
+ pool_put(mm->domains, d);
+
+ return 0;
+}
+
+static clib_error_t *
+sixrd_add_domain_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ ip4_address_t ip4_prefix;
+ ip6_address_t ip6_prefix;
+ ip4_address_t ip4_src;
+ u32 ip6_prefix_len, ip4_prefix_len, sixrd_domain_index;
+ u32 num_m_args = 0;
+ /* Optional arguments */
+ u32 mtu = 0;
+
+ /* Get a line of input. */
+ if (!unformat_user(input, unformat_line_input, line_input))
+ return 0;
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "ip6-pfx %U/%d", unformat_ip6_address, &ip6_prefix, &ip6_prefix_len))
+ num_m_args++;
+ else if (unformat(line_input, "ip4-pfx %U/%d", unformat_ip4_address, &ip4_prefix, &ip4_prefix_len))
+ num_m_args++;
+ else if (unformat(line_input, "ip4-src %U", unformat_ip4_address, &ip4_src))
+ num_m_args++;
+ else if (unformat(line_input, "mtu %d", &mtu))
+ num_m_args++;
+ else
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free(line_input);
+
+ if (num_m_args < 3)
+ return clib_error_return(0, "mandatory argument(s) missing");
+
+ sixrd_create_domain(&ip6_prefix, ip6_prefix_len, &ip4_prefix, ip4_prefix_len,
+ &ip4_src, &sixrd_domain_index, mtu);
+
+ return 0;
+}
+
+static clib_error_t *
+sixrd_del_domain_command_fn (vlib_main_t *vm,
+ unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ unformat_input_t _line_input, *line_input = &_line_input;
+ u32 num_m_args = 0;
+ u32 sixrd_domain_index;
+
+ /* Get a line of input. */
+ if (! unformat_user(input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(line_input, "index %d", &sixrd_domain_index))
+ num_m_args++;
+ else
+ return clib_error_return(0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+ unformat_free(line_input);
+
+ if (num_m_args != 1)
+ return clib_error_return(0, "mandatory argument(s) missing");
+
+ sixrd_delete_domain(sixrd_domain_index);
+
+ return 0;
+}
+
+static u8 *
+format_sixrd_domain (u8 *s, va_list *args)
+{
+ sixrd_domain_t *d = va_arg(*args, sixrd_domain_t *);
+ sixrd_main_t *mm = &sixrd_main;
+
+ s = format(s,
+ "[%d] ip6-pfx %U/%d ip4-pfx %U/%d ip4-src %U mtu %d",
+ d - mm->domains,
+ format_ip6_address, &d->ip6_prefix, d->ip6_prefix_len,
+ format_ip4_address, &d->ip4_prefix, d->ip4_prefix_len,
+ format_ip4_address, &d->ip4_src, d->mtu);
+
+ return s;
+}
+
+static clib_error_t *
+show_sixrd_domain_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
+{
+ sixrd_main_t *mm = &sixrd_main;
+ sixrd_domain_t *d;
+
+ if (pool_elts(mm->domains) == 0)
+ vlib_cli_output(vm, "No SIXRD domains are configured...");
+
+ pool_foreach(d, mm->domains, ({vlib_cli_output(vm, "%U", format_sixrd_domain, d);}));
+
+ return 0;
+}
+
+static clib_error_t *
+show_sixrd_stats_command_fn (vlib_main_t *vm, unformat_input_t *input, vlib_cli_command_t *cmd)
+{
+ sixrd_main_t *mm = &sixrd_main;
+ sixrd_domain_t *d;
+ int domains = 0, domaincount = 0;
+ if (pool_elts (mm->domains) == 0)
+ vlib_cli_output (vm, "No SIXRD domains are configured...");
+
+ pool_foreach(d, mm->domains, ({
+ domains += sizeof(*d);
+ domaincount++;
+ }));
+
+ vlib_cli_output(vm, "SIXRD domains structure: %d\n", sizeof (sixrd_domain_t));
+ vlib_cli_output(vm, "SIXRD domains: %d (%d bytes)\n", domaincount, domains);
+
+ return 0;
+}
+
+/*
+ * packet trace format function
+ */
+u8 *
+format_sixrd_trace (u8 *s, va_list *args)
+{
+ CLIB_UNUSED(vlib_main_t *vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED(vlib_node_t *node) = va_arg (*args, vlib_node_t *);
+ sixrd_trace_t *t = va_arg (*args, sixrd_trace_t *);
+ u32 sixrd_domain_index = t->sixrd_domain_index;
+
+ s = format(s, "SIXRD domain index: %d", sixrd_domain_index);
+
+ return s;
+}
+
+VLIB_CLI_COMMAND(sixrd_add_domain_command, static) = {
+ .path = "sixrd add domain",
+ .short_help =
+ "sixrd add domain ip6-pfx <ip6-pfx> ip4-pfx <ip4-pfx> ip4-src <ip4-addr>",
+ .function = sixrd_add_domain_command_fn,
+};
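+
+/*
+ * Example invocation (addresses are illustrative only):
+ *   sixrd add domain ip6-pfx 2001:db8::/32 ip4-pfx 192.0.2.0/24 ip4-src 203.0.113.1 mtu 1280
+ */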
+
+VLIB_CLI_COMMAND(sixrd_del_command, static) = {
+ .path = "sixrd del domain",
+ .short_help =
+ "sixrd del domain index <domain>",
+ .function = sixrd_del_domain_command_fn,
+};
+
+VLIB_CLI_COMMAND(show_sixrd_domain_command, static) = {
+ .path = "show sixrd domain",
+ .function = show_sixrd_domain_command_fn,
+};
+
+VLIB_CLI_COMMAND(show_sixrd_stats_command, static) = {
+ .path = "show sixrd stats",
+ .function = show_sixrd_stats_command_fn,
+};
+
+/*
+ * sixrd_init
+ */
+clib_error_t *sixrd_init (vlib_main_t *vm)
+{
+ sixrd_main_t *mm = &sixrd_main;
+
+ mm->vnet_main = vnet_get_main();
+ mm->vlib_main = vm;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(sixrd_init);
diff --git a/vnet/vnet/map/sixrd.h b/vnet/vnet/map/sixrd.h
new file mode 100644
index 00000000000..d741cb278b5
--- /dev/null
+++ b/vnet/vnet/map/sixrd.h
@@ -0,0 +1,144 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#include <stdbool.h>
+#include <vppinfra/error.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+
+vlib_node_registration_t ip6_sixrd_node;
+vlib_node_registration_t ip4_sixrd_node;
+
+int sixrd_create_domain(ip6_address_t *ip6_prefix, u8 ip6_prefix_len,
+ ip4_address_t *ip4_prefix, u8 ip4_prefix_len,
+ ip4_address_t *ip4_src, u32 *sixrd_domain_index, u16 mtu);
+int sixrd_delete_domain(u32 sixrd_domain_index);
+u8 *format_sixrd_trace(u8 *s, va_list *args);
+
+typedef struct {
+ ip6_address_t ip6_prefix;
+ ip4_address_t ip4_prefix;
+ ip4_address_t ip4_src;
+ u8 ip6_prefix_len;
+ u8 ip4_prefix_len;
+
+ /* helpers */
+ u8 shift;
+
+ u16 mtu;
+} sixrd_domain_t;
+
+typedef struct {
+ /* pool of SIXRD domains */
+ sixrd_domain_t *domains;
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+} sixrd_main_t;
+
+#define foreach_sixrd_error \
+ /* Must be first. */ \
+ _(NONE, "valid SIXRD packets") \
+ _(BAD_PROTOCOL, "bad protocol") \
+ _(WRONG_ICMP_TYPE, "wrong icmp type") \
+ _(SEC_CHECK, "security check failed") \
+ _(ICMP, "unable to translate ICMP") \
+ _(UNKNOWN, "unknown") \
+ _(NO_DOMAIN, "no domain") \
+ _(ENCAPSULATED, "encapsulated") \
+ _(DECAPSULATED, "decapsulated") \
+ _(TRANSLATED_4TO6, "translated 4 to 6") \
+ _(TRANSLATED_6TO4, "translated 6 to 4") \
+ _(FRAGMENT, "fragment handling error") \
+ _(FRAGMENT_QUEUED, "dropped, missing first fragment") \
+ _(FRAGMENTED, "packets requiring fragmentation") \
+ _(FRAGMENT_PARTS, "fragment parts") \
+ _(MALFORMED, "malformed packet")
+
+typedef enum {
+#define _(sym,str) SIXRD_ERROR_##sym,
+ foreach_sixrd_error
+#undef _
+ SIXRD_N_ERROR,
+ } sixrd_error_t;
+
+typedef struct {
+ u32 sixrd_domain_index;
+} sixrd_trace_t;
+
+sixrd_main_t sixrd_main;
+
+/*
+ * sixrd_get_addr
+ */
+static_always_inline u32
+sixrd_get_addr (sixrd_domain_t *d, u64 dal)
+{
+
+ /* 1:1 mode */
+ if (d->ip4_prefix_len == 32) return (d->ip4_prefix.as_u32);
+
+ /* Grab 32 - ip4_prefix_len bits out of IPv6 address from offset ip6_prefix_len */
+ return (d->ip4_prefix.as_u32 | (u32)(dal >> d->shift));
+}
+
+/*
+ * Get the SIXRD domain from an IPv6 lookup adjacency.
+ */
+static_always_inline sixrd_domain_t *
+ip6_sixrd_get_domain (u32 adj_index, u32 *sixrd_domain_index)
+{
+ sixrd_main_t *mm = &sixrd_main;
+ ip_lookup_main_t *lm = &ip6_main.lookup_main;
+ ip_adjacency_t *adj = ip_get_adjacency(lm, adj_index);
+ ASSERT(adj);
+ uword *p = (uword *)adj->rewrite_data;
+ ASSERT(p);
+ *sixrd_domain_index = p[0];
+ return pool_elt_at_index(mm->domains, p[0]);
+}
+
+/*
+ * Get the SIXRD domain from an IPv4 lookup adjacency.
+ * If the IPv4 address is not shared, no lookup is required.
+ * The IPv6 address is used otherwise.
+ */
+static_always_inline sixrd_domain_t *
+ip4_sixrd_get_domain (u32 adj_index, ip6_address_t *addr,
+ u32 *sixrd_domain_index, u8 *error)
+{
+ sixrd_main_t *mm = &sixrd_main;
+ ip6_main_t *im6 = &ip6_main;
+ ip_lookup_main_t *lm4 = &ip4_main.lookup_main;
+ ip_lookup_main_t *lm6 = &ip6_main.lookup_main;
+ ip_adjacency_t *adj = ip_get_adjacency(lm4, adj_index);
+ ASSERT(adj);
+ uword *p = (uword *)adj->rewrite_data;
+ ASSERT(p);
+ *sixrd_domain_index = p[0];
+ if (p[0] != ~0)
+ return pool_elt_at_index(mm->domains, p[0]);
+
+ u32 ai = ip6_fib_lookup_with_table(im6, 0, addr);
+ ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai);
+ if (PREDICT_TRUE(adj6->lookup_next_index == IP_LOOKUP_NEXT_SIXRD)) {
+ uword *p = (uword *)adj6->rewrite_data;
+ *sixrd_domain_index = p[0];
+ return pool_elt_at_index(mm->domains, *sixrd_domain_index);
+ }
+ *error = SIXRD_ERROR_NO_DOMAIN;
+ return NULL;
+}
diff --git a/vnet/vnet/mcast/mcast.c b/vnet/vnet/mcast/mcast.c
new file mode 100644
index 00000000000..e9177c71f2c
--- /dev/null
+++ b/vnet/vnet/mcast/mcast.c
@@ -0,0 +1,563 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/mcast/mcast.h>
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/icmp46_packet.h>
+#include <vnet/ip/ip4.h>
+
+typedef struct {
+ u32 sw_if_index;
+ u32 next_index;
+ u32 group_index;
+} mcast_prep_trace_t;
+
+/* packet trace format function */
+static u8 * format_mcast_prep_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ mcast_prep_trace_t * t = va_arg (*args, mcast_prep_trace_t *);
+
+ s = format (s, "MCAST_PREP: group %d, next index %d, tx_sw_if_index %d",
+ t->group_index, t->next_index, t->sw_if_index);
+ return s;
+}
+
+mcast_main_t mcast_main;
+vlib_node_registration_t mcast_prep_node;
+vlib_node_registration_t mcast_recycle_node;
+
+#define foreach_mcast_prep_error \
+_(MCASTS, "Multicast Packets")
+
+typedef enum {
+#define _(sym,str) MCAST_PREP_ERROR_##sym,
+ foreach_mcast_prep_error
+#undef _
+ MCAST_PREP_N_ERROR,
+} mcast_prep_error_t;
+
+static char * mcast_prep_error_strings[] = {
+#define _(sym,string) string,
+ foreach_mcast_prep_error
+#undef _
+};
+
+typedef enum {
+ MCAST_PREP_NEXT_DROP,
+ MCAST_PREP_N_NEXT,
+} mcast_prep_next_t;
+
+static uword
+mcast_prep_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ mcast_prep_next_t next_index;
+ mcast_main_t * mcm = &mcast_main;
+ vlib_node_t *n = vlib_get_node (vm, mcast_prep_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (0 && n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = 0;
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ next1 = 0;
+
+ /* $$$$ your message in this space. Process 2 x pkts */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ mcast_prep_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ mcast_prep_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0, adj_index0;
+ mcast_group_t * g0;
+ ip_adjacency_t * adj0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
+ adj0 = ip_get_adjacency (lm, adj_index0);
+ vnet_buffer(b0)->mcast.mcast_group_index = adj0->mcast_group_index;
+ g0 = pool_elt_at_index (mcm->groups, adj0->mcast_group_index);
+
+ /*
+ * Handle the degenerate single-copy case
+ * If we don't change the freelist, the packet will never
+ * make it to the recycle node...
+ */
+ if (PREDICT_TRUE(vec_len (g0->members) > 1))
+ {
+ /* Save the original free list index */
+ vnet_buffer(b0)->mcast.original_free_list_index =
+ b0->free_list_index;
+
+ /* Swap in the multicast recycle list */
+ b0->free_list_index = mcm->mcast_recycle_list_index;
+
+ /*
+ * Make sure that intermediate "frees" don't screw up
+ */
+ b0->clone_count = vec_len (g0->members);
+
+ /* Set up for the recycle node */
+ vnet_buffer(b0)->mcast.mcast_current_index = 1;
+ }
+
+ /* Transmit the pkt on the first interface */
+ next0 = g0->members[0].prep_and_recycle_node_next_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] =
+ g0->members[0].tx_sw_if_index;
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ mcast_prep_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ t->sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ t->group_index = vnet_buffer(b0)->mcast.mcast_group_index;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ em->counters[node_counter_base_index + MCAST_PREP_ERROR_MCASTS] +=
+ frame->n_vectors;
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (mcast_prep_node) = {
+ .function = mcast_prep_node_fn,
+ .name = "mcast_prep",
+ .vector_size = sizeof (u32),
+ .format_trace = format_mcast_prep_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(mcast_prep_error_strings),
+ .error_strings = mcast_prep_error_strings,
+
+ .n_next_nodes = MCAST_PREP_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [MCAST_PREP_NEXT_DROP] = "error-drop",
+ },
+};
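+
+/*
+ * Replication flow, as a descriptive sketch: mcast_prep parks the buffer
+ * on the mcast-recycle freelist, sets clone_count to the group size and
+ * transmits on the first member. Each time the TX path "frees" the
+ * buffer, the freelist callback (mcast_recycle_callback, below) fishes
+ * it back and hands it to mcast-recycle, which transmits on the next
+ * member; once clone_count reaches 1 the original freelist index is
+ * restored and the buffer is freed for real.
+ */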
+
+typedef struct {
+ u32 sw_if_index;
+ u32 next_index;
+ u32 current_member;
+ u32 group_index;
+} mcast_recycle_trace_t;
+
+static u8 * format_mcast_recycle_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ mcast_recycle_trace_t * t = va_arg (*args, mcast_recycle_trace_t *);
+
+ s = format (s,
+"MCAST_R: group %d, current member %d next (node) index %d, tx_sw_if_index %d",
+ t->group_index, t->current_member, t->next_index, t->sw_if_index);
+ return s;
+}
+
+#define foreach_mcast_recycle_error \
+_(RECYCLES, "Multicast Recycles")
+
+typedef enum {
+#define _(sym,str) MCAST_RECYCLE_ERROR_##sym,
+ foreach_mcast_recycle_error
+#undef _
+ MCAST_RECYCLE_N_ERROR,
+} mcast_recycle_error_t;
+
+static char * mcast_recycle_error_strings[] = {
+#define _(sym,string) string,
+ foreach_mcast_recycle_error
+#undef _
+};
+
+typedef enum {
+ MCAST_RECYCLE_NEXT_DROP,
+ MCAST_RECYCLE_N_NEXT,
+} mcast_recycle_next_t;
+
+static uword
+mcast_recycle_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ mcast_recycle_next_t next_index;
+ mcast_main_t * mcm = &mcast_main;
+ vlib_node_t *n = vlib_get_node (vm, mcast_recycle_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (0 && n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = 0;
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ next1 = 0;
+
+ /* $$$$ your message in this space. Process 2 x pkts */
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ mcast_recycle_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ mcast_recycle_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 current_member0;
+ mcast_group_t * g0;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ g0 = pool_elt_at_index (mcm->groups,
+ vnet_buffer(b0)->mcast.mcast_group_index);
+
+ /* No more replicas? */
+ if (b0->clone_count == 1)
+ {
+ /* Restore the original free list index */
+ b0->free_list_index =
+ vnet_buffer(b0)->mcast.original_free_list_index;
+ }
+ current_member0 = vnet_buffer(b0)->mcast.mcast_current_index;
+
+ next0 =
+ g0->members[current_member0].prep_and_recycle_node_next_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] =
+ g0->members[current_member0].tx_sw_if_index;
+
+ vnet_buffer(b0)->mcast.mcast_current_index =
+ current_member0 + 1;
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ mcast_recycle_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->next_index = next0;
+ t->sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ t->group_index = vnet_buffer(b0)->mcast.mcast_group_index;
+ t->current_member = current_member0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ em->counters[node_counter_base_index + MCAST_RECYCLE_ERROR_RECYCLES] +=
+ frame->n_vectors;
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (mcast_recycle_node) = {
+ .function = mcast_recycle_node_fn,
+ .name = "mcast-recycle",
+ .vector_size = sizeof (u32),
+ .format_trace = format_mcast_recycle_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(mcast_recycle_error_strings),
+ .error_strings = mcast_recycle_error_strings,
+
+ .n_next_nodes = MCAST_RECYCLE_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [MCAST_RECYCLE_NEXT_DROP] = "error-drop",
+ },
+};
+
+/*
+ * Fish packets back from the recycle queue/freelist and
+ * un-flatten the buffer chains before handing them to mcast-recycle.
+ */
+static void mcast_recycle_callback (vlib_main_t *vm,
+ vlib_buffer_free_list_t * fl)
+{
+ vlib_frame_t * f = 0;
+ u32 n_left_from;
+ u32 n_left_to_next = 0;
+ u32 n_this_frame = 0;
+ u32 * from;
+ u32 * to_next;
+ u32 bi0, pi0;
+ vlib_buffer_t *b0;
+ vlib_buffer_t *bnext0;
+ int i;
+
+ /* aligned, unaligned buffers */
+ for (i = 0; i < 2; i++)
+ {
+ if (i == 0)
+ {
+ from = fl->aligned_buffers;
+ n_left_from = vec_len (from);
+ }
+ else
+ {
+ from = fl->unaligned_buffers;
+ n_left_from = vec_len (from);
+ }
+
+ while (n_left_from > 0)
+ {
+ if (PREDICT_FALSE(n_left_to_next == 0))
+ {
+ if (f)
+ {
+ f->n_vectors = n_this_frame;
+ vlib_put_frame_to_node (vm, mcast_recycle_node.index, f);
+ }
+
+ f = vlib_get_frame_to_node (vm, mcast_recycle_node.index);
+ to_next = vlib_frame_vector_args (f);
+ n_left_to_next = VLIB_FRAME_SIZE;
+ n_this_frame = 0;
+ }
+
+ bi0 = from[0];
+ if (PREDICT_TRUE(n_left_from > 1))
+ {
+ pi0 = from[1];
+ vlib_prefetch_buffer_with_index(vm,pi0,LOAD);
+ }
+
+ bnext0 = b0 = vlib_get_buffer (vm, bi0);
+
+ while (bnext0->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ from += 1;
+ n_left_from -= 1;
+ bnext0 = vlib_get_buffer (vm, bnext0->next_buffer);
+ }
+ to_next[0] = bi0;
+
+ if (CLIB_DEBUG > 0)
+ vlib_buffer_set_known_state (vm, bi0, VLIB_BUFFER_KNOWN_ALLOCATED);
+
+ from++;
+ to_next++;
+ n_this_frame++;
+ n_left_to_next--;
+ n_left_from--;
+ }
+ }
+
+ vec_reset_length (fl->aligned_buffers);
+ vec_reset_length (fl->unaligned_buffers);
+
+ if (f)
+ {
+ ASSERT(n_this_frame);
+ f->n_vectors = n_this_frame;
+ vlib_put_frame_to_node (vm, mcast_recycle_node.index, f);
+ }
+}
+
+clib_error_t *mcast_init (vlib_main_t *vm)
+{
+ mcast_main_t * mcm = &mcast_main;
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ vlib_buffer_free_list_t * fl;
+
+ mcm->vlib_main = vm;
+ mcm->vnet_main = vnet_get_main();
+ mcm->mcast_recycle_list_index =
+ vlib_buffer_create_free_list (vm, 1024 /* fictional */, "mcast-recycle");
+
+ fl = pool_elt_at_index (bm->buffer_free_list_pool,
+ mcm->mcast_recycle_list_index);
+
+ fl->buffers_added_to_freelist_function = mcast_recycle_callback;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (mcast_init);
+
+
diff --git a/vnet/vnet/mcast/mcast.h b/vnet/vnet/mcast/mcast.h
new file mode 100644
index 00000000000..96e514427c6
--- /dev/null
+++ b/vnet/vnet/mcast/mcast.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_vnet_mcast_h__
+#define __included_vnet_mcast_h__
+
+#include <vnet/vnet.h>
+#include <vlib/buffer.h>
+#include <vlib/buffer_funcs.h>
+
+typedef struct {
+ /* Arrange for both prep and recycle nodes to have identical
+ next indices for a given output interface */
+ u32 prep_and_recycle_node_next_index;
+
+ /* Show command, etc. */
+ u32 tx_sw_if_index;
+} mcast_group_member_t;
+
+typedef struct {
+ /* vector of group members */
+ mcast_group_member_t * members;
+} mcast_group_t;
+
+typedef struct {
+ /* pool of multicast (interface) groups */
+ mcast_group_t * groups;
+
+ /* multicast "free" list, aka recycle list */
+ u32 mcast_recycle_list_index;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} mcast_main_t;
+
+mcast_main_t mcast_main;
+
+#endif /* __included_vnet_mcast_h__ */
diff --git a/vnet/vnet/mcast/mcast_test.c b/vnet/vnet/mcast/mcast_test.c
new file mode 100644
index 00000000000..4561d7cdc00
--- /dev/null
+++ b/vnet/vnet/mcast/mcast_test.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/mcast/mcast.h>
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/icmp46_packet.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/mcast/mcast.h>
+
+typedef struct {
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+ mcast_main_t * mcast_main;
+} mcast_test_main_t;
+
+mcast_test_main_t mcast_test_main;
+vlib_node_registration_t mcast_prep_node;
+vlib_node_registration_t mcast_recycle_node;
+
+static clib_error_t *
+mcast_test_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u8 *rewrite_data;
+ mcast_test_main_t * mtm = &mcast_test_main;
+ mcast_main_t * mcm = mtm->mcast_main;
+ ip_adjacency_t adj;
+ u32 adj_index;
+ mcast_group_t * g;
+ mcast_group_member_t * member;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t dst_addr, zero;
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ pool_get (mcm->groups, g);
+ memset (g, 0, sizeof (*g));
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ vnet_hw_interface_t *hw;
+ u32 next, sw_if_index;
+
+ if (unformat (line_input, "%U", unformat_vnet_sw_interface,
+ mtm->vnet_main, &sw_if_index))
+ {
+ vec_add2 (g->members, member, 1);
+ member->tx_sw_if_index = sw_if_index;
+
+ hw = vnet_get_sup_hw_interface (mtm->vnet_main,
+ sw_if_index);
+
+ next = vlib_node_add_next (mtm->vlib_main,
+ mcast_prep_node.index,
+ hw->output_node_index);
+
+ /* Required to be the same next index... */
+ vlib_node_add_next_with_slot (mtm->vlib_main,
+ mcast_recycle_node.index,
+ hw->output_node_index, next);
+ member->prep_and_recycle_node_next_index = next;
+ }
+ else
+ {
+ return unformat_parse_error (line_input);
+ }
+ }
+
+ if (vec_len (g->members) == 0)
+ {
+ pool_put (mcm->groups, g);
+ vlib_cli_output (vm, "no group members specified");
+ return 0;
+ }
+
+
+ adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+ adj.mcast_group_index = g - mcm->groups;
+ rewrite_data = format (0, "abcdefg");
+
+ vnet_rewrite_for_tunnel
+ (mtm->vnet_main,
+ (u32)~0, /* tx_sw_if_index, we don't know yet */
+ ip4_rewrite_node.index,
+ mcast_prep_node.index,
+ &adj.rewrite_header,
+ rewrite_data, vec_len(rewrite_data));
+
+ ip_add_adjacency (lm, &adj, 1 /* one adj */,
+ &adj_index);
+
+ dst_addr.as_u32 = clib_host_to_net_u32 (0x0a000002);
+ zero.as_u32 = 0;
+
+ ip4_add_del_route_next_hop (im,
+ IP4_ROUTE_FLAG_ADD,
+ &dst_addr,
+ 24 /* mask width */,
+ &zero /* no next hop */,
+
+ 0, // next hop sw if index
+ 1, // weight
+ adj_index,
+ 0 /* explicit fib 0 */);
+
+ return 0;
+}
+
+static VLIB_CLI_COMMAND (mcast_test_command) = {
+ .path = "test mc",
+ .short_help = "test mc",
+ .function = mcast_test_command_fn,
+};
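+
+/*
+ * Example (interface names are illustrative):
+ *   test mc GigabitEthernet2/0/0 GigabitEthernet2/0/1
+ * Each named interface becomes a member of a new multicast group.
+ */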
+
+clib_error_t *mcast_test_init (vlib_main_t *vm)
+{
+ mcast_test_main_t * mtm = &mcast_test_main;
+
+ mtm->vlib_main = vm;
+ mtm->vnet_main = vnet_get_main();
+ mtm->mcast_main = &mcast_main;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (mcast_test_init);
diff --git a/vnet/vnet/misc.c b/vnet/vnet/misc.c
new file mode 100644
index 00000000000..6effe6eae3e
--- /dev/null
+++ b/vnet/vnet/misc.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * misc.c: vnet misc
+ *
+ * Copyright (c) 2012 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+
+vnet_main_t vnet_main;
+
+vnet_main_t *
+vnet_get_main (void)
+{
+ return &vnet_main;
+}
+
+static uword
+vnet_local_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * f)
+{
+ ASSERT (0);
+ return f->n_vectors;
+}
+
+VNET_DEVICE_CLASS (vnet_local_interface_device_class,static) = {
+ .name = "local",
+ .tx_function = vnet_local_interface_tx,
+};
+
+VNET_HW_INTERFACE_CLASS (vnet_local_interface_hw_class,static) = {
+ .name = "local",
+};
+
+clib_error_t *
+vnet_main_init (vlib_main_t * vm)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ clib_error_t * error;
+ u32 hw_if_index;
+ vnet_hw_interface_t * hw;
+
+ if ((error = vlib_call_init_function (vm, vnet_interface_init)))
+ return error;
+
+ vnm->vlib_main = vm;
+
+ hw_if_index = vnet_register_interface
+ (vnm,
+ vnet_local_interface_device_class.index, /* instance */ 0,
+ vnet_local_interface_hw_class.index, /* instance */ 0);
+ hw = vnet_get_hw_interface (vnm, hw_if_index);
+
+ vnm->local_interface_hw_if_index = hw_if_index;
+ vnm->local_interface_sw_if_index = hw->sw_if_index;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (vnet_main_init);
diff --git a/vnet/vnet/mpls-gre/error.def b/vnet/vnet/mpls-gre/error.def
new file mode 100644
index 00000000000..424ab50a030
--- /dev/null
+++ b/vnet/vnet/mpls-gre/error.def
@@ -0,0 +1,28 @@
+/*
+ * mpls_error.def: mpls errors
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+mpls_error (NONE, "no error")
+mpls_error (UNKNOWN_PROTOCOL, "unknown protocol")
+mpls_error (UNSUPPORTED_VERSION, "unsupported version")
+mpls_error (PKTS_DECAP, "MPLS-GRE input packets decapsulated")
+mpls_error (PKTS_ENCAP, "MPLS-GRE output packets encapsulated")
+mpls_error (NO_LABEL, "MPLS-GRE no label for fib/dst")
+mpls_error (TTL_EXPIRED, "MPLS-GRE ttl expired")
+mpls_error (S_NOT_SET, "MPLS-GRE s-bit not set")
+mpls_error (BAD_LABEL, "invalid FIB id in label")
+mpls_error (NOT_IP4, "non-ip4 packets dropped")
+mpls_error (DISALLOWED_FIB, "disallowed FIB id")
diff --git a/vnet/vnet/mpls-gre/interface.c b/vnet/vnet/mpls-gre/interface.c
new file mode 100644
index 00000000000..c345054bdec
--- /dev/null
+++ b/vnet/vnet/mpls-gre/interface.c
@@ -0,0 +1,1930 @@
+/*
+ * interface.c: mpls interfaces
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/gre/gre.h>
+#include <vnet/mpls-gre/mpls.h>
+
+static uword mpls_gre_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ /*
+ * Conundrum: packets from tun/tap destined for the tunnel
+ * actually have this rewrite applied. Transit packets do not.
+ * To make the two cases equivalent, don't generate a
+ * rewrite here, build the entire header in the fast path.
+ */
+ return 0;
+}
+
+/* manually added to the interface output node */
+#define MPLS_GRE_OUTPUT_NEXT_POST_REWRITE 1
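+/*
+ * That is, slot 1 of the tunnel interface's output node is expected to
+ * point at mpls-post-rewrite; the code that creates the tunnel interface
+ * (not shown in this file) is responsible for adding that next index.
+ */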
+
+static uword
+mpls_gre_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ mpls_main_t * gm = &mpls_main;
+ vnet_main_t * vnm = gm->vnet_main;
+ u32 next_index;
+ u32 * from, * to_next, n_left_from, n_left_to_next;
+
+ /* Vector of buffer / pkt indices we're supposed to process */
+ from = vlib_frame_vector_args (frame);
+
+ /* Number of buffers / pkts */
+ n_left_from = frame->n_vectors;
+
+ /* Speculatively send the first buffer to the last disposition we used */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ /* set up to enqueue to our disposition with index = next_index */
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ /*
+ * As long as we have enough pkts left to process two pkts
+ * and prefetch two pkts...
+ */
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ u32 bi0, next0, bi1, next1;
+ mpls_gre_tunnel_t * t0, * t1;
+ u32 sw_if_index0, sw_if_index1;
+ vnet_hw_interface_t * hi0, * hi1;
+ u8 * dst0, * dst1;
+
+ /* Prefetch the next iteration */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ /*
+ * Prefetch packet data. We expect to overwrite
+ * the inbound L2 header with an ip header and a
+ * gre header. Might want to prefetch the last line
+ * of rewrite space as well; need profile data
+ */
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* Pick up the next two buffer indices */
+ bi0 = from[0];
+ bi1 = from[1];
+
+ /* Speculatively enqueue them where we sent the last buffer */
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index [VLIB_TX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index [VLIB_TX];
+
+ /* get h/w intfcs */
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+ hi1 = vnet_get_sup_hw_interface (vnm, sw_if_index1);
+
+ /* hw_instance = tunnel pool index */
+ t0 = pool_elt_at_index (gm->gre_tunnels, hi0->hw_instance);
+ t1 = pool_elt_at_index (gm->gre_tunnels, hi1->hw_instance);
+
+ /* Apply rewrite - $$$$$ fixme don't use memcpy */
+ vlib_buffer_advance (b0, -(word)vec_len(t0->rewrite_data));
+ vlib_buffer_advance (b1, -(word)vec_len(t1->rewrite_data));
+
+ dst0 = vlib_buffer_get_current (b0);
+ dst1 = vlib_buffer_get_current (b1);
+
+ memcpy (dst0, t0->rewrite_data, vec_len(t0->rewrite_data));
+ memcpy (dst1, t1->rewrite_data, vec_len(t1->rewrite_data));
+
+ /* Fix TX fib indices */
+ vnet_buffer(b0)->sw_if_index [VLIB_TX] = t0->outer_fib_index;
+ vnet_buffer(b1)->sw_if_index [VLIB_TX] = t1->outer_fib_index;
+
+ /* mpls-post-rewrite takes it from here... */
+ next0 = MPLS_GRE_OUTPUT_NEXT_POST_REWRITE;
+ next1 = MPLS_GRE_OUTPUT_NEXT_POST_REWRITE;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_gre_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_id = t0 - gm->gre_tunnels;
+ tr->length = b0->current_length;
+ tr->src.as_u32 = t0->tunnel_src.as_u32;
+ tr->dst.as_u32 = t0->tunnel_dst.as_u32;
+ tr->lookup_miss = 0;
+ tr->mpls_encap_index = t0->encap_index;
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_gre_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->tunnel_id = t1 - gm->gre_tunnels;
+ tr->length = b1->current_length;
+ tr->src.as_u32 = t1->tunnel_src.as_u32;
+ tr->dst.as_u32 = t1->tunnel_dst.as_u32;
+ tr->lookup_miss = 0;
+ tr->mpls_encap_index = t1->encap_index;
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * b0;
+ u32 bi0, next0;
+ mpls_gre_tunnel_t * t0;
+ u32 sw_if_index0;
+ vnet_hw_interface_t * hi0;
+ u8 * dst0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index [VLIB_TX];
+
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ t0 = pool_elt_at_index (gm->gre_tunnels, hi0->hw_instance);
+
+ /* Apply rewrite - $$$$$ fixme don't use memcpy */
+ vlib_buffer_advance (b0, -(word)vec_len(t0->rewrite_data));
+
+ dst0 = vlib_buffer_get_current (b0);
+
+ memcpy (dst0, t0->rewrite_data, vec_len(t0->rewrite_data));
+
+ /* Fix the TX fib index */
+ vnet_buffer(b0)->sw_if_index [VLIB_TX] = t0->outer_fib_index;
+
+ /* mpls-post-rewrite takes it from here... */
+ next0 = MPLS_GRE_OUTPUT_NEXT_POST_REWRITE;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_gre_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_id = t0 - gm->gre_tunnels;
+ tr->length = b0->current_length;
+ tr->src.as_u32 = t0->tunnel_src.as_u32;
+ tr->dst.as_u32 = t0->tunnel_dst.as_u32;
+ tr->lookup_miss = 0;
+ tr->mpls_encap_index = t0->encap_index;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, gre_input_node.index,
+ GRE_ERROR_PKTS_ENCAP, frame->n_vectors);
+
+ return frame->n_vectors;
+}
+
+static u8 * format_mpls_gre_tunnel_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "mpls-gre%d", dev_instance);
+}
+
+static u8 * format_mpls_gre_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ CLIB_UNUSED (int verbose) = va_arg (*args, int);
+
+ s = format (s, "MPLS-GRE tunnel: id %d\n", dev_instance);
+ return s;
+}
+
+VNET_DEVICE_CLASS (mpls_gre_device_class) = {
+ .name = "MPLS-GRE tunnel device",
+ .format_device_name = format_mpls_gre_tunnel_name,
+ .format_device = format_mpls_gre_device,
+ .format_tx_trace = format_mpls_gre_tx_trace,
+ .tx_function = mpls_gre_interface_tx,
+ .no_flatten_output_chains = 1,
+#ifdef SOON
+ .clear counter = 0;
+ .admin_up_down_function = 0;
+#endif
+};
+
+VNET_HW_INTERFACE_CLASS (mpls_gre_hw_interface_class) = {
+ .name = "MPLS-GRE",
+ .format_header = format_mpls_gre_header_with_length,
+#if 0
+ .unformat_header = unformat_mpls_gre_header,
+#endif
+ .set_rewrite = mpls_gre_set_rewrite,
+};
+
+
+static uword mpls_eth_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ /*
+ * Conundrum: packets from tun/tap destined for the tunnel
+ * actually have this rewrite applied. Transit packets do not.
+ * To make the two cases equivalent, don't generate a
+ * rewrite here, build the entire header in the fast path.
+ */
+ return 0;
+}
+
+/* manually added to the interface output node */
+#define MPLS_ETH_OUTPUT_NEXT_OUTPUT 1
+
+static uword
+mpls_eth_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ mpls_main_t * gm = &mpls_main;
+ vnet_main_t * vnm = gm->vnet_main;
+ u32 next_index;
+ u32 * from, * to_next, n_left_from, n_left_to_next;
+
+ /* Vector of buffer / pkt indices we're supposed to process */
+ from = vlib_frame_vector_args (frame);
+
+ /* Number of buffers / pkts */
+ n_left_from = frame->n_vectors;
+
+ /* Speculatively send the first buffer to the last disposition we used */
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ /* set up to enqueue to our disposition with index = next_index */
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ /*
+ * As long as we have enough pkts left to process two pkts
+ * and prefetch two pkts...
+ */
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ vlib_buffer_t * b0, * b1;
+ u32 bi0, next0, bi1, next1;
+ mpls_eth_tunnel_t * t0, * t1;
+ u32 sw_if_index0, sw_if_index1;
+ vnet_hw_interface_t * hi0, * hi1;
+ u8 * dst0, * dst1;
+
+ /* Prefetch the next iteration */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ /*
+ * Prefetch packet data. We expect to overwrite
+ * the inbound L2 header with an ip header and a
+ * gre header. Might want to prefetch the last line
+ * of rewrite space as well; need profile data
+ */
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* Pick up the next two buffer indices */
+ bi0 = from[0];
+ bi1 = from[1];
+
+ /* Speculatively enqueue them where we sent the last buffer */
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index [VLIB_TX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index [VLIB_TX];
+
+ /* get h/w intfcs */
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+ hi1 = vnet_get_sup_hw_interface (vnm, sw_if_index1);
+
+ /* hw_instance = tunnel pool index */
+ t0 = pool_elt_at_index (gm->eth_tunnels, hi0->hw_instance);
+ t1 = pool_elt_at_index (gm->eth_tunnels, hi1->hw_instance);
+
+ /* Apply rewrite - $$$$$ fixme don't use memcpy */
+ vlib_buffer_advance (b0, -(word)vec_len(t0->rewrite_data));
+ vlib_buffer_advance (b1, -(word)vec_len(t1->rewrite_data));
+
+ dst0 = vlib_buffer_get_current (b0);
+ dst1 = vlib_buffer_get_current (b1);
+
+ memcpy (dst0, t0->rewrite_data, vec_len(t0->rewrite_data));
+ memcpy (dst1, t1->rewrite_data, vec_len(t1->rewrite_data));
+
+ /* Fix TX fib indices */
+ vnet_buffer(b0)->sw_if_index [VLIB_TX] = t0->tx_sw_if_index;
+ vnet_buffer(b1)->sw_if_index [VLIB_TX] = t1->tx_sw_if_index;
+
+ /* mpls-post-rewrite takes it from here... */
+ next0 = MPLS_ETH_OUTPUT_NEXT_OUTPUT;
+ next1 = MPLS_ETH_OUTPUT_NEXT_OUTPUT;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_eth_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->lookup_miss = 0;
+ tr->tunnel_id = t0 - gm->eth_tunnels;
+ tr->tx_sw_if_index = t0->tx_sw_if_index;
+ tr->mpls_encap_index = t0->encap_index;
+ tr->length = b0->current_length;
+ hi0 = vnet_get_sup_hw_interface (vnm, t0->tx_sw_if_index);
+ memcpy (tr->dst, hi0->hw_address, sizeof (tr->dst));
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_eth_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->lookup_miss = 0;
+ tr->tunnel_id = t1 - gm->eth_tunnels;
+ tr->tx_sw_if_index = t1->tx_sw_if_index;
+ tr->mpls_encap_index = t1->encap_index;
+ tr->length = b1->current_length;
+ hi1 = vnet_get_sup_hw_interface (vnm, t1->tx_sw_if_index);
+ memcpy (tr->dst, hi1->hw_address, sizeof (tr->dst));
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ vlib_buffer_t * b0;
+ u32 bi0, next0;
+ mpls_eth_tunnel_t * t0;
+ u32 sw_if_index0;
+ vnet_hw_interface_t * hi0;
+ u8 * dst0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index [VLIB_TX];
+
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ t0 = pool_elt_at_index (gm->eth_tunnels, hi0->hw_instance);
+
+ /* Apply rewrite - $$$$$ fixme don't use memcpy */
+ vlib_buffer_advance (b0, -(word)vec_len(t0->rewrite_data));
+
+ dst0 = vlib_buffer_get_current (b0);
+
+ memcpy (dst0, t0->rewrite_data, vec_len(t0->rewrite_data));
+
+ /* Fix the TX interface */
+ vnet_buffer(b0)->sw_if_index [VLIB_TX] = t0->tx_sw_if_index;
+
+ /* Send the packet */
+ next0 = MPLS_ETH_OUTPUT_NEXT_OUTPUT;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_eth_tx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->lookup_miss = 0;
+ tr->tunnel_id = t0 - gm->eth_tunnels;
+ tr->tx_sw_if_index = t0->tx_sw_if_index;
+ tr->mpls_encap_index = t0->encap_index;
+ tr->length = b0->current_length;
+ hi0 = vnet_get_sup_hw_interface (vnm, t0->tx_sw_if_index);
+ memcpy (tr->dst, hi0->hw_address, sizeof (tr->dst));
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, mpls_input_node.index,
+ MPLS_ERROR_PKTS_ENCAP, frame->n_vectors);
+
+ return frame->n_vectors;
+}
+
+static u8 * format_mpls_eth_tunnel_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "mpls-eth%d", dev_instance);
+}
+
+static u8 * format_mpls_eth_device (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ CLIB_UNUSED (int verbose) = va_arg (*args, int);
+
+ s = format (s, "MPLS-ETH tunnel: id %d\n", dev_instance);
+ return s;
+}
+
+VNET_DEVICE_CLASS (mpls_eth_device_class) = {
+ .name = "MPLS-ETH tunnel device",
+ .format_device_name = format_mpls_eth_tunnel_name,
+ .format_device = format_mpls_eth_device,
+ .format_tx_trace = format_mpls_eth_tx_trace,
+ .tx_function = mpls_eth_interface_tx,
+ .no_flatten_output_chains = 1,
+#ifdef SOON
+ .clear counter = 0;
+ .admin_up_down_function = 0;
+#endif
+};
+
+
+VNET_HW_INTERFACE_CLASS (mpls_eth_hw_interface_class) = {
+ .name = "MPLS-ETH",
+ .format_header = format_mpls_eth_header_with_length,
+#if 0
+ .unformat_header = unformat_mpls_eth_header,
+#endif
+ .set_rewrite = mpls_eth_set_rewrite,
+};
+
+#define foreach_mpls_post_rewrite_next \
+ _ (IP4_LOOKUP, "ip4-lookup")
+
+typedef enum {
+#define _(s,n) MPLS_POST_REWRITE_NEXT_##s,
+ foreach_mpls_post_rewrite_next
+#undef _
+ MPLS_POST_REWRITE_N_NEXT,
+} mpls_post_rewrite_next_t;
+
+
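+/*
+ * The mpls-gre tunnel TX path above prepends a precomputed rewrite whose
+ * outer IPv4 length field is left as zero. This node patches in the real
+ * length, incrementally updates the IPv4 checksum, and hands the packet
+ * to ip4-lookup in the outer FIB.
+ */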
+static uword
+mpls_post_rewrite (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ u16 old_l0 = 0, old_l1 = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ ip4_header_t * ip0, * ip1;
+ u32 next0 = MPLS_POST_REWRITE_NEXT_IP4_LOOKUP;
+ u32 next1 = MPLS_POST_REWRITE_NEXT_IP4_LOOKUP;
+ u16 new_l0, new_l1;
+ ip_csum_t sum0, sum1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+ ip0 = vlib_buffer_get_current (b0);
+ ip1 = vlib_buffer_get_current (b1);
+
+ /* Note: the tunnel rewrite sets up sw_if_index[VLIB_TX] */
+
+ /* set the GRE (outer) ip packet length, fix the bloody checksum */
+ sum0 = ip0->checksum;
+ sum1 = ip1->checksum;
+
+ /* old_l0, old_l1 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+ new_l1 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip1->checksum = ip_csum_fold (sum1);
+ ip0->length = new_l0;
+ ip1->length = new_l1;
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip4_header_t * ip0;
+ u32 next0 = MPLS_POST_REWRITE_NEXT_IP4_LOOKUP;
+ u16 new_l0;
+ ip_csum_t sum0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (b0);
+
+ /* Note: the tunnel rewrite sets up sw_if_index[VLIB_TX] */
+
+ /* set the GRE (outer) ip packet length, fix the bloody checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, mpls_input_node.index,
+ MPLS_ERROR_PKTS_ENCAP, from_frame->n_vectors);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (mpls_post_rewrite_node) = {
+ .function = mpls_post_rewrite,
+ .name = "mpls-post-rewrite",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = 0,
+
+ .n_next_nodes = MPLS_POST_REWRITE_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [MPLS_POST_REWRITE_NEXT_##s] = n,
+ foreach_mpls_post_rewrite_next
+#undef _
+ },
+};
+
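+/*
+ * Build the per-tunnel rewrite string for an mpls-gre tunnel: an outer
+ * IPv4 + GRE header followed by the encap label stack found via the
+ * inner FIB. The IPv4 length (and thus the final checksum) is fixed up
+ * per-packet in mpls-post-rewrite.
+ */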
+static u8 * mpls_gre_rewrite (mpls_main_t *mm, mpls_gre_tunnel_t * t)
+{
+ ip4_header_t * ip0;
+ ip4_gre_and_mpls_header_t * h0;
+ u8 * rewrite_data = 0;
+ mpls_encap_t * e;
+ mpls_unicast_header_t *lp0;
+ int i;
+
+ /* look up the encap label stack using the RX FIB */
+ e = mpls_encap_by_fib_and_dest (mm, t->inner_fib_index, t->tunnel_dst.as_u32);
+
+ if (e == 0)
+ {
+ clib_warning ("no label for inner fib index %d, dst %U",
+ t->inner_fib_index, format_ip4_address,
+ &t->tunnel_dst);
+ return 0;
+ }
+
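+ /* Rewrite layout: outer ip4 + gre headers followed by the label
+ stack. The ip4 length is left zero here; mpls-post-rewrite sets
+ it per-packet and patches the checksum (hence old_l == 0 there). */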
+ vec_validate (rewrite_data, sizeof (*h0)
+ + sizeof (mpls_unicast_header_t) * vec_len(e->labels) -1);
+ memset (rewrite_data, 0, sizeof (*h0));
+
+ h0 = (ip4_gre_and_mpls_header_t *) rewrite_data;
+ /* Copy the encap label stack */
+ lp0 = h0->labels;
+ for (i = 0; i < vec_len(e->labels); i++)
+ lp0[i] = e->labels[i];
+ ip0 = &h0->ip4;
+ h0->gre.protocol = clib_host_to_net_u16(GRE_PROTOCOL_mpls_unicast);
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_GRE;
+ /* $$$ fixup ip4 header length and checksum after-the-fact */
+ ip0->src_address.as_u32 = t->tunnel_src.as_u32;
+ ip0->dst_address.as_u32 = t->tunnel_dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+
+ return (rewrite_data);
+}
+
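+/*
+ * Add or delete an mpls-o-gre tunnel. src/dst are the outer GRE
+ * endpoints; intfc/mask_width is the tunnel interface address which,
+ * unless l2_only is set, is installed in the inner fib as a route
+ * pointing at the tunnel adjacency. An encap label stack for
+ * (inner fib, dst) must already exist, e.g. via "mpls encap add".
+ */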
+int vnet_mpls_gre_add_del_tunnel (ip4_address_t *src,
+ ip4_address_t *dst,
+ ip4_address_t *intfc,
+ u32 mask_width,
+ u32 inner_fib_id, u32 outer_fib_id,
+ u32 * tunnel_sw_if_index,
+ u8 l2_only,
+ u8 is_add)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ mpls_main_t * mm = &mpls_main;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t zero;
+ mpls_gre_tunnel_t *tp;
+ int need_route_add_del = 1;
+ u32 inner_fib_index = 0;
+ u32 outer_fib_index = 0;
+ ip_adjacency_t adj;
+ u32 adj_index;
+ u8 * rewrite_data;
+ int found_tunnel = 0;
+ mpls_encap_t * e = 0;
+ u32 hw_if_index = ~0;
+ vnet_hw_interface_t * hi;
+ u32 slot;
+ u32 dummy;
+
+ zero.as_u32 = 0;
+
+ /* If the caller didn't ask for the sw_if_index, point at a dummy */
+ if (tunnel_sw_if_index == 0)
+ tunnel_sw_if_index = &dummy;
+
+ *tunnel_sw_if_index = ~0;
+
+ if (inner_fib_id != (u32)~0)
+ {
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, inner_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+ inner_fib_index = p[0];
+ }
+
+ if (outer_fib_id != 0)
+ {
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, outer_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+ outer_fib_index = p[0];
+ }
+
+ /* suppress duplicate mpls interface generation. */
+ pool_foreach (tp, mm->gre_tunnels,
+ ({
+ /*
+ * If we have a tunnel which matches (src, dst, intfc/mask)
+ * AND the expected route is in the FIB, it's a dup
+ */
+ if (!memcmp (&tp->tunnel_src, src, sizeof (*src))
+ && !memcmp (&tp->tunnel_dst, dst, sizeof (*dst))
+ && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
+ && tp->inner_fib_index == inner_fib_index)
+ {
+ ip4_fib_t * fib = vec_elt_at_index (im->fibs, inner_fib_index);
+ uword * hash = fib->adj_index_by_dst_address[mask_width];
+ uword key = intfc->as_u32 & im->fib_masks[mask_width];
+ uword *p = hash_get (hash, key);
+
+ found_tunnel = 1;
+
+ if (is_add)
+ {
+ /* A dup, and the route is in the fib. Done */
+ if (p || l2_only)
+ return 1;
+ else
+ {
+ /* Reinstall the route (and other stuff) */
+ e = mpls_encap_by_fib_and_dest (mm, inner_fib_index,
+ dst->as_u32);
+ if (e == 0)
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+ goto reinstall_it;
+ }
+ }
+ else
+ {
+ /* Deleting: skip the route removal if it's already gone */
+ if (!p)
+ need_route_add_del = 0;
+ goto add_del_route;
+ }
+
+ }
+ }));
+
+ /* Delete, and we can't find the tunnel */
+ if (is_add == 0 && found_tunnel == 0)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ e = mpls_encap_by_fib_and_dest (mm, inner_fib_index, dst->as_u32);
+ if (e == 0)
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+
+ pool_get(mm->gre_tunnels, tp);
+ memset (tp, 0, sizeof (*tp));
+
+ if (vec_len (mm->free_gre_sw_if_indices) > 0)
+ {
+ hw_if_index =
+ mm->free_gre_sw_if_indices[vec_len(mm->free_gre_sw_if_indices)-1];
+ _vec_len (mm->free_gre_sw_if_indices) -= 1;
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = tp - mm->gre_tunnels;
+ hi->hw_instance = tp - mm->gre_tunnels;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, mpls_gre_device_class.index, tp - mm->gre_tunnels,
+ mpls_gre_hw_interface_class.index,
+ tp - mm->gre_tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ /* ... to make the IP and L2 x-connect cases identical */
+ slot = vlib_node_add_named_next_with_slot
+ (vnm->vlib_main, hi->tx_node_index,
+ "mpls-post-rewrite", MPLS_GRE_OUTPUT_NEXT_POST_REWRITE);
+
+ ASSERT (slot == MPLS_GRE_OUTPUT_NEXT_POST_REWRITE);
+ }
+
+ *tunnel_sw_if_index = hi->sw_if_index;
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+ tp->hw_if_index = hw_if_index;
+
+ reinstall_it:
+ tp->tunnel_src.as_u32 = src->as_u32;
+ tp->tunnel_dst.as_u32 = dst->as_u32;
+ tp->intfc_address.as_u32 = intfc->as_u32;
+ tp->mask_width = mask_width;
+ tp->inner_fib_index = inner_fib_index;
+ tp->outer_fib_index = outer_fib_index;
+ tp->encap_index = e - mm->encaps;
+ tp->l2_only = l2_only;
+
+ /* Create the adjacency and add to v4 fib */
+ memset(&adj, 0, sizeof (adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+
+ rewrite_data = mpls_gre_rewrite (mm, tp);
+ if (rewrite_data == 0)
+ {
+ if (*tunnel_sw_if_index != ~0)
+ {
+ hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ 0 /* admin down */);
+ vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
+ }
+ pool_put (mm->gre_tunnels, tp);
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+ }
+
+ /* Save a copy of the rewrite data for L2 x-connect */
+ vec_free (tp->rewrite_data);
+
+ tp->rewrite_data = rewrite_data;
+
+ vnet_rewrite_for_tunnel
+ (vnm,
+ outer_fib_index /* tx_sw_if_index, aka outer fib ID */,
+ ip4_rewrite_node.index,
+ mpls_post_rewrite_node.index,
+ &adj.rewrite_header,
+ rewrite_data, vec_len(rewrite_data));
+
+ if (!l2_only)
+ ip_add_adjacency (lm, &adj, 1 /* one adj */,
+ &adj_index);
+
+ add_del_route:
+
+ if (need_route_add_del && !l2_only)
+ {
+ if (is_add)
+ ip4_add_del_route_next_hop (im,
+ IP4_ROUTE_FLAG_ADD,
+ &tp->intfc_address,
+ tp->mask_width,
+ &zero /* no next hop */,
+ (u32)~0 /* next_hop_sw_if_index */,
+ 1 /* weight */,
+ adj_index,
+ tp->inner_fib_index);
+ else
+ {
+ ip4_add_del_route_args_t a;
+ memset (&a, 0, sizeof (a));
+
+ a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
+ a.table_index_or_table_id = tp->inner_fib_index;
+ a.dst_address = tp->intfc_address;
+ a.dst_address_length = tp->mask_width;
+ a.adj_index = ~0;
+
+ ip4_add_del_route (im, &a);
+ ip4_maybe_remap_adjacencies (im, tp->inner_fib_index,
+ IP4_ROUTE_FLAG_FIB_INDEX);
+ }
+ }
+
+ if (is_add == 0 && found_tunnel)
+ {
+ hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ 0 /* admin down */);
+ vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
+ vec_free (tp->rewrite_data);
+ pool_put (mm->gre_tunnels, tp);
+ }
+
+ return 0;
+}
+
+/*
+ * Remove all mpls tunnels in the specified fib
+ */
+int vnet_mpls_gre_delete_fib_tunnels (u32 fib_id)
+{
+ ip4_main_t * im = &ip4_main;
+ mpls_main_t * mm = &mpls_main;
+ vnet_main_t * vnm = mm->vnet_main;
+ mpls_gre_tunnel_t *tp;
+ u32 fib_index = 0;
+ uword * p;
+ u32 * tunnels_to_delete = 0;
+ vnet_hw_interface_t * hi;
+ ip4_fib_t * fib;
+ int i;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+ fib_index = p[0];
+
+ pool_foreach (tp, mm->gre_tunnels,
+ ({
+ if (tp->inner_fib_index == fib_index)
+ vec_add1 (tunnels_to_delete, tp - mm->gre_tunnels);
+ }));
+
+ fib = vec_elt_at_index (im->fibs, fib_index);
+
+ for (i = 0; i < vec_len(tunnels_to_delete); i++) {
+ tp = pool_elt_at_index (mm->gre_tunnels, tunnels_to_delete[i]);
+ uword * hash = fib->adj_index_by_dst_address[tp->mask_width];
+ uword key = tp->intfc_address.as_u32 & im->fib_masks[tp->mask_width];
+ uword *p = hash_get (hash, key);
+ ip4_add_del_route_args_t a;
+
+ /* Delete the route if not already gone */
+ if (p && !tp->l2_only)
+ {
+ memset (&a, 0, sizeof (a));
+ a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
+ a.table_index_or_table_id = tp->inner_fib_index;
+ a.dst_address = tp->intfc_address;
+ a.dst_address_length = tp->mask_width;
+ a.adj_index = ~0;
+ ip4_add_del_route (im, &a);
+ ip4_maybe_remap_adjacencies (im, tp->inner_fib_index,
+ IP4_ROUTE_FLAG_FIB_INDEX);
+ }
+
+ hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ 0 /* admin down */);
+ vec_add1 (mm->free_gre_sw_if_indices, tp->hw_if_index);
+ vec_free (tp->rewrite_data);
+ pool_put (mm->gre_tunnels, tp);
+ }
+
+ vec_free(tunnels_to_delete);
+
+ return (0);
+}
+
+static clib_error_t *
+create_mpls_gre_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t src, dst, intfc;
+ int src_set = 0, dst_set = 0, intfc_set = 0;
+ u32 mask_width;
+ u32 inner_fib_id = (u32)~0;
+ u32 outer_fib_id = 0;
+ int rv;
+ u8 is_del = 0;
+ u8 l2_only = 0;
+ u32 tunnel_intfc_sw_if_index = ~0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "src %U",
+ unformat_ip4_address, &src))
+ src_set = 1;
+ else if (unformat (line_input, "dst %U",
+ unformat_ip4_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "intfc %U/%d",
+ unformat_ip4_address, &intfc, &mask_width))
+ intfc_set = 1;
+ else if (unformat (line_input, "inner-fib-id %d", &inner_fib_id))
+ ;
+ else if (unformat (line_input, "outer-fib-id %d", &outer_fib_id))
+ ;
+ else if (unformat (line_input, "del"))
+ is_del = 1;
+ else if (unformat (line_input, "l2-only"))
+ l2_only = 1;
+ else
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, line_input);
+ }
+
+ if (!src_set)
+ return clib_error_return (0, "missing: src <ip-address>");
+
+ if (!dst_set)
+ return clib_error_return (0, "missing: dst <ip-address>");
+
+ if (!intfc_set)
+ return clib_error_return (0, "missing: intfc <ip-address>/<mask-width>");
+
+ rv = vnet_mpls_gre_add_del_tunnel (&src, &dst, &intfc, mask_width,
+ inner_fib_id, outer_fib_id,
+ &tunnel_intfc_sw_if_index,
+ l2_only, !is_del);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_INNER_FIB:
+ return clib_error_return (0, "inner fib ID %d doesn't exist\n",
+ inner_fib_id);
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "outer fib ID %d doesn't exist\n",
+ outer_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "tunnel not found\n");
+
+ case VNET_API_ERROR_NO_SUCH_LABEL:
+ /*
+ * This happens when there's no MPLS label configured for the dst
+ * address; mpls_gre_rewrite already warns on that path, so don't
+ * emit a second message here.
+ */
+ break;
+
+ default:
+ return clib_error_return (0, "vnet_mpls_gre_add_del_tunnel returned %d",
+ rv);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_mpls_tunnel_command, static) = {
+ .path = "create mpls gre tunnel",
+ .short_help =
+ "create mpls gre tunnel [del] src <addr> dst <addr> intfc <addr>/<mw>",
+ .function = create_mpls_gre_tunnel_command_fn,
+};
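+
+/*
+ * Hypothetical example (addresses made up; the label stack for dst
+ * must be configured first):
+ *
+ * mpls encap add label 30 fib 0 dest 10.0.0.2
+ * create mpls gre tunnel src 10.0.0.1 dst 10.0.0.2 intfc 172.16.0.1/24
+ */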
+
+u8 * format_mpls_encap_index (u8 * s, va_list * args)
+{
+ mpls_main_t * mm = va_arg (*args, mpls_main_t *);
+ u32 entry_index = va_arg (*args, u32);
+ mpls_encap_t * e;
+ int i;
+
+ e = pool_elt_at_index (mm->encaps, entry_index);
+
+ for (i = 0; i < vec_len (e->labels); i++)
+ s = format
+ (s, "%d ", vnet_mpls_uc_get_label(clib_net_to_host_u32
+ (e->labels[i].label_exp_s_ttl)));
+
+ return s;
+}
+
+u8 * format_mpls_gre_tunnel (u8 * s, va_list * args)
+{
+ mpls_gre_tunnel_t * t = va_arg (*args, mpls_gre_tunnel_t *);
+ mpls_main_t * mm = &mpls_main;
+
+ if (t->l2_only == 0)
+ {
+ s = format (s, "[%d]: src %U, dst %U, adj %U/%d, labels %U\n",
+ t - mm->gre_tunnels,
+ format_ip4_address, &t->tunnel_src,
+ format_ip4_address, &t->tunnel_dst,
+ format_ip4_address, &t->intfc_address,
+ t->mask_width,
+ format_mpls_encap_index, mm, t->encap_index);
+
+ s = format (s, " inner fib index %d, outer fib index %d",
+ t->inner_fib_index, t->outer_fib_index);
+ }
+ else
+ {
+ s = format (s, "[%d]: src %U, dst %U, key %U, labels %U\n",
+ t - mm->gre_tunnels,
+ format_ip4_address, &t->tunnel_src,
+ format_ip4_address, &t->tunnel_dst,
+ format_ip4_address, &t->intfc_address,
+ format_mpls_encap_index, mm, t->encap_index);
+
+ s = format (s, " l2 interface %d, outer fib index %d",
+ t->hw_if_index, t->outer_fib_index);
+ }
+
+ return s;
+}
+
+u8 * format_mpls_ethernet_tunnel (u8 * s, va_list * args)
+{
+ mpls_eth_tunnel_t * t = va_arg (*args, mpls_eth_tunnel_t *);
+ mpls_main_t * mm = &mpls_main;
+
+ s = format (s, "[%d]: dst %U, adj %U/%d, labels %U\n",
+ t - mm->eth_tunnels,
+ format_ethernet_address, &t->tunnel_dst,
+ format_ip4_address, &t->intfc_address,
+ t->mask_width,
+ format_mpls_encap_index, mm, t->encap_index);
+
+
+ s = format (s, " tx on %U, rx fib index %d",
+ format_vnet_sw_if_index_name, mm->vnet_main, t->tx_sw_if_index,
+ t->inner_fib_index);
+
+ return s;
+}
+
+static clib_error_t *
+show_mpls_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ mpls_main_t * mm = &mpls_main;
+ mpls_gre_tunnel_t * gt;
+ mpls_eth_tunnel_t * et;
+
+ if (pool_elts (mm->gre_tunnels))
+ {
+ vlib_cli_output (vm, "MPLS-GRE tunnels");
+ pool_foreach (gt, mm->gre_tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_mpls_gre_tunnel, gt);
+ }));
+ }
+ else
+ vlib_cli_output (vm, "No MPLS-GRE tunnels");
+
+ if (pool_elts (mm->eth_tunnels))
+ {
+ vlib_cli_output (vm, "MPLS-Ethernet tunnels");
+ pool_foreach (et, mm->eth_tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_mpls_ethernet_tunnel, et);
+ }));
+ }
+ else
+ vlib_cli_output (vm, "No MPLS-Ethernet tunnels");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_mpls_tunnel_command, static) = {
+ .path = "show mpls tunnel",
+ .short_help = "show mpls tunnel",
+ .function = show_mpls_tunnel_command_fn,
+};
+
+/* force inclusion from application's main.c */
+clib_error_t *mpls_interface_init (vlib_main_t *vm)
+{
+ clib_error_t * error;
+
+ if ((error = vlib_call_init_function (vm, mpls_policy_encap_init)))
+ return error;
+
+ return 0;
+}
+VLIB_INIT_FUNCTION(mpls_interface_init);
+
+
+static u8 * mpls_ethernet_rewrite (mpls_main_t *mm, mpls_eth_tunnel_t * t)
+{
+ u8 * rewrite_data = 0;
+ mpls_encap_t * e;
+ mpls_unicast_header_t *lp0;
+ int i;
+
+ /* look up the encap label stack using the RX FIB and adjacency address*/
+ e = mpls_encap_by_fib_and_dest (mm, t->inner_fib_index,
+ t->intfc_address.as_u32);
+
+ if (e == 0)
+ {
+ clib_warning ("no label for inner fib index %d, dst %U",
+ t->inner_fib_index, format_ip4_address,
+ &t->intfc_address);
+ return 0;
+ }
+
+ vec_validate (rewrite_data,
+ sizeof (mpls_unicast_header_t) * vec_len(e->labels) -1);
+
+ /* Copy the encap label stack */
+ lp0 = (mpls_unicast_header_t *) rewrite_data;
+
+ for (i = 0; i < vec_len(e->labels); i++)
+ lp0[i] = e->labels[i];
+
+ return (rewrite_data);
+}
+
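+/*
+ * Add or delete an mpls-o-ethernet tunnel. dst is the next-hop MAC
+ * address and tx_sw_if_index the output interface. Note the label
+ * stack is looked up by (inner fib, intfc address), not by tunnel
+ * destination as in the GRE case.
+ */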
+int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
+ ip4_address_t *intfc,
+ u32 mask_width,
+ u32 inner_fib_id,
+ u32 tx_sw_if_index,
+ u32 * tunnel_sw_if_index,
+ u8 l2_only,
+ u8 is_add)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ mpls_main_t * mm = &mpls_main;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t zero;
+ mpls_eth_tunnel_t *tp;
+ int need_route_add_del = 1;
+ u32 inner_fib_index = 0;
+ ip_adjacency_t adj;
+ u32 adj_index;
+ u8 * rewrite_data;
+ int found_tunnel = 0;
+ mpls_encap_t * e = 0;
+ u32 hw_if_index = ~0;
+ vnet_hw_interface_t * hi;
+ u32 slot;
+ u32 dummy;
+
+ zero.as_u32 = 0;
+
+ if (tunnel_sw_if_index == 0)
+ tunnel_sw_if_index = &dummy;
+
+ *tunnel_sw_if_index = ~0;
+
+ if (inner_fib_id != (u32)~0)
+ {
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, inner_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+ inner_fib_index = p[0];
+ }
+
+ /* suppress duplicate mpls interface generation. */
+ pool_foreach (tp, mm->eth_tunnels,
+ ({
+ /*
+ * If we have a tunnel which matches (dst, intfc/mask) in the same
+ * inner fib AND the expected route is in the FIB, it's a dup
+ */
+ if (!memcmp (&tp->tunnel_dst, dst, sizeof (tp->tunnel_dst))
+ && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
+ && tp->inner_fib_index == inner_fib_index)
+ {
+ ip4_fib_t * fib = vec_elt_at_index (im->fibs, inner_fib_index);
+ uword * hash = fib->adj_index_by_dst_address[mask_width];
+ uword key = intfc->as_u32 & im->fib_masks[mask_width];
+ uword *p = hash_get (hash, key);
+
+ found_tunnel = 1;
+
+ if (is_add)
+ {
+ if (p || l2_only)
+ return 1;
+ else
+ {
+ e = mpls_encap_by_fib_and_dest (mm, inner_fib_index,
+ intfc->as_u32);
+ if (e == 0)
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+
+ goto reinstall_it;
+ }
+ }
+ else
+ {
+ /* Deleting: skip the route removal if it's already gone */
+ if (!p)
+ need_route_add_del = 0;
+ goto add_del_route;
+ }
+
+ }
+ }));
+
+ /* Delete, and we can't find the tunnel */
+ if (is_add == 0 && found_tunnel == 0)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ e = mpls_encap_by_fib_and_dest (mm, inner_fib_index, intfc->as_u32);
+ if (e == 0)
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+
+ pool_get(mm->eth_tunnels, tp);
+ memset (tp, 0, sizeof (*tp));
+
+ if (vec_len (mm->free_eth_sw_if_indices) > 0)
+ {
+ hw_if_index =
+ mm->free_eth_sw_if_indices[vec_len(mm->free_eth_sw_if_indices)-1];
+ _vec_len (mm->free_eth_sw_if_indices) -= 1;
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = tp - mm->eth_tunnels;
+ hi->hw_instance = tp - mm->eth_tunnels;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, mpls_eth_device_class.index, tp - mm->eth_tunnels,
+ mpls_eth_hw_interface_class.index,
+ tp - mm->eth_tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ /* ... to make the IP and L2 x-connect cases identical */
+ slot = vlib_node_add_named_next_with_slot
+ (vnm->vlib_main, hi->tx_node_index,
+ "interface-output", MPLS_ETH_OUTPUT_NEXT_OUTPUT);
+
+ ASSERT (slot == MPLS_ETH_OUTPUT_NEXT_OUTPUT);
+ }
+
+ *tunnel_sw_if_index = hi->sw_if_index;
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+ tp->hw_if_index = hw_if_index;
+
+ reinstall_it:
+ memcpy(tp->tunnel_dst, dst, sizeof (tp->tunnel_dst));
+ tp->intfc_address.as_u32 = intfc->as_u32;
+ tp->mask_width = mask_width;
+ tp->inner_fib_index = inner_fib_index;
+ tp->encap_index = e - mm->encaps;
+ tp->tx_sw_if_index = tx_sw_if_index;
+ tp->l2_only = l2_only;
+
+ /* Create the adjacency and add to v4 fib */
+ memset(&adj, 0, sizeof (adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE;
+
+ rewrite_data = mpls_ethernet_rewrite (mm, tp);
+ if (rewrite_data == 0)
+ {
+ if (*tunnel_sw_if_index != ~0)
+ {
+ hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ 0 /* admin down */);
+ vec_add1 (mm->free_eth_sw_if_indices, tp->hw_if_index);
+ }
+
+ pool_put (mm->eth_tunnels, tp);
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+ }
+
+ vnet_rewrite_for_sw_interface
+ (vnm,
+ VNET_L3_PACKET_TYPE_MPLS_UNICAST,
+ tx_sw_if_index,
+ ip4_rewrite_node.index,
+ tp->tunnel_dst,
+ &adj.rewrite_header,
+ sizeof (adj.rewrite_data));
+
+ /*
+ * Prepend the ethernet header we just built (carrying 0, 1 or 2
+ * VLAN tags) to the mpls label stack
+ */
+ vec_insert (rewrite_data, adj.rewrite_header.data_bytes, 0);
+ memcpy(rewrite_data,
+ vnet_rewrite_get_data_internal(&adj.rewrite_header,
+ sizeof (adj.rewrite_data)),
+ adj.rewrite_header.data_bytes);
+
+ vnet_rewrite_set_data_internal (&adj.rewrite_header,
+ sizeof(adj.rewrite_data),
+ rewrite_data,
+ vec_len(rewrite_data));
+
+ vec_free (tp->rewrite_data);
+
+ tp->rewrite_data = rewrite_data;
+
+ if (!l2_only)
+ ip_add_adjacency (lm, &adj, 1 /* one adj */,
+ &adj_index);
+
+ add_del_route:
+
+ if (need_route_add_del && !l2_only)
+ {
+ if (is_add)
+ ip4_add_del_route_next_hop (im,
+ IP4_ROUTE_FLAG_ADD,
+ &tp->intfc_address,
+ tp->mask_width,
+ &zero /* no next hop */,
+ (u32)~0 /* next_hop_sw_if_index */,
+ 1 /* weight */,
+ adj_index,
+ tp->inner_fib_index);
+ else
+ {
+ ip4_add_del_route_args_t a;
+ memset (&a, 0, sizeof (a));
+
+ a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
+ a.table_index_or_table_id = tp->inner_fib_index;
+ a.dst_address = tp->intfc_address;
+ a.dst_address_length = tp->mask_width;
+ a.adj_index = ~0;
+
+ ip4_add_del_route (im, &a);
+ ip4_maybe_remap_adjacencies (im, tp->inner_fib_index,
+ IP4_ROUTE_FLAG_FIB_INDEX);
+ }
+ }
+ if (is_add == 0 && found_tunnel)
+ {
+ hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ 0 /* admin down */);
+ vec_add1 (mm->free_eth_sw_if_indices, tp->hw_if_index);
+ vec_free (tp->rewrite_data);
+ pool_put (mm->eth_tunnels, tp);
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+create_mpls_ethernet_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t intfc;
+ int adj_set = 0;
+ u8 dst[6];
+ int dst_set = 0, intfc_set = 0;
+ u32 mask_width;
+ u32 inner_fib_id = (u32)~0;
+ int rv;
+ u8 is_del = 0;
+ u8 l2_only = 0;
+ u32 tx_sw_if_index;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "dst %U",
+ unformat_ethernet_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "adj %U/%d",
+ unformat_ip4_address, &intfc, &mask_width))
+ adj_set = 1;
+ else if (unformat (line_input, "tx-intfc %U",
+ unformat_vnet_sw_interface, vnm, &tx_sw_if_index))
+ intfc_set = 1;
+ else if (unformat (line_input, "fib-id %d", &inner_fib_id))
+ ;
+ else if (unformat (line_input, "l2-only"))
+ l2_only = 1;
+ else if (unformat (line_input, "del"))
+ is_del = 1;
+ else
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, line_input);
+ }
+
+ if (!intfc_set)
+ return clib_error_return (0, "missing tx-intfc");
+
+ if (!dst_set)
+ return clib_error_return (0, "missing: dst <ethernet-address>");
+
+ if (!adj_set)
+ return clib_error_return (0, "missing: adj <ip-address>/<mask-width>");
+
+ rv = vnet_mpls_ethernet_add_del_tunnel (dst, &intfc, mask_width,
+ inner_fib_id, tx_sw_if_index,
+ 0 /* tunnel sw_if_index */,
+ l2_only, !is_del);
+
+ switch (rv)
+ {
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "rx fib ID %d doesn't exist\n",
+ inner_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "tunnel not found\n");
+
+ case VNET_API_ERROR_NO_SUCH_LABEL:
+ /*
+ * No label stack is configured for the adjacency address in the
+ * rx fib; report which (address, fib) pair is missing.
+ */
+ return clib_error_return (0, "no label for %U in fib %d",
+ format_ip4_address, &intfc, inner_fib_id);
+
+ default:
+ break;
+ }
+ return 0;
+}
+
+
+VLIB_CLI_COMMAND (create_mpls_ethernet_tunnel_command, static) = {
+ .path = "create mpls ethernet tunnel",
+ .short_help =
+ "create mpls ethernet tunnel [del] dst <mac-addr> adj <addr>/<mw> tx-intfc <interface> [fib-id <nn>] [l2-only]",
+ .function = create_mpls_ethernet_tunnel_command_fn,
+};
+
+
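+/*
+ * Build the rewrite for a policy tunnel: the L2 header for the
+ * tunnel's tx interface followed by the label stack. The result is
+ * stored on the encap entry for the classify path rather than on an
+ * ip adjacency.
+ */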
+int vnet_mpls_policy_tunnel_add_rewrite (mpls_main_t * mm,
+ mpls_encap_t * e,
+ u32 policy_tunnel_index)
+{
+ mpls_eth_tunnel_t * t;
+ ip_adjacency_t adj;
+ u8 * rewrite_data = 0;
+ u8 * label_start;
+ mpls_unicast_header_t *lp;
+ int i;
+
+ if (pool_is_free_index (mm->eth_tunnels, policy_tunnel_index))
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ t = pool_elt_at_index (mm->eth_tunnels, policy_tunnel_index);
+
+ memset (&adj, 0, sizeof (adj));
+
+ /* Build L2 encap */
+ vnet_rewrite_for_sw_interface
+ (mm->vnet_main,
+ VNET_L3_PACKET_TYPE_MPLS_UNICAST,
+ t->tx_sw_if_index,
+ mpls_policy_encap_node.index,
+ t->tunnel_dst,
+ &adj.rewrite_header,
+ sizeof (adj.rewrite_data));
+
+ vec_validate (rewrite_data, adj.rewrite_header.data_bytes -1);
+
+ memcpy(rewrite_data,
+ vnet_rewrite_get_data_internal(&adj.rewrite_header,
+ sizeof (adj.rewrite_data)),
+ adj.rewrite_header.data_bytes);
+
+ /* Append the label stack */
+
+ vec_add2 (rewrite_data, label_start, vec_len(e->labels) * sizeof (u32));
+
+ lp = (mpls_unicast_header_t *) label_start;
+
+ for (i = 0; i < vec_len(e->labels); i++)
+ lp[i] = e->labels[i];
+
+ /* Remember the rewrite data */
+ e->rewrite = rewrite_data;
+ e->output_next_index = adj.rewrite_header.next_index;
+
+ return 0;
+}
+
+int vnet_mpls_ethernet_add_del_policy_tunnel (u8 *dst,
+ ip4_address_t *intfc,
+ u32 mask_width,
+ u32 inner_fib_id,
+ u32 tx_sw_if_index,
+ u32 * tunnel_sw_if_index,
+ u32 classify_table_index,
+ u32 * new_tunnel_index,
+ u8 l2_only,
+ u8 is_add)
+{
+ ip4_main_t * im = &ip4_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ mpls_main_t * mm = &mpls_main;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t zero;
+ mpls_eth_tunnel_t *tp;
+ int need_route_add_del = 1;
+ u32 inner_fib_index = 0;
+ ip_adjacency_t adj;
+ u32 adj_index;
+ int found_tunnel = 0;
+ mpls_encap_t * e = 0;
+ u32 hw_if_index = ~0;
+ vnet_hw_interface_t * hi;
+ u32 slot;
+ u32 dummy;
+
+ zero.as_u32 = 0;
+
+ if (tunnel_sw_if_index == 0)
+ tunnel_sw_if_index = &dummy;
+
+ *tunnel_sw_if_index = ~0;
+
+ if (inner_fib_id != (u32)~0)
+ {
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, inner_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+ inner_fib_index = p[0];
+ }
+
+ /* suppress duplicate mpls interface generation. */
+ pool_foreach (tp, mm->eth_tunnels,
+ ({
+ /*
+ * If we have a tunnel which matches (dst, intfc/mask) in the same
+ * inner fib AND the expected route is in the FIB, it's a dup
+ */
+ if (!memcmp (&tp->tunnel_dst, dst, sizeof (tp->tunnel_dst))
+ && !memcmp (&tp->intfc_address, intfc, sizeof (*intfc))
+ && tp->inner_fib_index == inner_fib_index)
+ {
+ ip4_fib_t * fib = vec_elt_at_index (im->fibs, inner_fib_index);
+ uword * hash = fib->adj_index_by_dst_address[mask_width];
+ uword key = intfc->as_u32 & im->fib_masks[mask_width];
+ uword *p = hash_get (hash, key);
+
+ found_tunnel = 1;
+
+ if (is_add)
+ {
+ if (p || l2_only)
+ return 1;
+ else
+ {
+ goto reinstall_it;
+ }
+ }
+ else
+ {
+ /* Deleting: skip the route removal if it's already gone */
+ if (!p)
+ need_route_add_del = 0;
+ goto add_del_route;
+ }
+
+ }
+ }));
+
+ /* Delete, and we can't find the tunnel */
+ if (is_add == 0 && found_tunnel == 0)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ pool_get(mm->eth_tunnels, tp);
+ memset (tp, 0, sizeof (*tp));
+
+ if (vec_len (mm->free_eth_sw_if_indices) > 0)
+ {
+ hw_if_index =
+ mm->free_eth_sw_if_indices[vec_len(mm->free_eth_sw_if_indices)-1];
+ _vec_len (mm->free_eth_sw_if_indices) -= 1;
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = tp - mm->eth_tunnels;
+ hi->hw_instance = tp - mm->eth_tunnels;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, mpls_eth_device_class.index, tp - mm->eth_tunnels,
+ mpls_eth_hw_interface_class.index,
+ tp - mm->eth_tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ /* ... to make the IP and L2 x-connect cases identical */
+ slot = vlib_node_add_named_next_with_slot
+ (vnm->vlib_main, hi->tx_node_index,
+ "interface-output", MPLS_ETH_OUTPUT_NEXT_OUTPUT);
+
+ ASSERT (slot == MPLS_ETH_OUTPUT_NEXT_OUTPUT);
+ }
+
+ *tunnel_sw_if_index = hi->sw_if_index;
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+
+ tp->hw_if_index = hw_if_index;
+
+ reinstall_it:
+ memcpy(tp->tunnel_dst, dst, sizeof (tp->tunnel_dst));
+ tp->intfc_address.as_u32 = intfc->as_u32;
+ tp->mask_width = mask_width;
+ tp->inner_fib_index = inner_fib_index;
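+ /* Note: e is never looked up on this path (it is still NULL), so
+ this index is not meaningful; policy tunnels take their label
+ stack from the classify table instead. */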
+ tp->encap_index = e - mm->encaps;
+ tp->tx_sw_if_index = tx_sw_if_index;
+ tp->l2_only = l2_only;
+
+ if (new_tunnel_index)
+ *new_tunnel_index = tp - mm->eth_tunnels;
+
+ /* Create the classify adjacency and add to v4 fib */
+ memset(&adj, 0, sizeof (adj));
+ adj.explicit_fib_index = ~0;
+ adj.lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY;
+ adj.classify_table_index = classify_table_index;
+
+ if (!l2_only)
+ ip_add_adjacency (lm, &adj, 1 /* one adj */,
+ &adj_index);
+
+ add_del_route:
+
+ if (need_route_add_del && !l2_only)
+ {
+ if (is_add)
+ ip4_add_del_route_next_hop (im,
+ IP4_ROUTE_FLAG_ADD,
+ &tp->intfc_address,
+ tp->mask_width,
+ &zero /* no next hop */,
+ (u32)~0 /* next_hop_sw_if_index */,
+ 1 /* weight */,
+ adj_index,
+ tp->inner_fib_index);
+ else
+ {
+ ip4_add_del_route_args_t a;
+ memset (&a, 0, sizeof (a));
+
+ a.flags = IP4_ROUTE_FLAG_FIB_INDEX | IP4_ROUTE_FLAG_DEL;
+ a.table_index_or_table_id = tp->inner_fib_index;
+ a.dst_address = tp->intfc_address;
+ a.dst_address_length = tp->mask_width;
+ a.adj_index = ~0;
+
+ ip4_add_del_route (im, &a);
+ ip4_maybe_remap_adjacencies (im, tp->inner_fib_index,
+ IP4_ROUTE_FLAG_FIB_INDEX);
+ }
+ }
+ if (is_add == 0 && found_tunnel)
+ {
+ hi = vnet_get_hw_interface (vnm, tp->hw_if_index);
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ 0 /* admin down */);
+ vec_add1 (mm->free_eth_sw_if_indices, tp->hw_if_index);
+ pool_put (mm->eth_tunnels, tp);
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+create_mpls_ethernet_policy_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ vnet_main_t * vnm = vnet_get_main();
+ ip4_address_t intfc;
+ int adj_set = 0;
+ u8 dst[6];
+ int dst_set = 0, intfc_set = 0;
+ u32 mask_width;
+ u32 inner_fib_id = (u32)~0;
+ u32 classify_table_index = (u32)~0;
+ u32 new_tunnel_index;
+ int rv;
+ u8 is_del = 0;
+ u8 l2_only = 0;
+ u32 tx_sw_if_index;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "dst %U",
+ unformat_ethernet_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "adj %U/%d",
+ unformat_ip4_address, &intfc, &mask_width))
+ adj_set = 1;
+ else if (unformat (line_input, "tx-intfc %U",
+ unformat_vnet_sw_interface, vnm, &tx_sw_if_index))
+ intfc_set = 1;
+ else if (unformat (line_input, "classify-table-index %d",
+ &classify_table_index))
+ ;
+ else if (unformat (line_input, "fib-id %d", &inner_fib_id))
+ ;
+ else if (unformat (line_input, "l2-only"))
+ l2_only = 1;
+ else if (unformat (line_input, "del"))
+ is_del = 1;
+ else
+ return clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, line_input);
+ }
+
+ if (classify_table_index == ~0)
+ return clib_error_return (0, "missing classify-table-index");
+
+ if (!intfc_set)
+ return clib_error_return (0, "missing tx-intfc");
+
+ if (!dst_set)
+ return clib_error_return (0, "missing: dst <ethernet-address>");
+
+ if (!adj_set)
+ return clib_error_return (0, "missing: adj <ip-address>/<mask-width>");
+
+ rv = vnet_mpls_ethernet_add_del_policy_tunnel (dst, &intfc, mask_width,
+ inner_fib_id, tx_sw_if_index,
+ 0 /* tunnel sw_if_index */,
+ classify_table_index,
+ &new_tunnel_index,
+ l2_only, !is_del);
+ switch (rv)
+ {
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "rx fib ID %d doesn't exist\n",
+ inner_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "tunnel not found\n");
+
+ case VNET_API_ERROR_NO_SUCH_LABEL:
+ /*
+ * No label stack is configured for the adjacency address in the
+ * rx fib; report which (address, fib) pair is missing.
+ */
+ return clib_error_return (0, "no label for %U in fib %d",
+ format_ip4_address, &intfc, inner_fib_id);
+
+ default:
+ break;
+ }
+
+ if (!is_del)
+ vlib_cli_output (vm, "tunnel index %d", new_tunnel_index);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_mpls_ethernet_policy_tunnel_command, static) = {
+ .path = "create mpls ethernet policy tunnel",
+ .short_help =
+ "create mpls ethernet policy tunnel [del] dst <mac-addr> adj <addr>/<mw>\n"
+ " tx-intfc <interface> classify-table-index <nn>",
+ .function = create_mpls_ethernet_policy_tunnel_command_fn,
+};
diff --git a/vnet/vnet/mpls-gre/mpls.c b/vnet/vnet/mpls-gre/mpls.c
new file mode 100644
index 00000000000..431a69b4ab0
--- /dev/null
+++ b/vnet/vnet/mpls-gre/mpls.c
@@ -0,0 +1,769 @@
+/*
+ * mpls.c: mpls
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/mpls-gre/mpls.h>
+
+mpls_main_t mpls_main;
+
+u8 * format_mpls_gre_tx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ mpls_gre_tx_trace_t * t = va_arg (*args, mpls_gre_tx_trace_t *);
+ mpls_main_t * mm = &mpls_main;
+
+ if (t->lookup_miss)
+ s = format (s, "MPLS: lookup miss");
+ else
+ {
+ s = format (s, "MPLS: tunnel %d labels %U len %d src %U dst %U",
+ t->tunnel_id,
+ format_mpls_encap_index, mm, t->mpls_encap_index,
+ clib_net_to_host_u16 (t->length),
+ format_ip4_address, &t->src.as_u8,
+ format_ip4_address, &t->dst.as_u8);
+ }
+ return s;
+}
+
+u8 * format_mpls_eth_tx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ mpls_eth_tx_trace_t * t = va_arg (*args, mpls_eth_tx_trace_t *);
+ mpls_main_t * mm = &mpls_main;
+
+ if (t->lookup_miss)
+ s = format (s, "MPLS: lookup miss");
+ else
+ {
+ s = format (s, "MPLS: tunnel %d labels %U len %d tx_sw_index %d dst %U",
+ t->tunnel_id,
+ format_mpls_encap_index, mm, t->mpls_encap_index,
+ clib_net_to_host_u16 (t->length),
+ t->tx_sw_if_index,
+ format_ethernet_address, t->dst);
+ }
+ return s;
+}
+
+u8 * format_mpls_eth_header_with_length (u8 * s, va_list * args)
+{
+ ethernet_header_t * h = va_arg (*args, ethernet_header_t *);
+ mpls_unicast_header_t * m = (mpls_unicast_header_t *)(h+1);
+ u32 max_header_bytes = va_arg (*args, u32);
+ uword header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "ethernet header truncated");
+
+ s = format
+ (s, "ETHERNET-MPLS label %d",
+ vnet_mpls_uc_get_label (clib_net_to_host_u32 (m->label_exp_s_ttl)));
+
+ return s;
+}
+
+u8 * format_mpls_gre_header_with_length (u8 * s, va_list * args)
+{
+ gre_header_t * h = va_arg (*args, gre_header_t *);
+ mpls_unicast_header_t * m = (mpls_unicast_header_t *)(h+1);
+ u32 max_header_bytes = va_arg (*args, u32);
+ uword header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "gre header truncated");
+
+ s = format
+ (s, "GRE-MPLS label %d",
+ vnet_mpls_uc_get_label (clib_net_to_host_u32 (m->label_exp_s_ttl)));
+
+ return s;
+}
+
+u8 * format_mpls_gre_header (u8 * s, va_list * args)
+{
+ gre_header_t * h = va_arg (*args, gre_header_t *);
+ return format (s, "%U", format_mpls_gre_header_with_length, h, 0);
+}
+
+uword
+unformat_mpls_gre_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ gre_header_t _g, * g = &_g;
+ mpls_unicast_header_t _h, * h = &_h;
+ u32 label, label_exp_s_ttl;
+
+ if (! unformat (input, "MPLS %d", &label))
+ return 0;
+
+ g->protocol = clib_host_to_net_u16 (GRE_PROTOCOL_mpls_unicast);
+
+ label_exp_s_ttl = (label<<12) | (1<<8) /* s-bit */ | 0xFF;
+ h->label_exp_s_ttl = clib_host_to_net_u32 (label_exp_s_ttl);
+
+ /* Add gre, mpls headers to result. */
+ {
+ void * p;
+ u32 g_n_bytes = sizeof (g[0]);
+ u32 h_n_bytes = sizeof (h[0]);
+
+ vec_add2 (*result, p, g_n_bytes);
+ memcpy (p, g, g_n_bytes);
+
+ vec_add2 (*result, p, h_n_bytes);
+ memcpy (p, h, h_n_bytes);
+ }
+
+ return 1;
+}
+
+uword
+unformat_mpls_label_net_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 label;
+
+ if (!unformat (input, "MPLS: label %d", &label))
+ return 0;
+
+ label = (label<<12) | (1<<8) /* s-bit set */ | 0xFF /* ttl */;
+
+ *result = clib_host_to_net_u32 (label);
+ return 1;
+}
+
+mpls_encap_t *
+mpls_encap_by_fib_and_dest (mpls_main_t * mm, u32 rx_fib, u32 dst_address)
+{
+ uword * p;
+ mpls_encap_t * e;
+ u64 key;
+
+ key = ((u64)rx_fib<<32) | ((u64) dst_address);
+ p = hash_get (mm->mpls_encap_by_fib_and_dest, key);
+
+ if (!p)
+ return 0;
+
+ e = pool_elt_at_index (mm->encaps, p[0]);
+ return e;
+}
+
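+/*
+ * Add or delete the label stack applied to traffic for (fib_id, dest).
+ * labels_host_byte_order is a vector of raw label values; the S bit
+ * is set on the last entry and the TTL forced to 0xff. With
+ * no_dst_hash set, the entry is not added to the dst lookup hash
+ * (used for policy tunnels, where ip4-classify selects the encap).
+ */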
+int vnet_mpls_add_del_encap (ip4_address_t *dest, u32 fib_id,
+ u32 *labels_host_byte_order,
+ u32 policy_tunnel_index,
+ int no_dst_hash, u32 * indexp, int is_add)
+{
+ mpls_main_t * mm = &mpls_main;
+ ip4_main_t * im = &ip4_main;
+ mpls_encap_t * e;
+ u32 label_net_byte_order, label_host_byte_order;
+ u32 fib_index;
+ u64 key;
+ uword *p;
+ int i;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ fib_index = p[0];
+
+ key = ((u64)fib_index<<32) | ((u64) dest->as_u32);
+
+ if (is_add)
+ {
+ pool_get (mm->encaps, e);
+ memset (e, 0, sizeof (*e));
+
+ for (i = 0; i < vec_len (labels_host_byte_order); i++)
+ {
+ mpls_unicast_header_t h;
+ label_host_byte_order = labels_host_byte_order[i];
+
+ /* Reformat label into mpls_unicast_header_t */
+ label_host_byte_order <<= 12;
+ if (i == vec_len(labels_host_byte_order) - 1)
+ label_host_byte_order |= 1<<8; /* S=1 */
+ label_host_byte_order |= 0xff; /* TTL=FF */
+ label_net_byte_order = clib_host_to_net_u32 (label_host_byte_order);
+ h.label_exp_s_ttl = label_net_byte_order;
+ vec_add1 (e->labels, h);
+ }
+ if (no_dst_hash == 0)
+ hash_set (mm->mpls_encap_by_fib_and_dest, key, e - mm->encaps);
+ if (indexp)
+ *indexp = e - mm->encaps;
+ if (policy_tunnel_index != ~0)
+ return vnet_mpls_policy_tunnel_add_rewrite (mm, e, policy_tunnel_index);
+ }
+ else
+ {
+ p = hash_get (mm->mpls_encap_by_fib_and_dest, key);
+ if (!p)
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+
+ e = pool_elt_at_index (mm->encaps, p[0]);
+
+ vec_free (e->labels);
+ vec_free (e->rewrite);
+ pool_put(mm->encaps, e);
+
+ if (no_dst_hash == 0)
+ hash_unset (mm->mpls_encap_by_fib_and_dest, key);
+ }
+ return 0;
+}
+
+static clib_error_t *
+mpls_add_encap_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 fib_id;
+ u32 *labels = 0;
+ u32 this_label;
+ ip4_address_t dest;
+ u32 policy_tunnel_index = ~0;
+ int no_dst_hash = 0;
+ int rv;
+ int fib_set = 0;
+ int dest_set = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "fib %d", &fib_id))
+ fib_set = 1;
+ else if (unformat (input, "dest %U", unformat_ip4_address, &dest))
+ dest_set = 1;
+ else if (unformat (input, "no-dst-hash"))
+ no_dst_hash = 1;
+ else if (unformat (input, "label %d", &this_label))
+ vec_add1 (labels, this_label);
+ else if (unformat (input, "policy-tunnel %d", &policy_tunnel_index))
+ ;
+ else
+ break;
+ }
+
+ if (fib_set == 0)
+ return clib_error_return (0, "fib-id missing");
+ if (dest_set == 0)
+ return clib_error_return (0, "destination IP address missing");
+ if (vec_len (labels) == 0)
+ return clib_error_return (0, "label stack missing");
+
+ rv = vnet_mpls_add_del_encap (&dest, fib_id, labels,
+ policy_tunnel_index,
+ no_dst_hash, 0 /* indexp */,
+ 1 /* is_add */);
+ vec_free (labels);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "fib id %d unknown", fib_id);
+
+ default:
+ return clib_error_return (0, "vnet_mpls_add_del_encap returned %d",
+ rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (mpls_add_encap_command, static) = {
+ .path = "mpls encap add",
+ .short_help =
+ "mpls encap add label <label> ... fib <id> dest <ip4-address>",
+ .function = mpls_add_encap_command_fn,
+};
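+
+/*
+ * Hypothetical example: push labels 30 (outermost) and 31 (bottom of
+ * stack, S=1) onto traffic for 192.168.2.2 in fib 0:
+ *
+ * mpls encap add label 30 label 31 fib 0 dest 192.168.2.2
+ */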
+
+u8 * format_mpls_unicast_header_host_byte_order (u8 * s, va_list * args)
+{
+ mpls_unicast_header_t *h = va_arg(*args, mpls_unicast_header_t *);
+ u32 label = h->label_exp_s_ttl;
+
+ s = format (s, "label %d exp %d, s %d, ttl %d",
+ vnet_mpls_uc_get_label (label),
+ vnet_mpls_uc_get_exp (label),
+ vnet_mpls_uc_get_s (label),
+ vnet_mpls_uc_get_ttl (label));
+ return s;
+}
+
+u8 * format_mpls_unicast_header_net_byte_order (u8 * s, va_list * args)
+{
+ mpls_unicast_header_t *h = va_arg(*args, mpls_unicast_header_t *);
+ mpls_unicast_header_t h_host;
+
+ h_host.label_exp_s_ttl = clib_net_to_host_u32 (h->label_exp_s_ttl);
+
+ return format (s, "%U", format_mpls_unicast_header_host_byte_order,
+ &h_host);
+}
+
+static clib_error_t *
+mpls_del_encap_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 fib_id;
+ ip4_address_t dest;
+ int rv;
+
+ if (unformat (input, "fib %d dest %U", &fib_id,
+ unformat_ip4_address, &dest))
+ {
+ rv = vnet_mpls_add_del_encap (&dest, fib_id, 0 /* labels */,
+ ~0 /* policy_tunnel_index */,
+ 0 /* no_dst_hash */,
+ 0 /* indexp */,
+ 0 /* is_add */);
+ switch (rv)
+ {
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "fib id %d unknown", fib_id);
+ case VNET_API_ERROR_NO_SUCH_LABEL:
+ return clib_error_return (0, "dest %U not in fib %d",
+ format_ip4_address, &dest, fib_id);
+ default:
+ break;
+ }
+ return 0;
+ }
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+}
+
+VLIB_CLI_COMMAND (mpls_del_encap_command, static) = {
+ .path = "mpls encap delete",
+ .short_help = "mpls encap delete fib <id> dest <ip4-address>",
+ .function = mpls_del_encap_command_fn,
+};
+
+int vnet_mpls_add_del_decap (u32 rx_fib_id,
+ u32 tx_fib_id,
+ u32 label_host_byte_order,
+ int s_bit, int next_index, int is_add)
+{
+ mpls_main_t * mm = &mpls_main;
+ ip4_main_t * im = &ip4_main;
+ mpls_decap_t * d;
+ u32 rx_fib_index, tx_fib_index_or_output_swif_index;
+ uword *p;
+ u64 key;
+
+ p = hash_get (im->fib_index_by_table_id, rx_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ rx_fib_index = p[0];
+
+ /* L3 decap => transform fib ID to fib index */
+ if (next_index == MPLS_INPUT_NEXT_IP4_INPUT)
+ {
+ p = hash_get (im->fib_index_by_table_id, tx_fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_INNER_FIB;
+
+ tx_fib_index_or_output_swif_index = p[0];
+ }
+ else
+ {
+ /* L2 decap, tx_fib_id is actually the output sw_if_index */
+ tx_fib_index_or_output_swif_index = tx_fib_id;
+ }
+
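+ /* Key layout mirrors the wire label field: rx fib index in the top
+ 32 bits, label in bits [31:12] of the bottom word, s-bit at bit 8. */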
+ key = ((u64)rx_fib_index<<32) | ((u64) (label_host_byte_order<<12))
+ | ((u64) s_bit<<8);
+
+ p = hash_get (mm->mpls_decap_by_rx_fib_and_label, key);
+
+ /* If deleting, or replacing an old entry */
+ if (is_add == 0 || p)
+ {
+ if (is_add == 0 && p == 0)
+ return VNET_API_ERROR_NO_SUCH_LABEL;
+
+ d = pool_elt_at_index (mm->decaps, p[0]);
+ hash_unset (mm->mpls_decap_by_rx_fib_and_label, key);
+ pool_put (mm->decaps, d);
+ /* Deleting, we're done... */
+ if (is_add == 0)
+ return 0;
+ }
+
+ /* add decap entry... */
+ pool_get (mm->decaps, d);
+ memset (d, 0, sizeof (*d));
+ d->tx_fib_index = tx_fib_index_or_output_swif_index;
+ d->next_index = next_index;
+
+ hash_set (mm->mpls_decap_by_rx_fib_and_label, key, d - mm->decaps);
+
+ return 0;
+}
+
+uword
+unformat_mpls_gre_input_next (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ int rv = 0;
+
+ if (unformat (input, "lookup"))
+ {
+ *result = MPLS_INPUT_NEXT_IP4_INPUT;
+ rv = 1;
+ }
+ else if (unformat (input, "output"))
+ {
+ *result = MPLS_INPUT_NEXT_L2_OUTPUT;
+ rv = 1;
+ }
+ return rv;
+}
+
+static clib_error_t *
+mpls_add_decap_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u32 rx_fib_id = 0;
+ u32 tx_fib_or_sw_if_index;
+ u32 label;
+ int s_bit = 1;
+ u32 next_index = 1; /* MPLS_INPUT_NEXT_IP4_INPUT, see mpls.h */
+ int tx_fib_id_set = 0;
+ int label_set = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "fib %d", &tx_fib_or_sw_if_index))
+ tx_fib_id_set = 1;
+ else if (unformat (input, "sw_if_index %d", &tx_fib_or_sw_if_index))
+ tx_fib_id_set = 1;
+ else if (unformat (input, "%U", unformat_vnet_sw_interface, vnm,
+ &tx_fib_or_sw_if_index))
+ tx_fib_id_set = 1;
+ else if (unformat (input, "rx-fib %d", &rx_fib_id))
+ ;
+ else if (unformat (input, "label %d", &label))
+ label_set = 1;
+ else if (unformat (input, "s-bit-clear"))
+ s_bit = 0;
+ else if (unformat (input, "next %U", unformat_mpls_gre_input_next,
+ &next_index))
+ ;
+ else
+ break;
+ }
+
+ if (tx_fib_id_set == 0)
+ return clib_error_return (0, "lookup FIB ID not set");
+ if (label_set == 0)
+ return clib_error_return (0, "missing label");
+
+ rv = vnet_mpls_add_del_decap (rx_fib_id, tx_fib_or_sw_if_index,
+ label, s_bit, next_index, 1 /* is_add */);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "no such rx fib id %d", rx_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_INNER_FIB:
+ return clib_error_return (0, "no such tx fib / swif %d",
+ tx_fib_or_sw_if_index);
+
+ default:
+ return clib_error_return (0, "vnet_mpls_add_del_decap returned %d",
+ rv);
+ }
+ return 0;
+}
+
+VLIB_CLI_COMMAND (mpls_add_decap_command, static) = {
+ .path = "mpls decap add",
+ .short_help =
+ "mpls decap add label <nn> fib <id> [rx-fib <id>] [s-bit-clear] [next <lookup|output>]",
+ .function = mpls_add_decap_command_fn,
+};
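+
+/*
+ * Hypothetical example: terminate label 30 arriving in rx fib 0 and
+ * continue with an ip4 lookup in fib 1:
+ *
+ * mpls decap add rx-fib 0 fib 1 label 30
+ */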
+
+static clib_error_t *
+mpls_del_decap_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u32 rx_fib_id = 0;
+ u32 tx_fib_id = 0;
+ u32 label;
+ int s_bit = 1;
+ int label_set = 0;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "rx-fib %d", &rx_fib_id))
+ ;
+ else if (unformat (input, "label %d", &label))
+ label_set = 1;
+ else if (unformat (input, "s-bit-clear"))
+ s_bit = 0;
+ }
+
+ if (!label_set)
+ return clib_error_return (0, "label not set");
+
+ rv = vnet_mpls_add_del_decap (rx_fib_id,
+ tx_fib_id /* not interesting */,
+ label, s_bit,
+ 0 /* next_index not interesting */,
+ 0 /* is_add */);
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case VNET_API_ERROR_NO_SUCH_FIB:
+ return clib_error_return (0, "no such rx fib id %d", rx_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_INNER_FIB:
+ return clib_error_return (0, "no such lookup fib id %d", tx_fib_id);
+
+ case VNET_API_ERROR_NO_SUCH_LABEL:
+ return clib_error_return (0, "no such label %d rx fib id %d",
+ label, rx_fib_id);
+
+ default:
+ return clib_error_return (0, "vnet_mpls_add_del_decap returned %d",
+ rv);
+ }
+ return 0;
+}
+
+
+VLIB_CLI_COMMAND (mpls_del_decap_command, static) = {
+ .path = "mpls decap delete",
+ .short_help = "mpls decap delete label <label> rx-fib <id> [s-bit-clear]",
+ .function = mpls_del_decap_command_fn,
+};
+
+typedef struct {
+ u32 fib_index;
+ u32 entry_index;
+ u32 dest;
+ u32 s_bit;
+ u32 label;
+} show_mpls_fib_t;
+
+static clib_error_t *
+show_mpls_fib_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u64 key;
+ u32 value;
+ show_mpls_fib_t *records = 0;
+ show_mpls_fib_t *s;
+ mpls_main_t * mm = &mpls_main;
+ ip4_main_t * im = &ip4_main;
+ ip4_fib_t * rx_fib, * tx_fib;
+ u32 tx_table_id;
+ char *swif_tag;
+
+ hash_foreach (key, value, mm->mpls_encap_by_fib_and_dest,
+ ({
+ vec_add2 (records, s, 1);
+ s->fib_index = (u32)(key>>32);
+ s->dest = (u32)(key & 0xFFFFFFFF);
+ s->entry_index = (u32) value;
+ }));
+
+ if (!vec_len(records))
+ {
+ vlib_cli_output (vm, "MPLS encap table empty");
+ goto decap_table;
+ }
+ /* sort output by dst address within fib */
+ vec_sort (records, r0, r1, clib_net_to_host_u32(r0->dest) -
+ clib_net_to_host_u32(r1->dest));
+ vec_sort (records, r0, r1, r0->fib_index - r1->fib_index);
+ vlib_cli_output (vm, "MPLS encap table");
+ vlib_cli_output (vm, "%=6s%=16s%=16s", "Table", "Dest address", "Labels");
+ vec_foreach (s, records)
+ {
+ rx_fib = vec_elt_at_index (im->fibs, s->fib_index);
+ vlib_cli_output (vm, "%=6d%=16U%=16U", rx_fib->table_id,
+ format_ip4_address, &s->dest,
+ format_mpls_encap_index, mm, s->entry_index);
+ }
+
+ decap_table:
+ vec_reset_length(records);
+
+ hash_foreach (key, value, mm->mpls_decap_by_rx_fib_and_label,
+ ({
+ vec_add2 (records, s, 1);
+ s->fib_index = (u32)(key>>32);
+ s->entry_index = (u32) value;
+ s->label = ((u32) key)>>12;
+ s->s_bit = (key & (1<<8)) != 0;
+ }));
+
+ if (!vec_len(records))
+ {
+ vlib_cli_output (vm, "MPLS decap table empty");
+ goto out;
+ }
+
+ vec_sort (records, r0, r1, r0->label - r1->label);
+
+ vlib_cli_output (vm, "MPLS decap table");
+ vlib_cli_output (vm, "%=10s%=15s%=6s%=6s", "RX Table", "TX Table/Intfc",
+ "Label", "S-bit");
+ vec_foreach (s, records)
+ {
+ mpls_decap_t * d;
+ d = pool_elt_at_index (mm->decaps, s->entry_index);
+ if (d->next_index == MPLS_INPUT_NEXT_IP4_INPUT)
+ {
+ tx_fib = vec_elt_at_index (im->fibs, d->tx_fib_index);
+ tx_table_id = tx_fib->table_id;
+ swif_tag = " ";
+ }
+ else
+ {
+ tx_table_id = d->tx_fib_index;
+ swif_tag = "(i) ";
+ }
+ rx_fib = vec_elt_at_index (im->fibs, s->fib_index);
+
+ vlib_cli_output (vm, "%=10d%=10d%=5s%=6d%=6d", rx_fib->table_id,
+ tx_table_id, swif_tag, s->label, s->s_bit);
+ }
+
+ out:
+ vec_free(records);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_mpls_fib_command, static) = {
+ .path = "show mpls fib",
+ .short_help = "show mpls fib",
+ .function = show_mpls_fib_command_fn,
+};
+
+int mpls_fib_reset_labels (u32 fib_id)
+{
+ u64 key;
+ u32 value;
+ show_mpls_fib_t *records = 0;
+ show_mpls_fib_t *s;
+ mpls_main_t * mm = &mpls_main;
+ ip4_main_t * im = &ip4_main;
+ u32 fib_index;
+ uword *p;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (! p)
+ return VNET_API_ERROR_NO_SUCH_FIB;
+
+ fib_index = p[0];
+
+ hash_foreach (key, value, mm->mpls_encap_by_fib_and_dest,
+ ({
+ if (fib_index == (u32)(key>>32)) {
+ vec_add2 (records, s, 1);
+ s->dest = (u32)(key & 0xFFFFFFFF);
+ s->entry_index = (u32) value;
+ }
+ }));
+
+ vec_foreach (s, records)
+ {
+ key = ((u64)fib_index<<32) | ((u64) s->dest);
+ hash_unset (mm->mpls_encap_by_fib_and_dest, key);
+ pool_put_index (mm->encaps, s->entry_index);
+ }
+
+ vec_reset_length(records);
+
+ hash_foreach (key, value, mm->mpls_decap_by_rx_fib_and_label,
+ ({
+ if (fib_index == (u32) (key>>32)) {
+ vec_add2 (records, s, 1);
+ s->entry_index = value;
+ s->fib_index = fib_index;
+ s->s_bit = key & (1<<8);
+ s->dest = (u32)((key & 0xFFFFFFFF)>>12);
+ }
+ }));
+
+ vec_foreach (s, records)
+ {
+ key = ((u64)fib_index <<32) | ((u64)(s->dest<<12)) |
+ ((u64)s->s_bit);
+
+ hash_unset (mm->mpls_decap_by_rx_fib_and_label, key);
+ pool_put_index (mm->decaps, s->entry_index);
+ }
+
+ vec_free(records);
+ return 0;
+}
+
+static clib_error_t * mpls_init (vlib_main_t * vm)
+{
+ mpls_main_t * mm = &mpls_main;
+ clib_error_t * error;
+
+ memset (mm, 0, sizeof (mm[0]));
+ mm->vlib_main = vm;
+ mm->vnet_main = vnet_get_main();
+
+ if ((error = vlib_call_init_function (vm, ip_main_init)))
+ return error;
+
+ mm->mpls_encap_by_fib_and_dest = hash_create (0, sizeof (uword));
+ mm->mpls_decap_by_rx_fib_and_label = hash_create (0, sizeof (uword));
+
+ return vlib_call_init_function (vm, mpls_input_init);
+}
+
+VLIB_INIT_FUNCTION (mpls_init);
+
+mpls_main_t * mpls_get_main (vlib_main_t * vm)
+{
+ vlib_call_init_function (vm, mpls_init);
+ return &mpls_main;
+}
+
diff --git a/vnet/vnet/mpls-gre/mpls.h b/vnet/vnet/mpls-gre/mpls.h
new file mode 100644
index 00000000000..5d7f9c5e219
--- /dev/null
+++ b/vnet/vnet/mpls-gre/mpls.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_mpls_gre_h
+#define included_vnet_mpls_gre_h
+
+#include <vnet/vnet.h>
+#include <vnet/gre/gre.h>
+#include <vnet/mpls-gre/packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ethernet/ethernet.h>
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4; /* 20 bytes */
+ gre_header_t gre; /* 4 bytes */
+ mpls_unicast_header_t labels[0]; /* 4 bytes each */
+}) ip4_gre_and_mpls_header_t;
+
+vnet_hw_interface_class_t mpls_gre_hw_interface_class;
+
+typedef enum {
+#define mpls_error(n,s) MPLS_ERROR_##n,
+#include <vnet/mpls-gre/error.def>
+#undef mpls_error
+ MPLS_N_ERROR,
+} mpls_gre_error_t;
+
+/*
+ * No protocol info: MPLS labels don't have a next-header field;
+ * the label itself implies the payload type.
+ */
+
+typedef struct {
+ ip4_address_t tunnel_src;
+ ip4_address_t tunnel_dst;
+ ip4_address_t intfc_address;
+ u32 mask_width;
+ u32 inner_fib_index;
+ u32 outer_fib_index;
+ u32 encap_index;
+ u32 hw_if_index; /* L2 x-connect capable tunnel intfc */
+ u8 * rewrite_data;
+ u8 l2_only;
+} mpls_gre_tunnel_t;
+
+typedef struct {
+ u8 tunnel_dst[6];
+ ip4_address_t intfc_address;
+ u32 tx_sw_if_index;
+ u32 inner_fib_index;
+ u32 mask_width;
+ u32 encap_index;
+ u32 hw_if_index;
+ u8 * rewrite_data;
+ u8 l2_only;
+} mpls_eth_tunnel_t;
+
+typedef struct {
+ mpls_unicast_header_t *labels;
+ /* only for policy tunnels */
+ u8 * rewrite;
+ u32 output_next_index;
+} mpls_encap_t;
+
+typedef struct {
+ u32 tx_fib_index;
+ u32 next_index; /* e.g. ip4/6-input, l2-input */
+} mpls_decap_t;
+
+typedef struct {
+ /* pool of gre tunnel instances */
+ mpls_gre_tunnel_t *gre_tunnels;
+ u32 * free_gre_sw_if_indices;
+
+ /* pool of ethernet tunnel instances */
+ mpls_eth_tunnel_t *eth_tunnels;
+ u32 * free_eth_sw_if_indices;
+
+ /* Encap side: map (fib, dst_address) to mpls label stack */
+ mpls_encap_t * encaps;
+ uword * mpls_encap_by_fib_and_dest;
+
+ /* Decap side: map rx label to FIB */
+ mpls_decap_t * decaps;
+ uword * mpls_decap_by_rx_fib_and_label;
+
+ /* mpls-o-e policy tunnel next index for ip4-classify */
+ u32 ip_classify_mpls_policy_encap_next_index;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} mpls_main_t;
+
+mpls_main_t mpls_main;
+
+format_function_t format_mpls_protocol;
+format_function_t format_mpls_header;
+format_function_t format_mpls_header_with_length;
+format_function_t format_mpls_gre_header_with_length;
+format_function_t format_mpls_eth_header_with_length;
+format_function_t format_mpls_unicast_label;
+format_function_t format_mpls_encap_index;
+
+vlib_node_registration_t mpls_input_node;
+vlib_node_registration_t mpls_policy_encap_node;
+
+vnet_device_class_t mpls_gre_device_class;
+
+/* Parse mpls protocol as 0xXXXX or protocol name.
+ In either host or network byte order. */
+unformat_function_t unformat_mpls_protocol_host_byte_order;
+unformat_function_t unformat_mpls_protocol_net_byte_order;
+unformat_function_t unformat_mpls_label_net_byte_order;
+unformat_function_t unformat_mpls_gre_header;
+unformat_function_t unformat_pg_mpls_gre_header;
+
+/* Parse mpls header. */
+unformat_function_t unformat_mpls_header;
+unformat_function_t unformat_pg_mpls_header;
+
+/* manually added to the interface output node in mpls.c */
+#define MPLS_GRE_OUTPUT_NEXT_LOOKUP 1
+#define MPLS_GRE_OUTPUT_NEXT_DROP VNET_INTERFACE_TX_NEXT_DROP
+
+mpls_encap_t *
+mpls_encap_by_fib_and_dest (mpls_main_t * mm, u32 rx_fib, u32 dst_address);
+
+int mpls_label_from_fib_id_and_dest (mpls_main_t *gm, u32 fib_id,
+ u32 dst_address, u32 *labelp);
+
+int vnet_mpls_gre_add_del_tunnel (ip4_address_t *src,
+ ip4_address_t *dst,
+ ip4_address_t *intfc,
+ u32 mask_width,
+ u32 inner_fib_id, u32 outer_fib_id,
+ u32 * tunnel_intfc_sw_if_index,
+ u8 l2_only,
+ u8 is_add);
+
+int vnet_mpls_ethernet_add_del_tunnel (u8 *dst,
+ ip4_address_t *intfc,
+ u32 mask_width,
+ u32 inner_fib_id,
+ u32 tx_sw_if_index,
+ u32 * tunnel_sw_if_index,
+ u8 l2_only,
+ u8 is_add);
+
+int vnet_mpls_gre_delete_fib_tunnels (u32 fib_id);
+
+int mpls_fib_reset_labels (u32 fib_id);
+
+int vnet_mpls_add_del_decap (u32 rx_fib_id,
+ u32 tx_fib_id,
+ u32 label_host_byte_order,
+ int s_bit, int next_index, int is_add);
+
+int vnet_mpls_add_del_encap (ip4_address_t *dest, u32 fib_id,
+ u32 *labels_host_byte_order,
+ u32 policy_tunnel_index,
+ int no_dst_hash, u32 * indexp, int is_add);
+
+int vnet_mpls_policy_tunnel_add_rewrite (mpls_main_t * mm,
+ mpls_encap_t * e,
+ u32 policy_tunnel_index);
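+
+/*
+ * Illustrative usage sketch (not part of the original patch): creating
+ * an MPLS-o-GRE tunnel via the API declared above. The addresses, fib
+ * ids and mask width below are assumptions, not values from this patch.
+ */
+static inline int
+example_create_mpls_gre_tunnel (u32 * sw_if_indexp)
+{
+ ip4_address_t src, dst, intfc;
+
+ src.as_u32 = clib_host_to_net_u32 (0x0a000001); /* 10.0.0.1 */
+ dst.as_u32 = clib_host_to_net_u32 (0x0a000002); /* 10.0.0.2 */
+ intfc.as_u32 = clib_host_to_net_u32 (0xc0a80101); /* 192.168.1.1/24 */
+
+ return vnet_mpls_gre_add_del_tunnel (&src, &dst, &intfc,
+ 24 /* mask_width */,
+ 0 /* inner_fib_id */, 0 /* outer_fib_id */,
+ sw_if_indexp,
+ 0 /* l2_only */, 1 /* is_add */);
+}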
+typedef struct {
+ u32 lookup_miss;
+
+ /* Tunnel-id / index in tunnel vector */
+ u32 tunnel_id;
+
+ /* mpls encap index */
+ u32 mpls_encap_index;
+
+ /* pkt length */
+ u32 length;
+
+ /* tunnel ip4 addresses */
+ ip4_address_t src;
+ ip4_address_t dst;
+} mpls_gre_tx_trace_t;
+
+u8 * format_mpls_gre_tx_trace (u8 * s, va_list * args);
+u8 * format_mpls_gre_header (u8 * s, va_list * args);
+
+#define foreach_mpls_input_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(L2_OUTPUT, "l2-output")
+
+typedef enum {
+#define _(s,n) MPLS_INPUT_NEXT_##s,
+ foreach_mpls_input_next
+#undef _
+ MPLS_INPUT_N_NEXT,
+} mpls_input_next_t;
+
+
+typedef struct {
+ u32 lookup_miss;
+
+ /* Tunnel-id / index in tunnel vector */
+ u32 tunnel_id;
+
+ /* output interface */
+ u32 tx_sw_if_index;
+
+ /* mpls encap index */
+ u32 mpls_encap_index;
+
+ /* pkt length */
+ u32 length;
+
+ u8 dst[6];
+} mpls_eth_tx_trace_t;
+
+u8 * format_mpls_eth_tx_trace (u8 * s, va_list * args);
+
+#endif /* included_vnet_mpls_gre_h */
diff --git a/vnet/vnet/mpls-gre/node.c b/vnet/vnet/mpls-gre/node.c
new file mode 100644
index 00000000000..6bf5f814aec
--- /dev/null
+++ b/vnet/vnet/mpls-gre/node.c
@@ -0,0 +1,359 @@
+/*
+ * node.c: mpls-o-gre decap processing
+ *
+ * Copyright (c) 2012-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/mpls-gre/mpls.h>
+
+typedef struct {
+ u32 next_index;
+ u32 decap_index;
+ u32 tx_fib_index;
+ u32 label_host_byte_order;
+} mpls_rx_trace_t;
+
+u8 * format_mpls_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ mpls_rx_trace_t * t = va_arg (*args, mpls_rx_trace_t *);
+ char * next_name;
+
+ next_name = "BUG!";
+
+#define _(a,b) if (t->next_index == MPLS_INPUT_NEXT_##a) next_name = b;
+ foreach_mpls_input_next;
+#undef _
+
+ s = format (s, "MPLS: next %s, lookup fib index %d, decap index %d\n",
+ next_name, t->next_index, t->tx_fib_index, t->decap_index);
+ if (t->decap_index != ~0)
+ {
+ s = format (s, " label %d",
+ vnet_mpls_uc_get_label(t->label_host_byte_order));
+ }
+ return s;
+}
+
+vlib_node_registration_t mpls_input_node;
+
+typedef struct {
+ u32 last_label;
+ u32 last_inner_fib_index;
+ u32 last_outer_fib_index;
+ mpls_main_t * mpls_main;
+} mpls_input_runtime_t;
+
+static inline uword
+mpls_input_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame, int is_mpls_o_gre)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ ip4_main_t * im = &ip4_main;
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+ mpls_input_runtime_t * rt;
+ mpls_main_t * mm;
+
+ rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
+ mm = rt->mpls_main;
+ /*
+ * Force an initial lookup every time, in case the control-plane
+ * changed the label->FIB mapping.
+ */
+ rt->last_label = ~0;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ mpls_unicast_header_t * h0, * h1;
+ int li0, li1;
+ u64 key0, key1;
+ u32 label0, label1;
+ u32 next0, next1;
+ uword * p0, * p1;
+ u32 fib_index0, fib_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* $$$$$ dual loop me */
+
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_buffer_advance (b1, sizeof (*h1));
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ mpls_unicast_header_t * h0;
+ u32 label0;
+ u32 next0;
+ u64 key0;
+ uword * p0;
+ u32 rx_fib_index0;
+ mpls_decap_t *d0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ if (is_mpls_o_gre)
+ {
+ rx_fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+ vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+ }
+ else
+ {
+#if 0
+ /* If separate RX numbering spaces are required... */
+ rx_fib_index0 = vec_elt (mm->fib_index_by_sw_if_index,
+ vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+#endif
+ rx_fib_index0 = 0;
+ }
+
+ next0 = ~0;
+ d0 = 0;
+
+ /*
+ * Expect the control-plane team to squeal like pigs.
+ * If they don't program a decap label entry for each
+ * and every label in the stack, packets go into the trash...
+ */
+
+ do
+ {
+ label0 = clib_net_to_host_u32 (h0->label_exp_s_ttl);
+ /* TTL expired? */
+ if (PREDICT_FALSE(vnet_mpls_uc_get_ttl (label0) == 0))
+ {
+ next0 = MPLS_INPUT_NEXT_DROP;
+ b0->error = node->errors[MPLS_ERROR_TTL_EXPIRED];
+ break;
+ }
+
+ key0 = ((u64)rx_fib_index0<<32)
+ | ((u64)vnet_mpls_uc_get_label (label0)<<12)
+ | ((u64)vnet_mpls_uc_get_s (label0)<<8);
+
+ /*
+ * The architecture crew claims that we won't need
+ * separate ip4, ip6, mpls-o-ethernet label numbering
+ * spaces. Use the low 8 key bits as a discriminator.
+ */
+
+ p0 = hash_get (mm->mpls_decap_by_rx_fib_and_label, key0);
+ if (p0 == 0)
+ {
+ next0 = MPLS_INPUT_NEXT_DROP;
+ b0->error = node->errors[MPLS_ERROR_BAD_LABEL];
+ break;
+ }
+ d0 = pool_elt_at_index (mm->decaps, p0[0]);
+ next0 = d0->next_index;
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = d0->tx_fib_index;
+ vlib_buffer_advance (b0, sizeof (*h0));
+ h0 = vlib_buffer_get_current (b0);
+ } while (!vnet_mpls_uc_get_s(label0));
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->decap_index = d0 ? d0 - mm->decaps : ~0;
+ tr->tx_fib_index = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ tr->label_host_byte_order = label0;
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, mpls_input_node.index,
+ MPLS_ERROR_PKTS_DECAP, from_frame->n_vectors);
+ return from_frame->n_vectors;
+}
+
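+/*
+ * Illustrative sketch (not part of the original patch): programming a
+ * decap entry whose hash key matches the (rx fib, label, s-bit) layout
+ * built in mpls_input_inline() above. All argument values below are
+ * assumptions.
+ */
+static inline int
+example_program_decap_entry (void)
+{
+ /* label 30 with s-bit set, rx fib 0, ip4 lookup in tx fib 0 */
+ return vnet_mpls_add_del_decap (0 /* rx_fib_id */, 0 /* tx_fib_id */,
+ 30 /* label, host byte order */,
+ 1 /* s_bit */,
+ MPLS_INPUT_NEXT_IP4_INPUT /* next */,
+ 1 /* is_add */);
+}
+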
+static uword
+mpls_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return mpls_input_inline (vm, node, from_frame, 1 /* is mpls-o-gre */);
+}
+
+static char * mpls_error_strings[] = {
+#define mpls_error(n,s) s,
+#include "error.def"
+#undef mpls_error
+};
+
+VLIB_REGISTER_NODE (mpls_input_node) = {
+ .function = mpls_input,
+ .name = "mpls-gre-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof(mpls_input_runtime_t),
+
+ .n_errors = MPLS_N_ERROR,
+ .error_strings = mpls_error_strings,
+
+ .n_next_nodes = MPLS_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [MPLS_INPUT_NEXT_##s] = n,
+ foreach_mpls_input_next
+#undef _
+ },
+
+ .format_buffer = format_mpls_gre_header_with_length,
+ .format_trace = format_mpls_rx_trace,
+ .unformat_buffer = unformat_mpls_gre_header,
+};
+
+static uword
+mpls_ethernet_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ return mpls_input_inline (vm, node, from_frame, 0 /* is mpls-o-gre */);
+}
+
+
+VLIB_REGISTER_NODE (mpls_ethernet_input_node) = {
+ .function = mpls_ethernet_input,
+ .name = "mpls-ethernet-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof(mpls_input_runtime_t),
+
+ .n_errors = MPLS_N_ERROR,
+ .error_strings = mpls_error_strings,
+
+ .n_next_nodes = MPLS_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [MPLS_INPUT_NEXT_##s] = n,
+ foreach_mpls_input_next
+#undef _
+ },
+
+ .format_buffer = format_mpls_eth_header_with_length,
+ .format_trace = format_mpls_rx_trace,
+ .unformat_buffer = unformat_mpls_gre_header,
+};
+
+static void
+mpls_setup_nodes (vlib_main_t * vm)
+{
+ vlib_node_t * n = vlib_get_node (vm, mpls_input_node.index);
+ pg_node_t * pn = pg_get_node (mpls_input_node.index);
+ mpls_input_runtime_t * rt;
+
+ n->format_buffer = format_mpls_gre_header_with_length;
+ n->unformat_buffer = unformat_mpls_gre_header;
+ pn->unformat_edit = unformat_pg_mpls_header;
+
+ rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
+ rt->last_label = (u32) ~0;
+ rt->last_inner_fib_index = 0;
+ rt->last_outer_fib_index = 0;
+ rt->mpls_main = &mpls_main;
+
+ n = vlib_get_node (vm, mpls_ethernet_input_node.index);
+
+ n->format_buffer = format_mpls_eth_header_with_length;
+
+ n->unformat_buffer = 0; /* unformat_mpls_ethernet_header; */
+
+ rt = vlib_node_get_runtime_data (vm, mpls_ethernet_input_node.index);
+ rt->last_label = (u32) ~0;
+ rt->last_inner_fib_index = 0;
+ rt->last_outer_fib_index = 0;
+ rt->mpls_main = &mpls_main;
+
+ ethernet_register_input_type (vm, ETHERNET_TYPE_MPLS_UNICAST,
+ mpls_ethernet_input_node.index);
+}
+
+static clib_error_t * mpls_input_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+
+ error = vlib_call_init_function (vm, mpls_init);
+ if (error)
+ clib_error_report (error);
+
+ mpls_setup_nodes (vm);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (mpls_input_init);
diff --git a/vnet/vnet/mpls-gre/packet.h b/vnet/vnet/mpls-gre/packet.h
new file mode 100644
index 00000000000..baa01818f09
--- /dev/null
+++ b/vnet/vnet/mpls-gre/packet.h
@@ -0,0 +1,49 @@
+#ifndef included_vnet_mpls_packet_h
+#define included_vnet_mpls_packet_h
+
+/*
+ * MPLS packet format
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+typedef struct {
+ /* Label: top 20 bits [in network byte order] */
+ /* Experimental: 3 bits ... */
+ /* S (bottom of label stack): 1 bit */
+ /* TTL: 8 bits */
+ u32 label_exp_s_ttl;
+} mpls_unicast_header_t;
+
+static inline u32 vnet_mpls_uc_get_label (u32 label_exp_s_ttl)
+{
+ return (label_exp_s_ttl>>12);
+}
+
+static inline u32 vnet_mpls_uc_get_exp (u32 label_exp_s_ttl)
+{
+ return ((label_exp_s_ttl>>9) & 0x7);
+}
+
+static inline u32 vnet_mpls_uc_get_s (u32 label_exp_s_ttl)
+{
+ return ((label_exp_s_ttl>>8) & 0x1);
+}
+
+static inline u32 vnet_mpls_uc_get_ttl (u32 label_exp_s_ttl)
+{
+ return (label_exp_s_ttl & 0xff);
+}
+
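+/*
+ * Illustrative inverse helper (not part of the original patch): compose
+ * a host byte order label word from its fields, per the getters above.
+ * Callers would still need clib_host_to_net_u32() before writing the
+ * result to the wire.
+ */
+static inline u32 vnet_mpls_uc_make (u32 label, u32 exp, u32 s, u32 ttl)
+{
+ return ((label & 0xfffff) << 12) | ((exp & 0x7) << 9)
+ | ((s & 0x1) << 8) | (ttl & 0xff);
+}
+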
+#endif /* included_vnet_mpls_packet_h */
diff --git a/vnet/vnet/mpls-gre/pg.c b/vnet/vnet/mpls-gre/pg.c
new file mode 100644
index 00000000000..6b6a1017c58
--- /dev/null
+++ b/vnet/vnet/mpls-gre/pg.c
@@ -0,0 +1,71 @@
+/*
+ * pg.c: packet generator mpls/gre interface
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/gre/gre.h>
+#include <vnet/mpls-gre/mpls.h>
+
+typedef struct {
+ pg_edit_t label;
+} pg_mpls_header_t;
+
+static inline void
+pg_mpls_header_init (pg_mpls_header_t * e)
+{
+ pg_edit_init (&e->label, mpls_unicast_header_t, label_exp_s_ttl);
+}
+
+uword
+unformat_pg_mpls_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_mpls_header_t * h;
+ vlib_main_t * vm = vlib_get_main();
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (mpls_unicast_header_t),
+ &group_index);
+ pg_mpls_header_init (h);
+
+ error = 1;
+ if (! unformat (input, "%U",
+ unformat_pg_edit,
+ unformat_mpls_label_net_byte_order, &h->label))
+ goto done;
+
+ {
+ pg_node_t * pg_node = 0;
+ vlib_node_t * ip_lookup_node;
+
+ ip_lookup_node = vlib_get_node_by_name (vm, (u8 *)"ip4-input");
+ ASSERT (ip_lookup_node);
+
+ pg_node = pg_get_node (ip_lookup_node->index);
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/mpls-gre/policy_encap.c b/vnet/vnet/mpls-gre/policy_encap.c
new file mode 100644
index 00000000000..53411515e69
--- /dev/null
+++ b/vnet/vnet/mpls-gre/policy_encap.c
@@ -0,0 +1,172 @@
+/*
+ * policy_encap.c: mpls-o-e policy encap
+ *
+ * Copyright (c) 2012-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/mpls-gre/mpls.h>
+
+typedef struct {
+ u32 next_index;
+ u32 encap_index;
+} mpls_policy_encap_trace_t;
+
+u8 * format_mpls_policy_encap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ mpls_policy_encap_trace_t * t = va_arg (*args, mpls_policy_encap_trace_t *);
+
+ s = format (s, "MPLS-POLICY-ENCAP: next-index %d encap-index %d",
+ t->next_index, t->encap_index);
+
+ return s;
+}
+
+vlib_node_registration_t mpls_policy_encap_node;
+
+#define foreach_mpls_policy_encap_next \
+_(DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) MPLS_POLICY_ENCAP_NEXT_##s,
+ foreach_mpls_policy_encap_next
+#undef _
+ MPLS_POLICY_ENCAP_N_NEXT,
+} mpls_policy_encap_next_t;
+
+#define foreach_mpls_policy_error \
+_(PKTS_ENCAP, "mpls policy tunnel packets encapsulated")
+
+typedef enum {
+#define _(n,s) MPLS_POLICY_ENCAP_ERROR_##n,
+ foreach_mpls_policy_error
+#undef _
+ MPLS_POLICY_ENCAP_N_ERROR,
+} mpls_policy_encap_error_t;
+
+static char * mpls_policy_encap_error_strings[] =
+ {
+#define _(n,s) s,
+ foreach_mpls_policy_error
+#undef _
+};
+
+static uword
+mpls_policy_encap (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ mpls_main_t * mm = &mpls_main;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u8 * h0;
+ u32 encap_index0;
+ u32 next0;
+ mpls_encap_t * e0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ encap_index0 = vnet_buffer(b0)->l2_classify.opaque_index;
+
+ e0 = pool_elt_at_index (mm->encaps, encap_index0);
+
+ vlib_buffer_advance (b0, -(word)vec_len(e0->rewrite));
+ h0 = vlib_buffer_get_current (b0);
+ memcpy (h0, e0->rewrite, vec_len(e0->rewrite));
+
+ next0 = e0->output_next_index;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ mpls_policy_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->encap_index = encap_index0;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, mpls_policy_encap_node.index,
+ MPLS_POLICY_ENCAP_ERROR_PKTS_ENCAP,
+ from_frame->n_vectors);
+ return from_frame->n_vectors;
+}
+
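+/*
+ * Note (added commentary, not in the original patch): packets arrive
+ * here from ip4-classify (see mpls_policy_encap_init() below); the
+ * classifier session's opaque_index is expected to carry the
+ * mpls_encap_t pool index consumed above.
+ */
+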
+VLIB_REGISTER_NODE (mpls_policy_encap_node) = {
+ .function = mpls_policy_encap,
+ .name = "mpls-policy-encap",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = 0,
+
+ .n_errors = MPLS_POLICY_ENCAP_N_ERROR,
+ .error_strings = mpls_policy_encap_error_strings,
+
+ .format_trace = format_mpls_policy_encap_trace,
+
+ .n_next_nodes = MPLS_POLICY_ENCAP_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [MPLS_POLICY_ENCAP_NEXT_##s] = n,
+ foreach_mpls_policy_encap_next
+#undef _
+ },
+};
+
+static clib_error_t *
+mpls_policy_encap_init (vlib_main_t * vm)
+{
+ mpls_main_t * mm = &mpls_main;
+ clib_error_t * error;
+
+ if ((error = vlib_call_init_function (vm, mpls_init)))
+ return error;
+
+ mm->ip_classify_mpls_policy_encap_next_index =
+ vlib_node_add_next (mm->vlib_main,
+ ip4_classify_node.index,
+ mpls_policy_encap_node.index);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (mpls_policy_encap_init);
diff --git a/vnet/vnet/nsh-gre/decap.c b/vnet/vnet/nsh-gre/decap.c
new file mode 100644
index 00000000000..9ef52ffb6de
--- /dev/null
+++ b/vnet/vnet/nsh-gre/decap.c
@@ -0,0 +1,365 @@
+/*
+ * nsh.c: nsh packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/nsh-gre/nsh_gre.h>
+#include <vnet/nsh-gre/nsh_gre_packet.h>
+
+vlib_node_registration_t nsh_input_node;
+
+typedef struct {
+ u32 next_index;
+ u32 tunnel_index;
+ u32 error;
+ nsh_header_t h;
+} nsh_rx_trace_t;
+
+
+u8 * format_nsh_header_with_length (u8 * s, va_list * args)
+{
+ nsh_header_t * h = va_arg (*args, nsh_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ u32 tmp, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "gre-nsh header truncated");
+
+ s = format (s, "ver %d ", h->ver_o_c>>6);
+
+ if (h->ver_o_c & NSH_GRE_O_BIT)
+ s = format (s, "O-set ");
+
+ if (h->ver_o_c & NSH_GRE_C_BIT)
+ s = format (s, "C-set ");
+
+ s = format (s, "len %d (%d bytes) md_type %d next_protocol %d\n",
+ h->length, h->length * 4, h->md_type, h->next_protocol);
+
+ tmp = clib_net_to_host_u32 (h->spi_si);
+
+ s = format (s, " spi %d si %d ",
+ (tmp>>NSH_GRE_SPI_SHIFT) & NSH_GRE_SPI_MASK,
+ tmp & NSH_GRE_SINDEX_MASK);
+
+ s = format (s, "c1 %u c2 %u c3 %u c4 %u",
+ clib_net_to_host_u32 (h->c1),
+ clib_net_to_host_u32 (h->c2),
+ clib_net_to_host_u32 (h->c3),
+ clib_net_to_host_u32 (h->c4));
+
+ return s;
+}
+
+
+u8 * format_nsh_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ nsh_rx_trace_t * t = va_arg (*args, nsh_rx_trace_t *);
+
+ if (t->tunnel_index != ~0)
+ {
+ s = format (s, "NSH: tunnel %d next %d error %d", t->tunnel_index,
+ t->next_index, t->error);
+ }
+ else
+ {
+ s = format (s, "NSH: no tunnel next %d error %d\n", t->next_index,
+ t->error);
+ }
+ s = format (s, "\n %U", format_nsh_header_with_length, &t->h,
+ (u32) sizeof (t->h) /* max size */);
+ return s;
+}
+
+static uword
+nsh_gre_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+ u32 last_tunnel_index = ~0;
+ u64 last_key = ~0ULL;
+ u32 pkts_decapsulated = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ nsh_header_t * h0, * h1;
+ uword * p0, * p1;
+ u32 tunnel_index0, tunnel_index1;
+ nsh_gre_tunnel_t * t0, * t1;
+ u64 key0, key1;
+ u32 error0, error1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = vlib_buffer_get_current (b0);
+ h1 = vlib_buffer_get_current (b1);
+
+ /* gre stashed the src ip4 address for us... */
+ key0 = (((u64)(vnet_buffer(b0)->gre.src))<<32) | h0->spi_si;
+ key1 = (((u64)(vnet_buffer(b1)->gre.src))<<32) | h1->spi_si;
+
+ /* "pop" nsh header */
+ vlib_buffer_advance (b0, sizeof (*h0));
+ vlib_buffer_advance (b1, sizeof (*h1));
+
+ tunnel_index0 = ~0;
+ tunnel_index1 = ~0;
+ error0 = 0;
+ error1 = 0;
+ next0 = NSH_INPUT_NEXT_DROP;
+ next1 = NSH_INPUT_NEXT_DROP;
+
+ if (PREDICT_FALSE(key0 != last_key))
+ {
+ p0 = hash_get (ngm->nsh_gre_tunnel_by_src_address, key0);
+
+ if (p0 == 0)
+ {
+ error0 = NSH_GRE_ERROR_NO_SUCH_TUNNEL;
+ goto trace0;
+ }
+
+ last_key = key0;
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (ngm->tunnels, tunnel_index0);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ next0 = t0->decap_next_index;
+
+ /* ip[46] lookup in the configured FIB, otherwise an opaque */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->decap_fib_index;
+
+ trace0:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->h = h0[0];
+ }
+
+ if (PREDICT_FALSE(key1 != last_key))
+ {
+ p1 = hash_get (ngm->nsh_gre_tunnel_by_src_address, key1);
+
+ if (p1 == 0)
+ {
+ error1 = NSH_GRE_ERROR_NO_SUCH_TUNNEL;
+ goto trace1;
+ }
+
+ last_key = key1;
+ tunnel_index1 = last_tunnel_index = p1[0];
+ }
+ else
+ tunnel_index1 = last_tunnel_index;
+
+ t1 = pool_elt_at_index (ngm->tunnels, tunnel_index1);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b1);
+
+ next1 = t1->decap_next_index;
+
+ /* ip[46] lookup in the configured FIB, otherwise an opaque */
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = t1->decap_fib_index;
+
+ pkts_decapsulated += 2;
+
+ trace1:
+ b1->error = error1 ? node->errors[error1] : 0;
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->next_index = next1;
+ tr->error = error1;
+ tr->tunnel_index = tunnel_index1;
+ tr->h = h1[0];
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ nsh_header_t * h0;
+ uword * p0;
+ u32 tunnel_index0;
+ nsh_gre_tunnel_t * t0;
+ u64 key0;
+ u32 error0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ h0 = vlib_buffer_get_current (b0);
+
+ /* gre stashed the src ip4 address for us... */
+ key0 = (((u64)(vnet_buffer(b0)->gre.src))<<32) | h0->spi_si;
+
+ /* "pop" nsh header */
+ vlib_buffer_advance (b0, sizeof (*h0));
+
+ tunnel_index0 = ~0;
+ error0 = 0;
+ next0 = NSH_INPUT_NEXT_DROP;
+
+ if (PREDICT_FALSE(key0 != last_key))
+ {
+ p0 = hash_get (ngm->nsh_gre_tunnel_by_src_address, key0);
+
+ if (p0 == 0)
+ {
+ error0 = NSH_GRE_ERROR_NO_SUCH_TUNNEL;
+ goto trace00;
+ }
+
+ last_key = key0;
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (ngm->tunnels, tunnel_index0);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ next0 = t0->decap_next_index;
+
+ /* ip[46] lookup in the configured FIB, otherwise an opaque */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->decap_fib_index;
+ pkts_decapsulated ++;
+
+ trace00:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_rx_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->h = h0[0];
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, nsh_gre_input_node.index,
+ NSH_GRE_ERROR_DECAPSULATED,
+ pkts_decapsulated);
+ return from_frame->n_vectors;
+}
+
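+/*
+ * Note (added commentary, not in the original patch): the 64-bit lookup
+ * key above pairs the outer GRE source address with the wire-format
+ * (network byte order) spi_si word; vnet_nsh_gre_add_del_tunnel() in
+ * nsh_gre.c builds its hash key the same way, converting the host-order
+ * spi_si it is given with clib_host_to_net_u32().
+ */
+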
+static char * nsh_error_strings[] = {
+#define nsh_gre_error(n,s) s,
+#include <vnet/nsh-gre/nsh_gre_error.def>
+#undef nsh_gre_error
+};
+
+VLIB_REGISTER_NODE (nsh_gre_input_node) = {
+ .function = nsh_gre_input,
+ .name = "nsh-gre-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = NSH_GRE_N_ERROR,
+ .error_strings = nsh_error_strings,
+
+ .n_next_nodes = NSH_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [NSH_INPUT_NEXT_##s] = n,
+ foreach_nsh_gre_input_next
+#undef _
+ },
+
+ .format_buffer = format_nsh_header_with_length,
+ .format_trace = format_nsh_rx_trace,
+ // $$$$ .unformat_buffer = unformat_nsh_gre_header,
+};
diff --git a/vnet/vnet/nsh-gre/encap.c b/vnet/vnet/nsh-gre/encap.c
new file mode 100644
index 00000000000..875e8311098
--- /dev/null
+++ b/vnet/vnet/nsh-gre/encap.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/nsh-gre/nsh_gre.h>
+
+/* Statistics (not really errors) */
+#define foreach_nsh_gre_encap_error \
+_(ENCAPSULATED, "good packets encapsulated")
+
+static char * nsh_gre_encap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_nsh_gre_encap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) NSH_GRE_ENCAP_ERROR_##sym,
+ foreach_nsh_gre_encap_error
+#undef _
+ NSH_GRE_ENCAP_N_ERROR,
+} nsh_gre_encap_error_t;
+
+typedef enum {
+ NSH_GRE_ENCAP_NEXT_IP4_LOOKUP,
+ NSH_GRE_ENCAP_NEXT_DROP,
+ NSH_GRE_ENCAP_N_NEXT,
+} nsh_gre_encap_next_t;
+
+typedef struct {
+ u32 tunnel_index;
+} nsh_gre_encap_trace_t;
+
+u8 * format_nsh_gre_encap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ nsh_gre_encap_trace_t * t = va_arg (*args, nsh_gre_encap_trace_t *);
+
+ s = format (s, "NSH-GRE-ENCAP: tunnel %d", t->tunnel_index);
+ return s;
+}
+
+static uword
+nsh_gre_encap (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+ vnet_main_t * vnm = ngm->vnet_main;
+ u32 pkts_encapsulated = 0;
+ u16 old_l0 = 0, old_l1 = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0 = NSH_GRE_ENCAP_NEXT_IP4_LOOKUP;
+ u32 next1 = NSH_GRE_ENCAP_NEXT_IP4_LOOKUP;
+ vnet_hw_interface_t * hi0, * hi1;
+ ip4_header_t * ip0, * ip1;
+ u64 * copy_src0, * copy_dst0;
+ u64 * copy_src1, * copy_dst1;
+ nsh_gre_tunnel_t * t0, * t1;
+ u16 new_l0, new_l1;
+ ip_csum_t sum0, sum1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ hi0 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b0)->sw_if_index[VLIB_TX]);
+ hi1 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b1)->sw_if_index[VLIB_TX]);
+
+ t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
+ t1 = pool_elt_at_index (ngm->tunnels, hi1->dev_instance);
+
+ ASSERT(vec_len(t0->rewrite) >= 24);
+ ASSERT(vec_len(t1->rewrite) >= 24);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+ vlib_buffer_advance (b1, -(word)_vec_len(t1->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ ip1 = vlib_buffer_get_current(b1);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+ copy_dst1 = (u64 *) ip1;
+ copy_src1 = (u64 *) t1->rewrite;
+
+ copy_dst0[0] = copy_src0[0];
+ copy_dst0[1] = copy_src0[1];
+ copy_dst0[2] = copy_src0[2];
+
+ copy_dst1[0] = copy_src1[0];
+ copy_dst1[1] = copy_src1[1];
+ copy_dst1[2] = copy_src1[2];
+
+ /* If there are TLVs to copy, do so */
+ if (PREDICT_FALSE (_vec_len(t0->rewrite) > 24))
+ memcpy (&copy_dst0[3], t0->rewrite + 24 ,
+ _vec_len (t0->rewrite)-24);
+
+ if (PREDICT_FALSE (_vec_len(t1->rewrite) > 24))
+ memcpy (&copy_dst1[3], t1->rewrite + 24 ,
+ _vec_len (t1->rewrite)-24);
+
+ /* fix the <bleep>ing outer-IP checksums */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ sum1 = ip1->checksum;
+ /* old_l1 always 0, see the rewrite setup */
+ new_l1 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
+
+ sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t,
+ length /* changed member */);
+ ip1->checksum = ip_csum_fold (sum1);
+ ip1->length = new_l1;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = t1->encap_fib_index;
+ pkts_encapsulated += 2;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_gre_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - ngm->tunnels;
+ }
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_gre_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b1, sizeof (*tr));
+ tr->tunnel_index = t1 - ngm->tunnels;
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = NSH_GRE_ENCAP_NEXT_IP4_LOOKUP;
+ vnet_hw_interface_t * hi0;
+ ip4_header_t * ip0;
+ u64 * copy_src0, * copy_dst0;
+ nsh_gre_tunnel_t * t0;
+ u16 new_l0;
+ ip_csum_t sum0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* 1-wide cache? */
+ hi0 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b0)->sw_if_index[VLIB_TX]);
+
+ t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
+
+ ASSERT(vec_len(t0->rewrite) >= 24);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+ copy_dst0[0] = copy_src0[0];
+ copy_dst0[1] = copy_src0[1];
+ copy_dst0[2] = copy_src0[2];
+
+ /* If there are TLVs to copy, do so */
+ if (PREDICT_FALSE (_vec_len(t0->rewrite) > 24))
+ memcpy (&copy_dst0[3], t0->rewrite + 24 ,
+ _vec_len (t0->rewrite)-24);
+
+ /* fix the <bleep>ing outer-IP checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ pkts_encapsulated ++;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_gre_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - ngm->tunnels;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, node->node_index,
+ NSH_GRE_ENCAP_ERROR_ENCAPSULATED,
+ pkts_encapsulated);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (nsh_gre_encap_node) = {
+ .function = nsh_gre_encap,
+ .name = "nsh-gre-encap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_nsh_gre_encap_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(nsh_gre_encap_error_strings),
+ .error_strings = nsh_gre_encap_error_strings,
+
+ .n_next_nodes = NSH_GRE_ENCAP_N_NEXT,
+
+ // add dispositions here
+ .next_nodes = {
+ [NSH_GRE_ENCAP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [NSH_GRE_ENCAP_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/nsh-gre/nsh_gre.c b/vnet/vnet/nsh-gre/nsh_gre.c
new file mode 100644
index 00000000000..f85e71a2d43
--- /dev/null
+++ b/vnet/vnet/nsh-gre/nsh_gre.c
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/nsh-gre/nsh_gre.h>
+
+nsh_gre_main_t nsh_gre_main;
+
+static u8 * format_decap_next (u8 * s, va_list * args)
+{
+ u32 next_index = va_arg (*args, u32);
+
+ switch (next_index)
+ {
+ case NSH_INPUT_NEXT_DROP:
+ return format (s, "drop");
+ case NSH_INPUT_NEXT_IP4_INPUT:
+ return format (s, "ip4");
+ case NSH_INPUT_NEXT_IP6_INPUT:
+ return format (s, "ip6");
+ default:
+ return format (s, "index %d", next_index);
+ }
+ return s;
+}
+
+
+u8 * format_nsh_gre_tunnel (u8 * s, va_list * args)
+{
+ nsh_gre_tunnel_t * t = va_arg (*args, nsh_gre_tunnel_t *);
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+
+ s = format (s, "[%d] %U (src) %U (dst) fibs: (encap %d, decap %d)",
+ t - ngm->tunnels,
+ format_ip4_address, &t->src,
+ format_ip4_address, &t->dst,
+ t->encap_fib_index,
+ t->decap_fib_index);
+
+ s = format (s, " decap-next %U\n", format_decap_next, t->decap_next_index);
+
+ s = format (s, " ver %d ", (t->ver_o_c>>6));
+ if (t->ver_o_c & NSH_GRE_O_BIT)
+ s = format (s, "O-set ");
+
+ if (t->ver_o_c & NSH_GRE_C_BIT)
+ s = format (s, "C-set ");
+
+ s = format (s, "len %d (%d bytes) md_type %d next_protocol %d\n",
+ t->length, t->length * 4, t->md_type, t->next_protocol);
+
+ s = format (s, " service path %d service index %d\n",
+ (t->spi_si>>NSH_GRE_SPI_SHIFT) & NSH_GRE_SPI_MASK,
+ t->spi_si & NSH_GRE_SINDEX_MASK);
+
+ s = format (s, " c1 %d c2 %d c3 %d c4 %d\n",
+ t->c1, t->c2, t->c3, t->c4);
+
+ return s;
+}
+
+static u8 * format_nsh_gre_name (u8 * s, va_list * args)
+{
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+
+ if (i < vec_len (ngm->dev_inst_by_real))
+ show_dev_instance = ngm->dev_inst_by_real[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ return format (s, "nsh_gre_tunnel%d", i);
+}
+
+static int nsh_gre_name_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+
+ vec_validate_init_empty (ngm->dev_inst_by_real, hi->dev_instance, ~0);
+
+ ngm->dev_inst_by_real [hi->dev_instance] = new_dev_instance;
+
+ return 0;
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (nsh_gre_device_class,static) = {
+ .name = "NSH_GRE",
+ .format_device_name = format_nsh_gre_name,
+ .format_tx_trace = format_nsh_gre_encap_trace,
+ .tx_function = dummy_interface_tx,
+ .name_renumber = nsh_gre_name_renumber,
+};
+
+static uword dummy_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ return 0;
+}
+
+static u8 * format_nsh_gre_header_with_length (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ s = format (s, "unimplemented dev %u", dev_instance);
+ return s;
+}
+
+VNET_HW_INTERFACE_CLASS (nsh_gre_hw_class) = {
+ .name = "NSH_GRE",
+ .format_header = format_nsh_gre_header_with_length,
+ .set_rewrite = dummy_set_rewrite,
+};
+
+#define foreach_copy_field \
+_(src.as_u32) \
+_(dst.as_u32) \
+_(encap_fib_index) \
+_(decap_fib_index) \
+_(decap_next_index) \
+_(ver_o_c) \
+_(length) \
+_(md_type) \
+_(next_protocol) \
+_(spi_si) \
+_(c1) \
+_(c2) \
+_(c3) \
+_(c4) \
+_(tlvs)
+
+#define foreach_32bit_field \
+_(spi_si) \
+_(c1) \
+_(c2) \
+_(c3) \
+_(c4)
+
+static int nsh_gre_rewrite (nsh_gre_tunnel_t * t)
+{
+ u8 *rw = 0;
+ ip4_header_t * ip0;
+ nsh_header_t * nsh0;
+ ip4_gre_and_nsh_header_t * h0;
+ int len;
+
+ len = sizeof (*h0) + vec_len(t->tlvs)*4;
+
+ vec_validate_aligned (rw, len-1, CLIB_CACHE_LINE_BYTES);
+
+ h0 = (ip4_gre_and_nsh_header_t *) rw;
+
+ /* Fixed portion of the (outer) ip4 header */
+ ip0 = &h0->ip4;
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_GRE;
+ /* we fix up the ip4 header length and checksum after-the-fact */
+ ip0->src_address.as_u32 = t->src.as_u32;
+ ip0->dst_address.as_u32 = t->dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+
+ /* GRE header, zero except for the NSH ethertype */
+ h0->gre.protocol = clib_host_to_net_u16(GRE_PROTOCOL_nsh);
+
+ /* NSH header */
+ nsh0 = &h0->nsh;
+ nsh0->ver_o_c = t->ver_o_c;
+ nsh0->md_type = t->md_type;
+ nsh0->next_protocol = t->next_protocol;
+ nsh0->spi_si = t->spi_si;
+ nsh0->c1 = t->c1;
+ nsh0->c2 = t->c2;
+ nsh0->c3 = t->c3;
+ nsh0->c4 = t->c4;
+
+ /* Endian swap 32-bit fields */
+#define _(x) nsh0->x = clib_host_to_net_u32(nsh0->x);
+ foreach_32bit_field;
+#undef _
+
+ /* fix nsh header length */
+ t->length = 6 + vec_len(t->tlvs);
+ nsh0->length = t->length;
+
+ /* Copy any TLVs */
+ if (vec_len(t->tlvs))
+ memcpy (nsh0->tlvs, t->tlvs, 4*vec_len(t->tlvs));
+
+ t->rewrite = rw;
+ return (0);
+}
+
+int vnet_nsh_gre_add_del_tunnel (vnet_nsh_gre_add_del_tunnel_args_t *a,
+ u32 * sw_if_indexp)
+{
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+ nsh_gre_tunnel_t *t = 0;
+ vnet_main_t * vnm = ngm->vnet_main;
+ vnet_hw_interface_t * hi;
+ uword * p;
+ u32 hw_if_index = ~0;
+ u32 sw_if_index = ~0;
+ int rv;
+ u64 key;
+ u32 spi_si_net_byte_order;
+
+ spi_si_net_byte_order = clib_host_to_net_u32(a->spi_si);
+
+ key = (((u64)(a->src.as_u32))<<32) | spi_si_net_byte_order;
+
+ p = hash_get (ngm->nsh_gre_tunnel_by_src_address, key);
+
+ if (a->is_add)
+ {
+ /* adding a tunnel: tunnel must not already exist */
+ if (p)
+ return VNET_API_ERROR_INVALID_VALUE;
+
+ if (a->decap_next_index >= NSH_INPUT_N_NEXT)
+ return VNET_API_ERROR_INVALID_DECAP_NEXT;
+
+ pool_get_aligned (ngm->tunnels, t, CLIB_CACHE_LINE_BYTES);
+ memset (t, 0, sizeof (*t));
+
+ /* copy from arg structure */
+#define _(x) t->x = a->x;
+ foreach_copy_field;
+#undef _
+
+ rv = nsh_gre_rewrite (t);
+
+ if (rv)
+ {
+ pool_put (ngm->tunnels, t);
+ return rv;
+ }
+
+ hash_set (ngm->nsh_gre_tunnel_by_src_address, key, t - ngm->tunnels);
+
+ if (vec_len (ngm->free_nsh_gre_tunnel_hw_if_indices) > 0)
+ {
+ hw_if_index = ngm->free_nsh_gre_tunnel_hw_if_indices
+ [vec_len (ngm->free_nsh_gre_tunnel_hw_if_indices)-1];
+ _vec_len (ngm->free_nsh_gre_tunnel_hw_if_indices) -= 1;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = t - ngm->tunnels;
+ hi->hw_instance = hi->dev_instance;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, nsh_gre_device_class.index, t - ngm->tunnels,
+ nsh_gre_hw_class.index, t - ngm->tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->output_node_index = nsh_gre_encap_node.index;
+ }
+
+ t->hw_if_index = hw_if_index;
+ t->sw_if_index = sw_if_index = hi->sw_if_index;
+
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ }
+ else
+ {
+ /* deleting a tunnel: tunnel must exist */
+ if (!p)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ t = pool_elt_at_index (ngm->tunnels, p[0]);
+
+ vnet_sw_interface_set_flags (vnm, t->sw_if_index, 0 /* down */);
+ vec_add1 (ngm->free_nsh_gre_tunnel_hw_if_indices, t->hw_if_index);
+
+ hash_unset (ngm->nsh_gre_tunnel_by_src_address, key);
+ vec_free (t->rewrite);
+ pool_put (ngm->tunnels, t);
+ }
+
+ if (sw_if_indexp)
+ *sw_if_indexp = sw_if_index;
+
+ return 0;
+}
+
+static u32 fib_index_from_fib_id (u32 fib_id)
+{
+ ip4_main_t * im = &ip4_main;
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (!p)
+ return ~0;
+
+ return p[0];
+}
+
+static uword unformat_decap_next (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 tmp;
+
+ if (unformat (input, "drop"))
+ *result = NSH_INPUT_NEXT_DROP;
+ else if (unformat (input, "ip4"))
+ *result = NSH_INPUT_NEXT_IP4_INPUT;
+ else if (unformat (input, "ip6"))
+ *result = NSH_INPUT_NEXT_IP6_INPUT;
+ else if (unformat (input, "ethernet"))
+ *result = NSH_INPUT_NEXT_IP6_INPUT;
+ else if (unformat (input, "%d", &tmp))
+ *result = tmp;
+ else
+ return 0;
+ return 1;
+}
+
+static clib_error_t *
+nsh_gre_add_del_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t src, dst;
+ u8 is_add = 1;
+ u8 src_set = 0;
+ u8 dst_set = 0;
+ u32 encap_fib_index = 0;
+ u32 decap_fib_index = 0;
+ u8 ver_o_c = 0;
+ u8 length = 0;
+ u8 md_type = 0;
+ u8 next_protocol = 1; /* ip4 */
+ u32 spi;
+ u8 spi_set = 0;
+ u32 si;
+ u8 si_set = 0;
+ u32 spi_si;
+ u32 c1 = 0;
+ u32 c2 = 0;
+ u32 c3 = 0;
+ u32 c4 = 0;
+ u32 decap_next_index = 1; /* ip4_input */
+ u32 *tlvs = 0;
+ u32 tmp;
+ int rv;
+ vnet_nsh_gre_add_del_tunnel_args_t _a, * a = &_a;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "src %U",
+ unformat_ip4_address, &src))
+ src_set = 1;
+ else if (unformat (line_input, "dst %U",
+ unformat_ip4_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "encap-vrf-id %d", &tmp))
+ {
+ encap_fib_index = fib_index_from_fib_id (tmp);
+ if (encap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent encap fib id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-vrf-id %d", &tmp))
+ {
+ decap_fib_index = fib_index_from_fib_id (tmp);
+ if (decap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent decap fib id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-next %U", unformat_decap_next,
+ &decap_next_index))
+ ;
+ else if (unformat (line_input, "version %d", &tmp))
+ ver_o_c |= (tmp & 3) << 6;
+ else if (unformat (line_input, "o-bit %d", &tmp))
+ ver_o_c |= (tmp & 1) << 5;
+ else if (unformat (line_input, "c-bit %d", &tmp))
+ ver_o_c |= (tmp & 1) << 4;
+ else if (unformat (line_input, "md-type %d", &tmp))
+ md_type = tmp;
+ else if (unformat(line_input, "next-ip4"))
+ next_protocol = 1;
+ else if (unformat(line_input, "next-ip6"))
+ next_protocol = 2;
+ else if (unformat(line_input, "next-ethernet"))
+ next_protocol = 3;
+ else if (unformat (line_input, "c1 %d", &c1))
+ ;
+ else if (unformat (line_input, "c2 %d", &c2))
+ ;
+ else if (unformat (line_input, "c3 %d", &c3))
+ ;
+ else if (unformat (line_input, "c4 %d", &c4))
+ ;
+ else if (unformat (line_input, "spi %d", &spi))
+ spi_set = 1;
+ else if (unformat (line_input, "si %d", &si))
+ si_set = 1;
+ else if (unformat (line_input, "tlv %x"))
+ vec_add1 (tlvs, tmp);
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (src_set == 0)
+ return clib_error_return (0, "tunnel src address not specified");
+
+ if (dst_set == 0)
+ return clib_error_return (0, "tunnel dst address not specified");
+
+ if (spi_set == 0)
+ return clib_error_return (0, "spi not specified");
+
+ if (si_set == 0)
+ return clib_error_return (0, "si not specified");
+
+ spi_si = (spi<<8) | si;
+
+ memset (a, 0, sizeof (*a));
+
+ a->is_add = is_add;
+
+#define _(x) a->x = x;
+ foreach_copy_field;
+#undef _
+
+ rv = vnet_nsh_gre_add_del_tunnel (a, 0 /* hw_if_indexp */);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_DECAP_NEXT:
+ return clib_error_return (0, "invalid decap-next...");
+
+ case VNET_API_ERROR_TUNNEL_EXIST:
+ return clib_error_return (0, "tunnel already exists...");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "session does not exist...");
+
+ default:
+ return clib_error_return
+ (0, "vnet_nsh_gre_add_del_tunnel returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_nsh_gre_tunnel_command, static) = {
+ .path = "nsh gre tunnel",
+ .short_help =
+ "nsh gre tunnel src <ip4-addr> dst <ip4-addr>"
+ " c1 <nn> c2 <nn> c3 <nn> c4 <nn> spi <nn> si <nn>\n"
+ " [encap-fib-id <nn>] [decap-fib-id <nn>] [o-bit <1|0>] [c-bit <1|0>]\n"
+ " [md-type <nn>][next-ip4][next-ip6][next-ethernet]\n"
+ " [tlv <xx>]\n",
+ .function = nsh_gre_add_del_tunnel_command_fn,
+};
+
+static clib_error_t *
+show_nsh_gre_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ nsh_gre_main_t * ngm = &nsh_gre_main;
+ nsh_gre_tunnel_t * t;
+
+ if (pool_elts (ngm->tunnels) == 0)
+ vlib_cli_output (vm, "No nsh-gre tunnels configured...");
+
+ pool_foreach (t, ngm->tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_nsh_gre_tunnel, t);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_nsh_gre_tunnel_command, static) = {
+ .path = "show nsh gre tunnel",
+ .function = show_nsh_gre_tunnel_command_fn,
+};
+
+clib_error_t *nsh_gre_init (vlib_main_t *vm)
+{
+ nsh_gre_main_t *ngm = &nsh_gre_main;
+
+ ngm->vnet_main = vnet_get_main();
+ ngm->vlib_main = vm;
+
+ ngm->nsh_gre_tunnel_by_src_address = hash_create (0, sizeof (uword));
+ gre_register_input_protocol (vm, GRE_PROTOCOL_nsh,
+ nsh_gre_input_node.index);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(nsh_gre_init);
+
diff --git a/vnet/vnet/nsh-gre/nsh_gre.h b/vnet/vnet/nsh-gre/nsh_gre.h
new file mode 100644
index 00000000000..c82c80f6a68
--- /dev/null
+++ b/vnet/vnet/nsh-gre/nsh_gre.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_nsh_gre_h
+#define included_vnet_nsh_gre_h
+
+#include <vnet/vnet.h>
+#include <vnet/gre/gre.h>
+#include <vnet/nsh-gre/nsh_gre_packet.h>
+#include <vnet/ip/ip4_packet.h>
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4; /* 20 bytes */
+ gre_header_t gre; /* 4 bytes */
+ nsh_header_t nsh; /* 28 bytes */
+}) ip4_gre_and_nsh_header_t;
+
+typedef struct {
+ /* Rewrite string. $$$$ embed vnet_rewrite header */
+ u8 * rewrite;
+
+ /* tunnel src and dst addresses */
+ ip4_address_t src;
+ ip4_address_t dst;
+
+ /* FIB indices */
+ u32 encap_fib_index; /* tunnel partner lookup here */
+ u32 decap_fib_index; /* inner IP lookup here */
+
+ /* when decapsulating, send pkts here */
+ u32 decap_next_index;
+
+ /* vnet intfc hw/sw_if_index */
+ u32 hw_if_index;
+ u32 sw_if_index;
+
+ /* NSH header fields in HOST byte order */
+ u8 ver_o_c;
+ u8 length;
+ u8 md_type;
+ u8 next_protocol;
+ u32 spi_si;
+
+ /* Context headers, always present, in HOST byte order */
+ u32 c1, c2, c3, c4;
+ u32 * tlvs;
+} nsh_gre_tunnel_t;
+
+#define foreach_nsh_gre_input_next \
+ _ (DROP, "error-drop") \
+ _ (IP4_INPUT, "ip4-input") \
+ _ (IP6_INPUT, "ip6-input") \
+ _ (ETHERNET_INPUT, "ethernet-input")
+
+typedef enum {
+#define _(s,n) NSH_INPUT_NEXT_##s,
+ foreach_nsh_gre_input_next
+#undef _
+ NSH_INPUT_N_NEXT,
+} nsh_gre_input_next_t;
+
+typedef enum {
+#define nsh_gre_error(n,s) NSH_GRE_ERROR_##n,
+#include <vnet/nsh-gre/nsh_gre_error.def>
+#undef nsh_gre_error
+ NSH_GRE_N_ERROR,
+} nsh_gre_input_error_t;
+
+typedef struct {
+ /* vector of encap tunnel instances */
+ nsh_gre_tunnel_t *tunnels;
+
+ /* lookup tunnel by tunnel partner src address */
+ uword * nsh_gre_tunnel_by_src_address;
+
+ /* Free vlib hw_if_indices */
+ u32 * free_nsh_gre_tunnel_hw_if_indices;
+
+ /* show device instance by real device instance */
+ u32 * dev_inst_by_real;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} nsh_gre_main_t;
+
+nsh_gre_main_t nsh_gre_main;
+
+vlib_node_registration_t nsh_gre_input_node;
+vlib_node_registration_t nsh_gre_encap_node;
+
+u8 * format_nsh_gre_encap_trace (u8 * s, va_list * args);
+
+typedef struct {
+ u8 is_add;
+ ip4_address_t src, dst;
+ u32 encap_fib_index;
+ u32 decap_fib_index;
+ u32 decap_next_index;
+ u8 ver_o_c;
+ u8 length;
+ u8 md_type;
+ u8 next_protocol;
+ u32 spi_si;
+ u32 c1, c2, c3, c4;
+ u32 * tlvs;
+} vnet_nsh_gre_add_del_tunnel_args_t;
+
+int vnet_nsh_gre_add_del_tunnel (vnet_nsh_gre_add_del_tunnel_args_t *a,
+ u32 * sw_if_indexp);
+
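+/*
+ * Illustrative usage sketch (not part of the original patch): filling
+ * the argument block the same way the CLI handler in nsh_gre.c does.
+ * All values below are assumptions.
+ */
+static inline int
+example_create_nsh_gre_tunnel (u32 * sw_if_indexp)
+{
+ vnet_nsh_gre_add_del_tunnel_args_t a;
+
+ memset (&a, 0, sizeof (a));
+ a.is_add = 1;
+ a.src.as_u32 = clib_host_to_net_u32 (0x0a000001); /* 10.0.0.1 */
+ a.dst.as_u32 = clib_host_to_net_u32 (0x0a000002); /* 10.0.0.2 */
+ a.next_protocol = 1; /* ip4 */
+ a.spi_si = (100 << 8) | 4; /* service path 100, service index 4 */
+ a.decap_next_index = NSH_INPUT_NEXT_IP4_INPUT;
+
+ return vnet_nsh_gre_add_del_tunnel (&a, sw_if_indexp);
+}
+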
+#endif /* included_vnet_nsh_gre_h */
diff --git a/vnet/vnet/nsh-gre/nsh_gre_error.def b/vnet/vnet/nsh-gre/nsh_gre_error.def
new file mode 100644
index 00000000000..532b02a6e89
--- /dev/null
+++ b/vnet/vnet/nsh-gre/nsh_gre_error.def
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+nsh_gre_error (DECAPSULATED, "good packets decapsulated")
+nsh_gre_error (NO_SUCH_TUNNEL, "no such tunnel packets")
+nsh_gre_error (INVALID_NEXT_PROTOCOL, "invalid next protocol")
diff --git a/vnet/vnet/nsh-gre/nsh_gre_packet.h b/vnet/vnet/nsh-gre/nsh_gre_packet.h
new file mode 100644
index 00000000000..0620f227b9f
--- /dev/null
+++ b/vnet/vnet/nsh-gre/nsh_gre_packet.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_nsh_gre_packet_h
+#define included_vnet_nsh_gre_packet_h
+
+/*
+ * NSH_GRE packet format from draft-quinn-sfc-nsh-03.txt
+ *
+ * NSH Base Header
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |Ver|O|C|R|R|R|R|R|R| Length | MD Type | Next Protocol |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *
+ * Base Header Field Descriptions:
+ *
+ * Version: The version field is used to ensure backward compatibility
+ * going forward with future NSH updates.
+ *
+ * O bit: Indicates that this packet is an operations and management
+ * (OAM) packet. SFF and SFs nodes MUST examine the payload and take
+ * appropriate action (e.g. return status information).
+ *
+ * OAM message specifics and handling details are outside the scope of
+ * this document.
+ *
+ * C bit: Indicates that a critical metadata TLV is present (see section
+ * 7). This bit acts as an indication for hardware implementers to
+ * decide how to handle the presence of a critical TLV without
+ * necessarily needing to parse all TLVs present. The C bit MUST be set
+ * to 1 if one or more critical TLVs are present.
+ *
+ * All other flag fields are reserved.
+ *
+ * Length: total length, in 4 byte words, of the NSH header, including
+ * optional variable TLVs. Length must be equal to or greater than 6.
+ *
+ * MD Type: indicates the format of NSH beyond the base header and the
+ * type of metadata being carried. This typing is used to describe the
+ * use for the metadata. A new registry will be requested from IANA for
+ * the MD Type. NSH defines one type, type = 0x1 which indicates that
+ * the format of the header is as per this draft.
+ *
+ * The format of the base header is invariant, and not described by MD
+ * Type.
+ *
+ * Next Protocol: indicates the protocol type of the original packet. A
+ * new IANA registry will be created for protocol type.
+ *
+ * This draft defines the following Next Protocol values:
+ *
+ * 0x1 : IPv4
+ * 0x2 : IPv6
+ * 0x3 : Ethernet
+ */
+
+typedef CLIB_PACKED(struct {
+ u8 ver_o_c;
+ u8 length;
+ u8 md_type;
+ u8 next_protocol;
+ u32 spi_si;
+ /* Context headers, always present */
+ u32 c1; u32 c2; u32 c3; u32 c4;
+
+ /* Optional variable length metadata */
+ u32 tlvs[0];
+}) nsh_header_t;
+
+#define NSH_GRE_VERSION (0<<6)
+#define NSH_GRE_O_BIT (1<<5)
+#define NSH_GRE_C_BIT (1<<4)
+
+/* Network byte order shift / mask */
+#define NSH_GRE_SINDEX_MASK 0xFF
+#define NSH_GRE_SPI_MASK (0x00FFFFFF)
+#define NSH_GRE_SPI_SHIFT 8
+
+#endif /* included_vnet_nsh_gre_packet_h */
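
A short sketch of how the shift/mask constants above are intended to be used, given an nsh_header_t *nsh with the spi_si word converted to host byte order (this mirrors what the show/format code elsewhere in the patch does):

    u32 spi_si = clib_net_to_host_u32 (nsh->spi_si);
    u32 spi = (spi_si >> NSH_GRE_SPI_SHIFT) & NSH_GRE_SPI_MASK;
    u32 si = spi_si & NSH_GRE_SINDEX_MASK;
    u8 version = nsh->ver_o_c >> 6;
    int is_oam = (nsh->ver_o_c & NSH_GRE_O_BIT) != 0;
    int critical_tlv = (nsh->ver_o_c & NSH_GRE_C_BIT) != 0;
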
diff --git a/vnet/vnet/nsh-vxlan-gpe/decap.c b/vnet/vnet/nsh-vxlan-gpe/decap.c
new file mode 100644
index 00000000000..62bb0f81dc7
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/decap.c
@@ -0,0 +1,365 @@
+/*
+ * decap.c: NSH / VXLAN-GPE tunnel decap packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h>
+
+vlib_node_registration_t nsh_vxlan_gpe_input_node;
+
+/* From nsh-gre */
+u8 * format_nsh_header_with_length (u8 * s, va_list * args);
+
+typedef struct {
+ u32 next_index;
+ u32 tunnel_index;
+ u32 error;
+ nsh_header_t h;
+} nsh_vxlan_gpe_rx_trace_t;
+
+static u8 * format_nsh_vxlan_gpe_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ nsh_vxlan_gpe_rx_trace_t * t = va_arg (*args, nsh_vxlan_gpe_rx_trace_t *);
+
+ if (t->tunnel_index != ~0)
+ {
+ s = format (s, "NSH-VXLAN: tunnel %d next %d error %d", t->tunnel_index,
+ t->next_index, t->error);
+ }
+ else
+ {
+ s = format (s, "NSH-VXLAN: no tunnel next %d error %d\n", t->next_index,
+ t->error);
+ }
+ s = format (s, "\n %U", format_nsh_header_with_length, &t->h,
+ (u32) sizeof (t->h) /* max size */);
+ return s;
+}
+
+static uword
+nsh_vxlan_gpe_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+ u32 last_tunnel_index = ~0;
+ nsh_vxlan_gpe_tunnel_key_t last_key;
+ u32 pkts_decapsulated = 0;
+
+ memset (&last_key, 0xff, sizeof (last_key));
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ ip4_vxlan_gpe_and_nsh_header_t * iuvn0, * iuvn1;
+ uword * p0, * p1;
+ u32 tunnel_index0, tunnel_index1;
+ nsh_vxlan_gpe_tunnel_t * t0, * t1;
+ nsh_vxlan_gpe_tunnel_key_t key0, key1;
+ u32 error0, error1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* udp leaves current_data pointing at the vxlan header */
+ vlib_buffer_advance
+ (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+ vlib_buffer_advance
+ (b1, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+
+ iuvn0 = vlib_buffer_get_current (b0);
+ iuvn1 = vlib_buffer_get_current (b1);
+
+ /* pop (ip, udp, vxlan, nsh) */
+ vlib_buffer_advance (b0, sizeof (*iuvn0));
+ vlib_buffer_advance (b1, sizeof (*iuvn1));
+
+ tunnel_index0 = ~0;
+ error0 = 0;
+ next0 = NSH_VXLAN_GPE_INPUT_NEXT_DROP;
+
+ tunnel_index1 = ~0;
+ error1 = 0;
+ next1 = NSH_VXLAN_GPE_INPUT_NEXT_DROP;
+
+ key0.src = iuvn0->ip4.src_address.as_u32;
+ key0.vni = iuvn0->vxlan.vni_res;
+ key0.spi_si = iuvn0->nsh.spi_si;
+ key0.pad = 0;
+
+ if (PREDICT_FALSE ((key0.as_u64[0] != last_key.as_u64[0])
+ || (key0.as_u64[1] != last_key.as_u64[1])))
+ {
+ p0 = hash_get_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key0);
+
+ if (p0 == 0)
+ {
+ error0 = NSH_VXLAN_GPE_ERROR_NO_SUCH_TUNNEL;
+ goto trace0;
+ }
+
+ last_key.as_u64[0] = key0.as_u64[0];
+ last_key.as_u64[1] = key0.as_u64[1];
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (ngm->tunnels, tunnel_index0);
+
+ next0 = t0->decap_next_index;
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ /*
+ * ip[46] lookup in the configured FIB
+ * nsh-vxlan-gpe-encap, here's the encap tunnel sw_if_index
+ */
+          vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->decap_fib_index;
+          pkts_decapsulated ++;
+
+ trace0:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_vxlan_gpe_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->h = iuvn0->nsh;
+ }
+
+ key1.src = iuvn1->ip4.src_address.as_u32;
+ key1.vni = iuvn1->vxlan.vni_res;
+ key1.spi_si = iuvn1->nsh.spi_si;
+ key1.pad = 0;
+
+ if (PREDICT_FALSE ((key1.as_u64[0] != last_key.as_u64[0])
+ || (key1.as_u64[1] != last_key.as_u64[1])))
+ {
+ p1 = hash_get_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key1);
+
+ if (p1 == 0)
+ {
+ error1 = NSH_VXLAN_GPE_ERROR_NO_SUCH_TUNNEL;
+ goto trace1;
+ }
+
+ last_key.as_u64[0] = key1.as_u64[0];
+ last_key.as_u64[1] = key1.as_u64[1];
+ tunnel_index1 = last_tunnel_index = p1[0];
+ }
+ else
+ tunnel_index1 = last_tunnel_index;
+
+ t1 = pool_elt_at_index (ngm->tunnels, tunnel_index1);
+
+ next1 = t1->decap_next_index;
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b1);
+
+ /*
+ * ip[46] lookup in the configured FIB
+ * nsh-vxlan-gpe-encap, here's the encap tunnel sw_if_index
+ */
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = t1->decap_fib_index;
+          pkts_decapsulated ++;
+
+ trace1:
+ b1->error = error1 ? node->errors[error1] : 0;
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_vxlan_gpe_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b1, sizeof (*tr));
+ tr->next_index = next1;
+ tr->error = error1;
+ tr->tunnel_index = tunnel_index1;
+ tr->h = iuvn1->nsh;
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ ip4_vxlan_gpe_and_nsh_header_t * iuvn0;
+ uword * p0;
+ u32 tunnel_index0;
+ nsh_vxlan_gpe_tunnel_t * t0;
+ nsh_vxlan_gpe_tunnel_key_t key0;
+ u32 error0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* udp leaves current_data pointing at the vxlan header */
+ vlib_buffer_advance
+ (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+
+ iuvn0 = vlib_buffer_get_current (b0);
+
+ /* pop (ip, udp, vxlan, nsh) */
+ vlib_buffer_advance (b0, sizeof (*iuvn0));
+
+ tunnel_index0 = ~0;
+ error0 = 0;
+ next0 = NSH_VXLAN_GPE_INPUT_NEXT_DROP;
+
+ key0.src = iuvn0->ip4.src_address.as_u32;
+ key0.vni = iuvn0->vxlan.vni_res;
+ key0.spi_si = iuvn0->nsh.spi_si;
+ key0.pad = 0;
+
+ if (PREDICT_FALSE ((key0.as_u64[0] != last_key.as_u64[0])
+ || (key0.as_u64[1] != last_key.as_u64[1])))
+ {
+ p0 = hash_get_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key0);
+
+ if (p0 == 0)
+ {
+ error0 = NSH_VXLAN_GPE_ERROR_NO_SUCH_TUNNEL;
+ goto trace00;
+ }
+
+ last_key.as_u64[0] = key0.as_u64[0];
+ last_key.as_u64[1] = key0.as_u64[1];
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (ngm->tunnels, tunnel_index0);
+
+ next0 = t0->decap_next_index;
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ /*
+ * ip[46] lookup in the configured FIB
+ * nsh-vxlan-gpe-encap, here's the encap tunnel sw_if_index
+ */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->decap_fib_index;
+ pkts_decapsulated ++;
+
+ trace00:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_vxlan_gpe_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->h = iuvn0->nsh;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, nsh_vxlan_gpe_input_node.index,
+ NSH_VXLAN_GPE_ERROR_DECAPSULATED,
+ pkts_decapsulated);
+ return from_frame->n_vectors;
+}
+
+static char * nsh_vxlan_gpe_error_strings[] = {
+#define nsh_vxlan_gpe_error(n,s) s,
+#include <vnet/nsh-vxlan-gpe/nsh_vxlan_gpe_error.def>
+#undef nsh_vxlan_gpe_error
+};
+
+VLIB_REGISTER_NODE (nsh_vxlan_gpe_input_node) = {
+ .function = nsh_vxlan_gpe_input,
+ .name = "nsh-vxlan-gpe-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = NSH_VXLAN_GPE_N_ERROR,
+ .error_strings = nsh_vxlan_gpe_error_strings,
+
+ .n_next_nodes = NSH_VXLAN_GPE_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [NSH_VXLAN_GPE_INPUT_NEXT_##s] = n,
+ foreach_nsh_vxlan_gpe_input_next
+#undef _
+ },
+
+ .format_buffer = format_nsh_header_with_length,
+ .format_trace = format_nsh_vxlan_gpe_rx_trace,
+ // $$$$ .unformat_buffer = unformat_nsh_vxlan_gpe_header,
+};
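
The lookup strategy in the node above reduces to a one-entry key cache: the 16-byte tunnel key is compared as two u64 words, and the hash table is only probed on a miss. Distilled into a sketch (names taken from the node; not a drop-in replacement):

    if (key.as_u64[0] != last_key.as_u64[0]
        || key.as_u64[1] != last_key.as_u64[1])
      {
        uword *p = hash_get_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key);
        if (p == 0)
          ; /* no such tunnel: count the error, drop the packet */
        else
          {
            last_key = key;
            tunnel_index = last_tunnel_index = p[0];
          }
      }
    else
      tunnel_index = last_tunnel_index;   /* hit: hash probe skipped */
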
diff --git a/vnet/vnet/nsh-vxlan-gpe/encap.c b/vnet/vnet/nsh-vxlan-gpe/encap.c
new file mode 100644
index 00000000000..0ccdf60c6aa
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/encap.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h>
+
+/* Statistics (not really errors) */
+#define foreach_nsh_vxlan_gpe_encap_error \
+_(ENCAPSULATED, "good packets encapsulated")
+
+static char * nsh_vxlan_gpe_encap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_nsh_vxlan_gpe_encap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) NSH_VXLAN_GPE_ENCAP_ERROR_##sym,
+ foreach_nsh_vxlan_gpe_encap_error
+#undef _
+ NSH_VXLAN_GPE_ENCAP_N_ERROR,
+} nsh_vxlan_gpe_encap_error_t;
+
+typedef enum {
+ NSH_VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP,
+ NSH_VXLAN_GPE_ENCAP_NEXT_DROP,
+ NSH_VXLAN_GPE_ENCAP_N_NEXT,
+} nsh_vxlan_gpe_encap_next_t;
+
+typedef struct {
+ u32 tunnel_index;
+} nsh_vxlan_gpe_encap_trace_t;
+
+u8 * format_nsh_vxlan_gpe_encap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ nsh_vxlan_gpe_encap_trace_t * t
+ = va_arg (*args, nsh_vxlan_gpe_encap_trace_t *);
+
+ s = format (s, "NSH-VXLAN-ENCAP: tunnel %d", t->tunnel_index);
+ return s;
+}
+
+#define foreach_fixed_header_offset \
+_(0) _(1) _(2) _(3) _(4) _(5) _(6)
+
+static uword
+nsh_vxlan_gpe_encap (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+ vnet_main_t * vnm = ngm->vnet_main;
+ u32 pkts_encapsulated = 0;
+ u16 old_l0 = 0, old_l1 = 0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0 = NSH_VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP;
+ u32 next1 = NSH_VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP;
+ vnet_hw_interface_t * hi0, * hi1;
+ ip4_header_t * ip0, * ip1;
+ udp_header_t * udp0, * udp1;
+ u64 * copy_src0, * copy_dst0;
+ u64 * copy_src1, * copy_dst1;
+ u32 * copy_src_last0, * copy_dst_last0;
+ u32 * copy_src_last1, * copy_dst_last1;
+ nsh_vxlan_gpe_tunnel_t * t0, * t1;
+ u16 new_l0, new_l1;
+ ip_csum_t sum0, sum1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* 1-wide cache? */
+ hi0 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b0)->sw_if_index[VLIB_TX]);
+ hi1 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b1)->sw_if_index[VLIB_TX]);
+
+ t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
+ t1 = pool_elt_at_index (ngm->tunnels, hi1->dev_instance);
+
+ ASSERT(vec_len(t0->rewrite) >= 24);
+ ASSERT(vec_len(t1->rewrite) >= 24);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+ vlib_buffer_advance (b1, -(word)_vec_len(t1->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ ip1 = vlib_buffer_get_current(b1);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+ copy_dst1 = (u64 *) ip1;
+ copy_src1 = (u64 *) t1->rewrite;
+
+ ASSERT (sizeof (ip4_vxlan_gpe_and_nsh_header_t) == 60);
+
+ /* Copy first 56 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+ foreach_fixed_header_offset;
+#undef _
+#define _(offs) copy_dst1[offs] = copy_src1[offs];
+ foreach_fixed_header_offset;
+#undef _
+
+ /* Last 4 octets. Hopefully gcc will be our friend */
+ copy_dst_last0 = (u32 *)(&copy_dst0[7]);
+ copy_src_last0 = (u32 *)(&copy_src0[7]);
+ copy_dst_last1 = (u32 *)(&copy_dst1[7]);
+ copy_src_last1 = (u32 *)(&copy_src1[7]);
+
+ copy_dst_last0[0] = copy_src_last0[0];
+ copy_dst_last1[0] = copy_src_last1[0];
+
+          /* If there are TLVs to copy, do so; they follow the fixed
+             60-octet header in the rewrite */
+          if (PREDICT_FALSE (_vec_len(t0->rewrite) > 60))
+              memcpy ((u8 *) copy_dst0 + 60, t0->rewrite + 60,
+                      _vec_len (t0->rewrite) - 60);
+
+          if (PREDICT_FALSE (_vec_len(t1->rewrite) > 60))
+              memcpy ((u8 *) copy_dst1 + 60, t1->rewrite + 60,
+                      _vec_len (t1->rewrite) - 60);
+
+ /* fix the <bleep>ing outer-IP checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ sum1 = ip1->checksum;
+ /* old_l1 always 0, see the rewrite setup */
+ new_l1 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
+
+ sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t,
+ length /* changed member */);
+ ip1->checksum = ip_csum_fold (sum1);
+ ip1->length = new_l1;
+
+ /* Fix UDP length */
+ udp0 = (udp_header_t *)(ip0+1);
+ new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+ - sizeof (*ip0));
+ udp1 = (udp_header_t *)(ip1+1);
+ new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)
+ - sizeof (*ip1));
+
+ udp0->length = new_l0;
+ udp1->length = new_l1;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = t1->encap_fib_index;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_vxlan_gpe_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - ngm->tunnels;
+ }
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_vxlan_gpe_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b1, sizeof (*tr));
+ tr->tunnel_index = t1 - ngm->tunnels;
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0 = NSH_VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP;
+ vnet_hw_interface_t * hi0;
+ ip4_header_t * ip0;
+ udp_header_t * udp0;
+ u64 * copy_src0, * copy_dst0;
+ u32 * copy_src_last0, * copy_dst_last0;
+ nsh_vxlan_gpe_tunnel_t * t0;
+ u16 new_l0;
+ ip_csum_t sum0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* 1-wide cache? */
+ hi0 = vnet_get_sup_hw_interface
+ (vnm, vnet_buffer(b0)->sw_if_index[VLIB_TX]);
+
+ t0 = pool_elt_at_index (ngm->tunnels, hi0->dev_instance);
+
+ ASSERT(vec_len(t0->rewrite) >= 24);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+
+ ASSERT (sizeof (ip4_vxlan_gpe_and_nsh_header_t) == 60);
+
+ /* Copy first 56 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+ foreach_fixed_header_offset;
+#undef _
+ /* Last 4 octets. Hopefully gcc will be our friend */
+ copy_dst_last0 = (u32 *)(&copy_dst0[7]);
+ copy_src_last0 = (u32 *)(&copy_src0[7]);
+
+ copy_dst_last0[0] = copy_src_last0[0];
+
+          /* If there are TLVs to copy, do so; they follow the fixed
+             60-octet header in the rewrite */
+          if (PREDICT_FALSE (_vec_len(t0->rewrite) > 60))
+              memcpy ((u8 *) copy_dst0 + 60, t0->rewrite + 60,
+                      _vec_len (t0->rewrite) - 60);
+
+ /* fix the <bleep>ing outer-IP checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ /* Fix UDP length */
+ udp0 = (udp_header_t *)(ip0+1);
+ new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+ - sizeof (*ip0));
+
+ udp0->length = new_l0;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ pkts_encapsulated ++;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ nsh_vxlan_gpe_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - ngm->tunnels;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, node->node_index,
+ NSH_VXLAN_GPE_ENCAP_ERROR_ENCAPSULATED,
+ pkts_encapsulated);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (nsh_vxlan_gpe_encap_node) = {
+ .function = nsh_vxlan_gpe_encap,
+ .name = "nsh-vxlan-gpe-encap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_nsh_vxlan_gpe_encap_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(nsh_vxlan_gpe_encap_error_strings),
+ .error_strings = nsh_vxlan_gpe_encap_error_strings,
+
+ .n_next_nodes = NSH_VXLAN_GPE_ENCAP_N_NEXT,
+
+ .next_nodes = {
+ [NSH_VXLAN_GPE_ENCAP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [NSH_VXLAN_GPE_ENCAP_NEXT_DROP] = "error-drop",
+ },
+};
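
The unrolled 8-byte copies above are an optimization; functionally, applying the rewrite reduces to the following sketch (ignoring the checksum and length fix-ups the node performs with ip_csum_update):

    /* prepend the precomputed ip4 + udp + vxlan + nsh template */
    vlib_buffer_advance (b, -(word) _vec_len (t->rewrite));
    memcpy (vlib_buffer_get_current (b), t->rewrite, _vec_len (t->rewrite));
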
diff --git a/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.c b/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.c
new file mode 100644
index 00000000000..8cc46d3d3eb
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h>
+
+nsh_vxlan_gpe_main_t nsh_vxlan_gpe_main;
+
+static u8 * format_decap_next (u8 * s, va_list * args)
+{
+ u32 next_index = va_arg (*args, u32);
+
+ switch (next_index)
+ {
+ case NSH_VXLAN_GPE_INPUT_NEXT_DROP:
+ return format (s, "drop");
+ case NSH_VXLAN_GPE_INPUT_NEXT_IP4_INPUT:
+ return format (s, "ip4");
+ case NSH_VXLAN_GPE_INPUT_NEXT_IP6_INPUT:
+ return format (s, "ip6");
+ case NSH_VXLAN_GPE_INPUT_NEXT_NSH_VXLAN_GPE_ENCAP:
+ return format (s, "nsh-vxlan-gpe");
+ default:
+ return format (s, "unknown %d", next_index);
+ }
+ return s;
+}
+
+u8 * format_nsh_vxlan_gpe_tunnel (u8 * s, va_list * args)
+{
+ nsh_vxlan_gpe_tunnel_t * t = va_arg (*args, nsh_vxlan_gpe_tunnel_t *);
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+
+ s = format (s,
+ "[%d] %U (src) %U (dst) fibs: encap %d, decap %d",
+ t - ngm->tunnels,
+ format_ip4_address, &t->src,
+ format_ip4_address, &t->dst,
+ t->encap_fib_index,
+ t->decap_fib_index);
+ s = format (s, " decap next %U\n", format_decap_next, t->decap_next_index);
+ s = format (s, " vxlan VNI %d ", t->vni);
+ s = format (s, "nsh ver %d ", (t->ver_o_c>>6));
+ if (t->ver_o_c & NSH_GRE_O_BIT)
+ s = format (s, "O-set ");
+
+ if (t->ver_o_c & NSH_GRE_C_BIT)
+ s = format (s, "C-set ");
+
+ s = format (s, "len %d (%d bytes) md_type %d next_protocol %d\n",
+ t->length, t->length * 4, t->md_type, t->next_protocol);
+
+ s = format (s, " service path %d service index %d\n",
+ (t->spi_si>>NSH_GRE_SPI_SHIFT) & NSH_GRE_SPI_MASK,
+ t->spi_si & NSH_GRE_SINDEX_MASK);
+
+ s = format (s, " c1 %d c2 %d c3 %d c4 %d\n",
+ t->c1, t->c2, t->c3, t->c4);
+
+ return s;
+}
+
+static u8 * format_nsh_vxlan_gpe_name (u8 * s, va_list * args)
+{
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+
+ if (i < vec_len (ngm->dev_inst_by_real))
+ show_dev_instance = ngm->dev_inst_by_real[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ return format (s, "nsh_vxlan_gpe_tunnel%d", i);
+}
+
+static int nsh_vxlan_gpe_name_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+
+ vec_validate_init_empty (ngm->dev_inst_by_real, hi->dev_instance, ~0);
+
+ ngm->dev_inst_by_real [hi->dev_instance] = new_dev_instance;
+
+ return 0;
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (nsh_vxlan_gpe_device_class,static) = {
+ .name = "NSH_VXLAN_GPE",
+ .format_device_name = format_nsh_vxlan_gpe_name,
+ .format_tx_trace = format_nsh_vxlan_gpe_encap_trace,
+ .tx_function = dummy_interface_tx,
+ .name_renumber = nsh_vxlan_gpe_name_renumber,
+};
+
+static uword dummy_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ return 0;
+}
+
+static u8 * format_nsh_vxlan_gpe_header_with_length (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ s = format (s, "unimplemented dev %u", dev_instance);
+ return s;
+}
+
+VNET_HW_INTERFACE_CLASS (nsh_vxlan_gpe_hw_class) = {
+ .name = "NSH_VXLAN_GPE",
+ .format_header = format_nsh_vxlan_gpe_header_with_length,
+ .set_rewrite = dummy_set_rewrite,
+};
+
+#define foreach_copy_field \
+_(src.as_u32) \
+_(dst.as_u32) \
+_(vni) \
+_(encap_fib_index) \
+_(decap_fib_index) \
+_(decap_next_index) \
+_(ver_o_c) \
+_(length) \
+_(md_type) \
+_(next_protocol) \
+_(spi_si) \
+_(c1) \
+_(c2) \
+_(c3) \
+_(c4) \
+_(tlvs)
+
+#define foreach_32bit_field \
+_(spi_si) \
+_(c1) \
+_(c2) \
+_(c3) \
+_(c4)
+
+static int nsh_vxlan_gpe_rewrite (nsh_vxlan_gpe_tunnel_t * t)
+{
+ u8 *rw = 0;
+ ip4_header_t * ip0;
+ nsh_header_t * nsh0;
+ ip4_vxlan_gpe_and_nsh_header_t * h0;
+ int len;
+
+ len = sizeof (*h0) + vec_len(t->tlvs)*4;
+
+ vec_validate_aligned (rw, len-1, CLIB_CACHE_LINE_BYTES);
+
+ h0 = (ip4_vxlan_gpe_and_nsh_header_t *) rw;
+
+ /* Fixed portion of the (outer) ip4 header */
+ ip0 = &h0->ip4;
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_UDP;
+
+ /* we fix up the ip4 header length and checksum after-the-fact */
+ ip0->src_address.as_u32 = t->src.as_u32;
+ ip0->dst_address.as_u32 = t->dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+
+ /* UDP header, randomize src port on something, maybe? */
+ h0->udp.src_port = clib_host_to_net_u16 (4790);
+ h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_vxlan_gpe);
+
+ /* VXLAN header. Are we having fun yet? */
+ h0->vxlan.flags = VXLAN_GPE_FLAGS_I | VXLAN_GPE_FLAGS_P;
+ h0->vxlan.ver_res = VXLAN_GPE_VERSION;
+ h0->vxlan.next_protocol = VXLAN_NEXT_PROTOCOL_NSH;
+ h0->vxlan.vni_res = clib_host_to_net_u32 (t->vni<<8);
+
+ /* NSH header */
+ nsh0 = &h0->nsh;
+ nsh0->ver_o_c = t->ver_o_c;
+ nsh0->md_type = t->md_type;
+ nsh0->next_protocol = t->next_protocol;
+ nsh0->spi_si = t->spi_si;
+ nsh0->c1 = t->c1;
+ nsh0->c2 = t->c2;
+ nsh0->c3 = t->c3;
+ nsh0->c4 = t->c4;
+
+ /* Endian swap 32-bit fields */
+#define _(x) nsh0->x = clib_host_to_net_u32(nsh0->x);
+ foreach_32bit_field;
+#undef _
+
+ /* fix nsh header length */
+ t->length = 6 + vec_len(t->tlvs);
+ nsh0->length = t->length;
+
+ /* Copy any TLVs */
+ if (vec_len(t->tlvs))
+ memcpy (nsh0->tlvs, t->tlvs, 4*vec_len(t->tlvs));
+
+ t->rewrite = rw;
+ return (0);
+}
+
+int vnet_nsh_vxlan_gpe_add_del_tunnel
+(vnet_nsh_vxlan_gpe_add_del_tunnel_args_t *a, u32 * sw_if_indexp)
+{
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+ nsh_vxlan_gpe_tunnel_t *t = 0;
+ vnet_main_t * vnm = ngm->vnet_main;
+ vnet_hw_interface_t * hi;
+ uword * p;
+ u32 hw_if_index = ~0;
+ u32 sw_if_index = ~0;
+ int rv;
+ nsh_vxlan_gpe_tunnel_key_t key, *key_copy;
+ hash_pair_t *hp;
+
+ key.src = a->dst.as_u32; /* decap src in key is encap dst in config */
+ key.vni = clib_host_to_net_u32 (a->vni << 8);
+ key.spi_si = clib_host_to_net_u32(a->spi_si);
+
+ p = hash_get_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key);
+
+ if (a->is_add)
+ {
+ /* adding a tunnel: tunnel must not already exist */
+      if (p)
+        return VNET_API_ERROR_TUNNEL_EXIST;
+
+ if (a->decap_next_index >= NSH_VXLAN_GPE_INPUT_N_NEXT)
+ return VNET_API_ERROR_INVALID_DECAP_NEXT;
+
+ pool_get_aligned (ngm->tunnels, t, CLIB_CACHE_LINE_BYTES);
+ memset (t, 0, sizeof (*t));
+
+ /* copy from arg structure */
+#define _(x) t->x = a->x;
+ foreach_copy_field;
+#undef _
+
+ rv = nsh_vxlan_gpe_rewrite (t);
+
+ if (rv)
+ {
+ pool_put (ngm->tunnels, t);
+ return rv;
+ }
+
+ key_copy = clib_mem_alloc (sizeof (*key_copy));
+ memcpy (key_copy, &key, sizeof (*key_copy));
+
+ hash_set_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, key_copy,
+ t - ngm->tunnels);
+
+ if (vec_len (ngm->free_nsh_vxlan_gpe_tunnel_hw_if_indices) > 0)
+ {
+ hw_if_index = ngm->free_nsh_vxlan_gpe_tunnel_hw_if_indices
+ [vec_len (ngm->free_nsh_vxlan_gpe_tunnel_hw_if_indices)-1];
+ _vec_len (ngm->free_nsh_vxlan_gpe_tunnel_hw_if_indices) -= 1;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = t - ngm->tunnels;
+ hi->hw_instance = hi->dev_instance;
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, nsh_vxlan_gpe_device_class.index, t - ngm->tunnels,
+ nsh_vxlan_gpe_hw_class.index, t - ngm->tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->output_node_index = nsh_vxlan_gpe_encap_node.index;
+ }
+
+ t->hw_if_index = hw_if_index;
+ t->sw_if_index = sw_if_index = hi->sw_if_index;
+
+ vnet_sw_interface_set_flags (vnm, hi->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ }
+ else
+ {
+ /* deleting a tunnel: tunnel must exist */
+ if (!p)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ t = pool_elt_at_index (ngm->tunnels, p[0]);
+
+ vnet_sw_interface_set_flags (vnm, t->sw_if_index, 0 /* down */);
+ vec_add1 (ngm->free_nsh_vxlan_gpe_tunnel_hw_if_indices, t->hw_if_index);
+
+ hp = hash_get_pair (ngm->nsh_vxlan_gpe_tunnel_by_key, &key);
+ key_copy = (void *)(hp->key);
+ hash_unset_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key);
+ clib_mem_free (key_copy);
+
+ vec_free (t->rewrite);
+ pool_put (ngm->tunnels, t);
+ }
+
+ if (sw_if_indexp)
+ *sw_if_indexp = sw_if_index;
+
+ return 0;
+}
+
+static u32 fib_index_from_fib_id (u32 fib_id)
+{
+ ip4_main_t * im = &ip4_main;
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (!p)
+ return ~0;
+
+ return p[0];
+}
+
+static uword unformat_decap_next (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 tmp;
+
+ if (unformat (input, "drop"))
+ *result = NSH_VXLAN_GPE_INPUT_NEXT_DROP;
+ else if (unformat (input, "ip4"))
+ *result = NSH_VXLAN_GPE_INPUT_NEXT_IP4_INPUT;
+ else if (unformat (input, "ip6"))
+ *result = NSH_VXLAN_GPE_INPUT_NEXT_IP6_INPUT;
+ else if (unformat (input, "ethernet"))
+ *result = NSH_VXLAN_GPE_INPUT_NEXT_IP6_INPUT;
+ else if (unformat (input, "nsh-vxlan-gpe"))
+ *result = NSH_VXLAN_GPE_INPUT_NEXT_NSH_VXLAN_GPE_ENCAP;
+ else if (unformat (input, "%d", &tmp))
+ *result = tmp;
+ else
+ return 0;
+ return 1;
+}
+
+static clib_error_t *
+nsh_vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t src, dst;
+ u8 is_add = 1;
+ u8 src_set = 0;
+ u8 dst_set = 0;
+ u32 encap_fib_index = 0;
+ u32 decap_fib_index = 0;
+ u8 ver_o_c = 0;
+ u8 length = 0;
+ u8 md_type = 0;
+ u8 next_protocol = 1; /* default: ip4 */
+ u32 decap_next_index = NSH_VXLAN_GPE_INPUT_NEXT_IP4_INPUT;
+ u32 spi;
+ u8 spi_set = 0;
+ u32 si;
+ u32 vni;
+ u8 vni_set = 0;
+ u8 si_set = 0;
+ u32 spi_si;
+ u32 c1 = 0;
+ u32 c2 = 0;
+ u32 c3 = 0;
+ u32 c4 = 0;
+ u32 *tlvs = 0;
+ u32 tmp;
+ int rv;
+ vnet_nsh_vxlan_gpe_add_del_tunnel_args_t _a, * a = &_a;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "src %U",
+ unformat_ip4_address, &src))
+ src_set = 1;
+ else if (unformat (line_input, "dst %U",
+ unformat_ip4_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "encap-vrf-id %d", &tmp))
+ {
+ encap_fib_index = fib_index_from_fib_id (tmp);
+ if (encap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent encap fib id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-vrf-id %d", &tmp))
+ {
+ decap_fib_index = fib_index_from_fib_id (tmp);
+ if (decap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent decap fib id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-next %U", unformat_decap_next,
+ &decap_next_index))
+ ;
+ else if (unformat (line_input, "vni %d", &vni))
+ vni_set = 1;
+ else if (unformat (line_input, "version %d", &tmp))
+ ver_o_c |= (tmp & 3) << 6;
+ else if (unformat (line_input, "o-bit %d", &tmp))
+ ver_o_c |= (tmp & 1) << 5;
+ else if (unformat (line_input, "c-bit %d", &tmp))
+ ver_o_c |= (tmp & 1) << 4;
+ else if (unformat (line_input, "md-type %d", &tmp))
+ md_type = tmp;
+ else if (unformat(line_input, "next-ip4"))
+ next_protocol = 1;
+ else if (unformat(line_input, "next-ip6"))
+ next_protocol = 2;
+ else if (unformat(line_input, "next-ethernet"))
+ next_protocol = 3;
+ else if (unformat(line_input, "next-nsh"))
+ next_protocol = 4;
+ else if (unformat (line_input, "c1 %d", &c1))
+ ;
+ else if (unformat (line_input, "c2 %d", &c2))
+ ;
+ else if (unformat (line_input, "c3 %d", &c3))
+ ;
+ else if (unformat (line_input, "c4 %d", &c4))
+ ;
+ else if (unformat (line_input, "spi %d", &spi))
+ spi_set = 1;
+ else if (unformat (line_input, "si %d", &si))
+ si_set = 1;
+ else if (unformat (line_input, "tlv %x"))
+ vec_add1 (tlvs, tmp);
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (src_set == 0)
+ return clib_error_return (0, "tunnel src address not specified");
+
+ if (dst_set == 0)
+ return clib_error_return (0, "tunnel dst address not specified");
+
+ if (vni_set == 0)
+ return clib_error_return (0, "vni not specified");
+
+ if (spi_set == 0)
+ return clib_error_return (0, "spi not specified");
+
+ if (si_set == 0)
+ return clib_error_return (0, "si not specified");
+
+ spi_si = (spi<<8) | si;
+
+ memset (a, 0, sizeof (*a));
+
+ a->is_add = is_add;
+
+#define _(x) a->x = x;
+ foreach_copy_field;
+#undef _
+
+  rv = vnet_nsh_vxlan_gpe_add_del_tunnel (a, 0 /* sw_if_indexp */);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_DECAP_NEXT:
+ return clib_error_return (0, "invalid decap-next...");
+
+ case VNET_API_ERROR_TUNNEL_EXIST:
+ return clib_error_return (0, "tunnel already exists...");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "tunnel does not exist...");
+
+ default:
+ return clib_error_return
+ (0, "vnet_nsh_vxlan_gpe_add_del_tunnel returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_nsh_vxlan_gpe_tunnel_command, static) = {
+  .path = "nsh vxlan tunnel",
+  .short_help =
+  "nsh vxlan tunnel src <ip4-addr> dst <ip4-addr>"
+  " c1 <nn> c2 <nn> c3 <nn> c4 <nn> spi <nn> si <nn> vni <nn>\n"
+  " [encap-vrf-id <nn>] [decap-vrf-id <nn>] [version <n>]\n"
+  " [o-bit <1|0>] [c-bit <1|0>] [md-type <nn>]\n"
+  " [next-ip4|next-ip6|next-ethernet|next-nsh] [tlv <xx>]\n"
+  " [decap-next [drop|ip4|ip6|ethernet|nsh-vxlan-gpe]] [del]\n",
+  .function = nsh_vxlan_gpe_add_del_tunnel_command_fn,
+};
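+
+/*
+ * Example invocation (values illustrative):
+ *
+ *   nsh vxlan tunnel src 192.168.1.1 dst 192.168.1.2 vni 13
+ *     spi 100 si 4 c1 1 c2 2 c3 3 c4 4 next-ip4 decap-next ip4
+ */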
+
+static clib_error_t *
+show_nsh_vxlan_gpe_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ nsh_vxlan_gpe_main_t * ngm = &nsh_vxlan_gpe_main;
+ nsh_vxlan_gpe_tunnel_t * t;
+
+ if (pool_elts (ngm->tunnels) == 0)
+ vlib_cli_output (vm, "No nsh-vxlan-gpe tunnels configured...");
+
+ pool_foreach (t, ngm->tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_nsh_vxlan_gpe_tunnel, t);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_nsh_vxlan_gpe_tunnel_command, static) = {
+ .path = "show nsh vxlan tunnel",
+ .function = show_nsh_vxlan_gpe_tunnel_command_fn,
+};
+
+clib_error_t *nsh_vxlan_gpe_init (vlib_main_t *vm)
+{
+ nsh_vxlan_gpe_main_t *ngm = &nsh_vxlan_gpe_main;
+
+ ngm->vnet_main = vnet_get_main();
+ ngm->vlib_main = vm;
+
+ ngm->nsh_vxlan_gpe_tunnel_by_key
+ = hash_create_mem (0, sizeof(nsh_vxlan_gpe_tunnel_key_t), sizeof (uword));
+
+ udp_register_dst_port (vm, UDP_DST_PORT_vxlan_gpe,
+ nsh_vxlan_gpe_input_node.index, 1 /* is_ip4 */);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(nsh_vxlan_gpe_init);
+
diff --git a/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h b/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h
new file mode 100644
index 00000000000..953035a472b
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_nsh_vxlan_gpe_h
+#define included_vnet_nsh_vxlan_gpe_h
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/gre/gre.h>
+#include <vnet/nsh-gre/nsh_gre_packet.h>
+#include <vnet/nsh-vxlan-gpe/vxlan_gpe_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/udp.h>
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4; /* 20 bytes */
+ udp_header_t udp; /* 8 bytes */
+ vxlan_gpe_header_t vxlan; /* 8 bytes */
+  nsh_header_t nsh;          /* 24 bytes */
+}) ip4_vxlan_gpe_and_nsh_header_t;
+
+typedef CLIB_PACKED(struct {
+ /*
+ * Key fields: ip src, vxlan vni, nsh spi_si
+ * all fields in NET byte order
+ */
+ union {
+ struct {
+ u32 src;
+ u32 vni; /* shifted 8 bits */
+ u32 spi_si;
+ u32 pad;
+ };
+ u64 as_u64[2];
+ };
+}) nsh_vxlan_gpe_tunnel_key_t;
+
+typedef struct {
+ /* Rewrite string. $$$$ embed vnet_rewrite header */
+ u8 * rewrite;
+
+ /* decap next index */
+ u32 decap_next_index;
+
+ /* tunnel src and dst addresses */
+ ip4_address_t src;
+ ip4_address_t dst;
+
+ /* FIB indices */
+ u32 encap_fib_index; /* tunnel partner lookup here */
+ u32 decap_fib_index; /* inner IP lookup here */
+
+ /* vxlan VNI in HOST byte order, shifted left 8 bits */
+ u32 vni;
+
+ /* vnet intfc hw/sw_if_index */
+ u32 hw_if_index;
+ u32 sw_if_index;
+
+ /* NSH header fields in HOST byte order */
+ u8 ver_o_c;
+ u8 length;
+ u8 md_type;
+ u8 next_protocol;
+ u32 spi_si;
+
+ /* Context headers, always present, in HOST byte order */
+ u32 c1, c2, c3, c4;
+ u32 * tlvs;
+} nsh_vxlan_gpe_tunnel_t;
+
+#define foreach_nsh_vxlan_gpe_input_next \
+_(DROP, "error-drop") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input") \
+_(ETHERNET_INPUT, "ethernet-input") \
+_(NSH_VXLAN_GPE_ENCAP, "nsh-vxlan-gpe-encap")
+
+typedef enum {
+#define _(s,n) NSH_VXLAN_GPE_INPUT_NEXT_##s,
+ foreach_nsh_vxlan_gpe_input_next
+#undef _
+ NSH_VXLAN_GPE_INPUT_N_NEXT,
+} nsh_vxlan_gpe_input_next_t;
+
+typedef enum {
+#define nsh_vxlan_gpe_error(n,s) NSH_VXLAN_GPE_ERROR_##n,
+#include <vnet/nsh-vxlan-gpe/nsh_vxlan_gpe_error.def>
+#undef nsh_vxlan_gpe_error
+ NSH_VXLAN_GPE_N_ERROR,
+} nsh_vxlan_gpe_input_error_t;
+
+typedef struct {
+ /* vector of encap tunnel instances */
+ nsh_vxlan_gpe_tunnel_t *tunnels;
+
+ /* lookup tunnel by key */
+ uword * nsh_vxlan_gpe_tunnel_by_key;
+
+ /* Free vlib hw_if_indices */
+ u32 * free_nsh_vxlan_gpe_tunnel_hw_if_indices;
+
+ /* show device instance by real device instance */
+ u32 * dev_inst_by_real;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} nsh_vxlan_gpe_main_t;
+
+nsh_vxlan_gpe_main_t nsh_vxlan_gpe_main;
+
+vlib_node_registration_t nsh_vxlan_gpe_input_node;
+vlib_node_registration_t nsh_vxlan_gpe_encap_node;
+
+u8 * format_nsh_vxlan_gpe_encap_trace (u8 * s, va_list * args);
+
+typedef struct {
+ u8 is_add;
+ ip4_address_t src, dst;
+ u32 encap_fib_index;
+ u32 decap_fib_index;
+ u32 decap_next_index;
+ u32 vni;
+ u8 ver_o_c;
+ u8 length;
+ u8 md_type;
+ u8 next_protocol;
+ u32 spi_si;
+ u32 c1, c2, c3, c4;
+ u32 * tlvs;
+} vnet_nsh_vxlan_gpe_add_del_tunnel_args_t;
+
+int vnet_nsh_vxlan_gpe_add_del_tunnel
+(vnet_nsh_vxlan_gpe_add_del_tunnel_args_t *a, u32 * sw_if_indexp);
+
+#endif /* included_vnet_nsh_vxlan_gpe_h */
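
The control plane and the decap node must build the hash key identically, with every field in network byte order. A sketch (remote is an ip4_address_t; vni and spi_si are assumed host-order inputs):

    nsh_vxlan_gpe_tunnel_key_t key;
    uword *p;

    key.src = remote.as_u32;                    /* already net order */
    key.vni = clib_host_to_net_u32 (vni << 8);
    key.spi_si = clib_host_to_net_u32 (spi_si);
    key.pad = 0;

    p = hash_get_mem (ngm->nsh_vxlan_gpe_tunnel_by_key, &key);
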
diff --git a/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe_error.def b/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe_error.def
new file mode 100644
index 00000000000..4ba64fe4dc5
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/nsh_vxlan_gpe_error.def
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+nsh_vxlan_gpe_error (DECAPSULATED, "good packets decapsulated")
+nsh_vxlan_gpe_error (NO_SUCH_TUNNEL, "no such tunnel packets")
diff --git a/vnet/vnet/nsh-vxlan-gpe/vxlan-gpe-rfc.txt b/vnet/vnet/nsh-vxlan-gpe/vxlan-gpe-rfc.txt
new file mode 100644
index 00000000000..35cee50f573
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/vxlan-gpe-rfc.txt
@@ -0,0 +1,868 @@
+Network Working Group P. Quinn
+Internet-Draft Cisco Systems, Inc.
+Intended status: Experimental P. Agarwal
+Expires: January 4, 2015 Broadcom
+ R. Fernando
+ L. Kreeger
+ D. Lewis
+ F. Maino
+ M. Smith
+ N. Yadav
+ Cisco Systems, Inc.
+ L. Yong
+ Huawei USA
+ X. Xu
+ Huawei Technologies
+ U. Elzur
+ Intel
+ P. Garg
+ Microsoft
+ July 3, 2014
+
+
+ Generic Protocol Extension for VXLAN
+ draft-quinn-vxlan-gpe-03.txt
+
+Abstract
+
+ This draft describes extending Virtual eXtensible Local Area Network
+ (VXLAN), via changes to the VXLAN header, with three new
+ capabilities: support for multi-protocol encapsulation, operations,
+ administration and management (OAM) signaling and explicit
+ versioning.
+
+Status of this Memo
+
+ This Internet-Draft is submitted in full conformance with the
+ provisions of BCP 78 and BCP 79.
+
+ Internet-Drafts are working documents of the Internet Engineering
+ Task Force (IETF). Note that other groups may also distribute
+ working documents as Internet-Drafts. The list of current Internet-
+ Drafts is at http://datatracker.ietf.org/drafts/current/.
+
+ Internet-Drafts are draft documents valid for a maximum of six months
+ and may be updated, replaced, or obsoleted by other documents at any
+ time. It is inappropriate to use Internet-Drafts as reference
+ material or to cite them other than as "work in progress."
+
+ This Internet-Draft will expire on January 4, 2015.
+
+Copyright Notice
+
+ Copyright (c) 2014 IETF Trust and the persons identified as the
+ document authors. All rights reserved.
+
+ This document is subject to BCP 78 and the IETF Trust's Legal
+ Provisions Relating to IETF Documents
+ (http://trustee.ietf.org/license-info) in effect on the date of
+ publication of this document. Please review these documents
+ carefully, as they describe your rights and restrictions with respect
+ to this document. Code Components extracted from this document must
+ include Simplified BSD License text as described in Section 4.e of
+ the Trust Legal Provisions and are provided without warranty as
+ described in the Simplified BSD License.
+
+Table of Contents
+
+   1.  Introduction
+   2.  VXLAN Without Protocol Extension
+   3.  Generic Protocol Extension VXLAN (VXLAN-gpe)
+     3.1.  Multi Protocol Support
+     3.2.  OAM Support
+     3.3.  Version Bits
+   4.  Backward Compatibility
+     4.1.  VXLAN VTEP to VXLAN-gpe VTEP
+     4.2.  VXLAN-gpe VTEP to VXLAN VTEP
+     4.3.  VXLAN-gpe UDP Ports
+     4.4.  VXLAN-gpe and Encapsulated IP Header Fields
+   5.  VXLAN-gpe Examples
+   6.  Security Considerations
+   7.  Acknowledgments
+   8.  IANA Considerations
+     8.1.  UDP Port
+     8.2.  VXLAN-gpe Next Protocol
+     8.3.  VXLAN-gpe Reserved Bits
+   9.  References
+     9.1.  Normative References
+     9.2.  Informative References
+   Authors' Addresses
+
+1. Introduction
+
+ Virtual eXtensible Local Area Network [VXLAN] defines an
+ encapsulation format that encapsulates Ethernet frames in an outer
+ UDP/IP transport. As data centers evolve, the need to carry other
+ protocols encapsulated in an IP packet is required, as well as the
+ need to provide increased visibility and diagnostic capabilities
+ within the overlay. The VXLAN header does not specify the protocol
+ being encapsulated and therefore is currently limited to
+ encapsulating only Ethernet frame payload, nor does it provide the
+ ability to define OAM protocols. Rather than defining yet another
+ encapsulation, VXLAN is extended to provide protocol typing and OAM
+ capabilities.
+
+ This document describes extending VXLAN via the following changes:
+
+ Next Protocol Bit (P bit): A reserved flag bit is allocated, and set
+ in the VXLAN-gpe header to indicate that a next protocol field is
+ present.
+
+ OAM Flag Bit (O bit): A reserved flag bit is allocated, and set in
+ the VXLAN-gpe header, to indicate that the packet is an OAM
+ packet.
+
+ Version: Two reserved bits are allocated, and set in the VXLAN-gpe
+ header, to indicate VXLAN-gpe protocol version.
+
+   Next Protocol:  An 8-bit next protocol field is present in the
+      VXLAN-gpe header.
+
+2. VXLAN Without Protocol Extension
+
+ As described in the introduction, the VXLAN header has no protocol
+ identifier that indicates the type of payload being carried by VXLAN.
+ Because of this, VXLAN is limited to an Ethernet payload.
+ Furthermore, the VXLAN header has no mechanism to signal OAM packets.
+
+ The VXLAN header defines bits 0-7 as flags (some defined, some
+ reserved), the VXLAN network identifier (VNI) field and several
+ reserved bits. The flags provide flexibility to define how the
+ reserved bits can be used to change the definition of the VXLAN
+ header.
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|R|R|R| Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+ Figure 1: VXLAN Header
+
+3. Generic Protocol Extension VXLAN (VXLAN-gpe)
+
+3.1. Multi Protocol Support
+
+ This draft defines the following two changes to the VXLAN header in
+ order to support multi-protocol encapsulation:
+
+ P Bit: Flag bit 5 is defined as the Next Protocol bit. The P bit
+ MUST be set to 1 to indicate the presence of the 8 bit next
+ protocol field.
+
+ P = 0 indicates that the payload MUST conform to VXLAN as defined
+ in [VXLAN].
+
+ Flag bit 5 was chosen as the P bit because this flag bit is
+ currently reserved in VXLAN.
+
+ Next Protocol Field: The lower 8 bits of the first word are used to
+ carry a next protocol. This next protocol field contains the
+ protocol of the encapsulated payload packet. A new protocol
+ registry will be requested from IANA.
+
+ This draft defines the following Next Protocol values:
+
+ 0x1 : IPv4
+ 0x2 : IPv6
+ 0x3 : Ethernet
+ 0x4 : Network Service Header [NSH]
+
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|P|R|R| Reserved |Next Protocol |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 2: VXLAN-gpe Next Protocol
+
+3.2. OAM Support
+
+ Flag bit 7 is defined as the O bit. When the O bit is set to 1, the
+ packet is an OAM packet and OAM processing MUST occur. The OAM
+ protocol details are out of scope for this document. As with the
+ P-bit, bit 7 is currently a reserved flag in VXLAN.
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|P|R|O| Reserved |Next Protocol |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 3: VXLAN-gpe OAM Bit
+
+3.3. Version Bits
+
+ VXLAN-gpe bits 8 and 9 are defined as version bits. These bits are
+ reserved in VXLAN. The version field is used to ensure backward
+ compatibility going forward with future VXLAN-gpe updates.
+
+ The initial version for VXLAN-gpe is 0.
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|P|R|O|Ver| Reserved |Next Protocol |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+
+ Figure 4: VXLAN-gpe Version Bits
+
+4. Backward Compatibility
+
+4.1. VXLAN VTEP to VXLAN-gpe VTEP
+
+ As per VXLAN, reserved bits 5 and 7, VXLAN-gpe P and O-bits
+ respectively must be set to zero. The remaining reserved bits must
+ be zero, including the VXLAN-gpe version field, bits 8 and 9. The
+ encapsulated payload MUST be Ethernet.
+
+4.2. VXLAN-gpe VTEP to VXLAN VTEP
+
+ A VXLAN-gpe VTEP MUST NOT encapsulate non-Ethernet frames to a VXLAN
+ VTEP. When encapsulating Ethernet frames to a VXLAN VTEP, the VXLAN-
+ gpe VTEP will set the P bit to 0, the Next Protocol to 0 and use UDP
+ destination port 4789. A VXLAN-gpe VTEP MUST also set O = 0 and Ver
+ = 0 when encapsulating Ethernet frames to VXLAN VTEP. The receiving
+   VXLAN VTEP will treat this packet as a VXLAN packet.
+
+ A method for determining the capabilities of a VXLAN VTEP (gpe or
+ non-gpe) is out of the scope of this draft.
+
+4.3. VXLAN-gpe UDP Ports
+
+ VXLAN-gpe uses a new UDP destination port (to be assigned by IANA)
+ when sending traffic to VXLAN-gpe VTEPs.
+
+4.4. VXLAN-gpe and Encapsulated IP Header Fields
+
+ When encapsulating and decapsulating IPv4 and IPv6 packets, certain
+ fields, such as IPv4 Time to Live (TTL) from the inner IP header need
+ to be considered. VXLAN-gpe IP encapsulation and decapsulation
+ utilizes the techniques described in [RFC6830], section 5.3.
+
+5. VXLAN-gpe Examples
+
+ This section provides three examples of protocols encapsulated using
+ the Generic Protocol Extension for VXLAN described in this document.
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|1|R|0|0|0| Reserved | NP = IPv4 |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Original IPv4 Packet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 5: IPv4 and VXLAN-gpe
+
+
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|1|R|0|0|0| Reserved | NP = IPv6 |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Original IPv6 Packet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 6: IPv6 and VXLAN-gpe
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |R|R|R|R|I|1|R|0|0|0| Reserved |NP = Ethernet |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | VXLAN Network Identifier (VNI) | Reserved |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Original Ethernet Frame |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+ Figure 7: Ethernet and VXLAN-gpe
+
+6. Security Considerations
+
+ VXLAN's security is focused on issues around L2 encapsulation into
+ L3. With VXLAN-gpe, issues such as spoofing, flooding, and traffic
+ redirection are dependent on the particular protocol payload
+ encapsulated.
+
+7. Acknowledgments
+
+ A special thank you goes to Dino Farinacci for his guidance and
+ detailed review.
+
+8. IANA Considerations
+
+8.1. UDP Port
+
+ A new UDP port will be requested from IANA.
+
+8.2. VXLAN-gpe Next Protocol
+
+ IANA is requested to set up a registry of "Next Protocol". These are
+ 8-bit values. Next Protocol values 0, 1, 2, 3 and 4 are defined in
+ this draft. New values are assigned via Standards Action [RFC5226].
+
+ +---------------+-------------+---------------+
+ | Next Protocol | Description | Reference |
+ +---------------+-------------+---------------+
+ | 0 | Reserved | This document |
+ | | | |
+ | 1 | IPv4 | This document |
+ | | | |
+ | 2 | IPv6 | This document |
+ | | | |
+ | 3 | Ethernet | This document |
+ | | | |
+ | 4 | NSH | This document |
+ | | | |
+ | 5..253 | Unassigned | |
+ +---------------+-------------+---------------+
+
+ Table 1
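+
+ For illustration only, a decapsulating implementation might dispatch
+ on these values as follows (editorial sketch; the next-node names are
+ hypothetical):
+
+   switch (gpe->next_protocol)
+     {
+     case 1: next = NEXT_IP4_INPUT;      break;
+     case 2: next = NEXT_IP6_INPUT;      break;
+     case 3: next = NEXT_ETHERNET_INPUT; break;
+     case 4: next = NEXT_NSH_INPUT;      break;
+     default: next = NEXT_DROP;          break;
+     }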
+
+8.3. VXLAN-gpe Reserved Bits
+
+ There are ten bits at the beginning of the VXLAN-gpe header. New
+ bits are assigned via Standards Action [RFC5226].
+
+ Bits 0-3 - Reserved
+ Bit 4 - Instance ID (I bit)
+ Bit 5 - Next Protocol (P bit)
+ Bit 6 - Reserved
+ Bit 7 - OAM (O bit)
+ Bits 8-9 - Version
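+
+ Since bit 0 is the most significant bit of the first octet, these
+ positions correspond to the following single-octet masks (editorial
+ sketch, matching the header file added later in this patch):
+
+   #define GPE_FLAG_I   0x08  /* bit 4 */
+   #define GPE_FLAG_P   0x04  /* bit 5 */
+   #define GPE_FLAG_O   0x01  /* bit 7 */
+   #define GPE_VER_MASK 0xc0  /* bits 8-9: top two bits of 2nd octet */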
+
+9. References
+
+9.1. Normative References
+
+ [RFC0768] Postel, J., "User Datagram Protocol", STD 6, RFC 768,
+ August 1980.
+
+ [RFC0791] Postel, J., "Internet Protocol", STD 5, RFC 791,
+ September 1981.
+
+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
+ Requirement Levels", BCP 14, RFC 2119, March 1997.
+
+ [RFC5226] Narten, T. and H. Alvestrand, "Guidelines for Writing an
+ IANA Considerations Section in RFCs", BCP 26, RFC 5226,
+ May 2008.
+
+9.2. Informative References
+
+ [NSH] Quinn, P., et al., "Network Service Header", 2014.
+
+ [RFC1700] Reynolds, J. and J. Postel, "Assigned Numbers", RFC 1700,
+ October 1994.
+
+ [RFC6830] Farinacci, D., Fuller, V., Meyer, D., and D. Lewis, "The
+ Locator/ID Separation Protocol (LISP)", RFC 6830,
+ January 2013.
+
+ [VXLAN] Dutt, D., Mahalingam, M., Duda, K., Agarwal, P., Kreeger,
+ L., Sridhar, T., Bursell, M., and C. Wright, "VXLAN: A
+ Framework for Overlaying Virtualized Layer 2 Networks over
+ Layer 3 Networks", 2013.
+
+Authors' Addresses
+
+ Paul Quinn
+ Cisco Systems, Inc.
+
+ Email: paulq@cisco.com
+
+
+ Puneet Agarwal
+ Broadcom
+
+ Email: pagarwal@broadcom.com
+
+
+ Rex Fernando
+ Cisco Systems, Inc.
+
+ Email: rex@cisco.com
+
+
+ Larry Kreeger
+ Cisco Systems, Inc.
+
+ Email: kreeger@cisco.com
+
+
+ Darrel Lewis
+ Cisco Systems, Inc.
+
+ Email: darlewis@cisco.com
+
+
+ Fabio Maino
+ Cisco Systems, Inc.
+
+ Email: fmaino@cisco.com
+
+
+ Michael Smith
+ Cisco Systems, Inc.
+
+ Email: michsmit@cisco.com
+
+ Navindra Yadav
+ Cisco Systems, Inc.
+
+ Email: nyadav@cisco.com
+
+
+ Lucy Yong
+ Huawei USA
+
+ Email: lucy.yong@huawei.com
+
+
+ Xiaohu Xu
+ Huawei Technologies
+
+ Email: xuxiaohu@huawei.com
+
+
+ Uri Elzur
+ Intel
+
+ Email: uri.elzur@intel.com
+
+
+ Pankaj Garg
+ Microsoft
+
+ Email: Garg.Pankaj@microsoft.com
diff --git a/vnet/vnet/nsh-vxlan-gpe/vxlan_gpe_packet.h b/vnet/vnet/nsh-vxlan-gpe/vxlan_gpe_packet.h
new file mode 100644
index 00000000000..efc85c4bb54
--- /dev/null
+++ b/vnet/vnet/nsh-vxlan-gpe/vxlan_gpe_packet.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vxlan_gpe_packet_h
+#define included_vxlan_gpe_packet_h
+
+/*
+ * From draft-quinn-vxlan-gpe-03.txt
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|I|P|R|O|Ver| Reserved |Next Protocol |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * I Bit: Flag bit 4 indicates that the VNI is valid.
+ *
+ * P Bit: Flag bit 5 is defined as the Next Protocol bit. The P bit
+ * MUST be set to 1 to indicate the presence of the 8 bit next
+ * protocol field.
+ *
+ * O Bit: Flag bit 7 is defined as the O bit. When the O bit is set to 1,
+ * the packet is an OAM packet and OAM processing MUST occur. The OAM
+ * protocol details are out of scope for this document. As with the
+ * P-bit, bit 7 is currently a reserved flag in VXLAN.
+ *
+ * VXLAN-gpe bits 8 and 9 are defined as version bits. These bits are
+ * reserved in VXLAN. The version field is used to ensure backward
+ * compatibility going forward with future VXLAN-gpe updates.
+ *
+ * The initial version for VXLAN-gpe is 0.
+ *
+ * This draft defines the following Next Protocol values:
+ *
+ * 0x1 : IPv4
+ * 0x2 : IPv6
+ * 0x3 : Ethernet
+ * 0x4 : Network Service Header [NSH]
+ */
+
+typedef struct {
+ u8 flags; /* R R R R I P R O, see bit masks below */
+ u8 ver_res; /* 2-bit version in the two high bits, rest reserved */
+ u8 res; /* reserved */
+ u8 next_protocol; /* VXLAN_NEXT_PROTOCOL_*, see below */
+ u32 vni_res; /* 24-bit VNI + 8 reserved bits, network byte order */
+} vxlan_gpe_header_t;
+
+#define VXLAN_GPE_FLAGS_I 0x08
+#define VXLAN_GPE_FLAGS_P 0x04
+#define VXLAN_GPE_FLAGS_O 0x01
+
+#define VXLAN_GPE_VERSION 0x0
+
+#define VXLAN_NEXT_PROTOCOL_IP4 0x1
+#define VXLAN_NEXT_PROTOCOL_IP6 0x2
+#define VXLAN_NEXT_PROTOCOL_ETHERNET 0x3
+#define VXLAN_NEXT_PROTOCOL_NSH 0x4
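+
+/* A minimal validity-check sketch (editorial; unused elsewhere in this
+ * patch): the P bit must be set and the version bits must be zero. */
+static inline int
+vxlan_gpe_header_is_valid (vxlan_gpe_header_t * h)
+{
+ return (h->flags & VXLAN_GPE_FLAGS_P) != 0
+   && (h->ver_res >> 6) == VXLAN_GPE_VERSION;
+}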
+
+#endif /* included_vxlan_gpe_packet_h */
diff --git a/vnet/vnet/osi/node.c b/vnet/vnet/osi/node.c
new file mode 100644
index 00000000000..723cf47f25a
--- /dev/null
+++ b/vnet/vnet/osi/node.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * osi_node.c: osi packet processing
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/osi/osi.h>
+#include <vnet/ppp/ppp.h>
+#include <vnet/hdlc/hdlc.h>
+#include <vnet/llc/llc.h>
+
+#define foreach_osi_input_next \
+ _ (PUNT, "error-punt") \
+ _ (DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) OSI_INPUT_NEXT_##s,
+ foreach_osi_input_next
+#undef _
+ OSI_INPUT_N_NEXT,
+} osi_input_next_t;
+
+typedef struct {
+ u8 packet_data[32];
+} osi_input_trace_t;
+
+static u8 * format_osi_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ osi_input_trace_t * t = va_arg (*va, osi_input_trace_t *);
+
+ s = format (s, "%U", format_osi_header, t->packet_data);
+
+ return s;
+}
+
+static uword
+osi_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ osi_main_t * lm = &osi_main;
+ u32 n_left_from, next_index, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (osi_input_trace_t));
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ osi_header_t * h0, * h1;
+ u8 next0, next1, enqueue_code;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, LOAD);
+ vlib_prefetch_buffer_header (b3, LOAD);
+
+ CLIB_PREFETCH (b2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (b3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = (void *) (b0->data + b0->current_data);
+ h1 = (void *) (b1->data + b1->current_data);
+
+ next0 = lm->input_next_by_protocol[h0->protocol];
+ next1 = lm->input_next_by_protocol[h1->protocol];
+
+ b0->error = node->errors[next0 == OSI_INPUT_NEXT_DROP ? OSI_ERROR_UNKNOWN_PROTOCOL : OSI_ERROR_NONE];
+ b1->error = node->errors[next1 == OSI_INPUT_NEXT_DROP ? OSI_ERROR_UNKNOWN_PROTOCOL : OSI_ERROR_NONE];
+
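+	  /* 2-bit code: bit 0 set if next0 differs from the cached next
+	     index, bit 1 set if next1 differs. */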
+ enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
+
+ if (PREDICT_FALSE (enqueue_code != 0))
+ {
+ switch (enqueue_code)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = bi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, bi1);
+ break;
+
+ case 3:
+ /* A B B or A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ vlib_set_next_frame_buffer (vm, node, next1, bi1);
+ if (next0 == next1)
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ next_index = next1;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ osi_header_t * h0;
+ u8 next0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ next0 = lm->input_next_by_protocol[h0->protocol];
+
+ b0->error = node->errors[next0 == OSI_INPUT_NEXT_DROP ? OSI_ERROR_UNKNOWN_PROTOCOL : OSI_ERROR_NONE];
+
+ /* Sent packet to wrong next? */
+ if (PREDICT_FALSE (next0 != next_index))
+ {
+ /* Return old frame; remove incorrectly enqueued packet. */
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);
+
+ /* Send to correct next. */
+ next_index = next0;
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static char * osi_error_strings[] = {
+#define _(f,s) s,
+ foreach_osi_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (osi_input_node) = {
+ .function = osi_input,
+ .name = "osi-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = OSI_N_ERROR,
+ .error_strings = osi_error_strings,
+
+ .n_next_nodes = OSI_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [OSI_INPUT_NEXT_##s] = n,
+ foreach_osi_input_next
+#undef _
+ },
+
+ .format_buffer = format_osi_header_with_length,
+ .format_trace = format_osi_input_trace,
+ .unformat_buffer = unformat_osi_header,
+};
+
+static clib_error_t * osi_input_init (vlib_main_t * vm)
+{
+ clib_error_t * error = 0;
+ osi_main_t * lm = &osi_main;
+
+ if ((error = vlib_call_init_function (vm, osi_init)))
+ return error;
+
+ osi_setup_node (vm, osi_input_node.index);
+
+ {
+ int i;
+ for (i = 0; i < ARRAY_LEN (lm->input_next_by_protocol); i++)
+ lm->input_next_by_protocol[i] = OSI_INPUT_NEXT_DROP;
+ }
+
+ ppp_register_input_protocol (vm, PPP_PROTOCOL_osi, osi_input_node.index);
+ hdlc_register_input_protocol (vm, HDLC_PROTOCOL_osi, osi_input_node.index);
+ llc_register_input_protocol (vm, LLC_PROTOCOL_osi_layer1, osi_input_node.index);
+ llc_register_input_protocol (vm, LLC_PROTOCOL_osi_layer2, osi_input_node.index);
+ llc_register_input_protocol (vm, LLC_PROTOCOL_osi_layer3, osi_input_node.index);
+ llc_register_input_protocol (vm, LLC_PROTOCOL_osi_layer4, osi_input_node.index);
+ llc_register_input_protocol (vm, LLC_PROTOCOL_osi_layer5, osi_input_node.index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (osi_input_init);
+
+void
+osi_register_input_protocol (osi_protocol_t protocol,
+ u32 node_index)
+{
+ osi_main_t * lm = &osi_main;
+ vlib_main_t * vm = lm->vlib_main;
+ osi_protocol_info_t * pi;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, osi_input_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ pi = osi_get_protocol_info (lm, protocol);
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ osi_input_node.index,
+ node_index);
+
+ lm->input_next_by_protocol[protocol] = pi->next_index;
+}
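+
+/* Usage sketch (editorial; "my_clnp_node" is a hypothetical node): a
+ * dissector typically claims its protocol from its own init function,
+ *
+ *   osi_register_input_protocol (OSI_PROTOCOL_clnp, my_clnp_node.index);
+ *
+ * after which osi-input routes matching packets to that node instead of
+ * error-drop. */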
diff --git a/vnet/vnet/osi/osi.c b/vnet/vnet/osi/osi.c
new file mode 100644
index 00000000000..7313dab1937
--- /dev/null
+++ b/vnet/vnet/osi/osi.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * osi.c: osi support
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/osi/osi.h>
+
+/* Global main structure. */
+osi_main_t osi_main;
+
+u8 * format_osi_protocol (u8 * s, va_list * args)
+{
+ osi_protocol_t p = va_arg (*args, u32);
+ osi_main_t * pm = &osi_main;
+ osi_protocol_info_t * pi = osi_get_protocol_info (pm, p);
+
+ if (pi)
+ s = format (s, "%s", pi->name);
+ else
+ s = format (s, "0x%02x", p);
+
+ return s;
+}
+
+u8 * format_osi_header_with_length (u8 * s, va_list * args)
+{
+ osi_main_t * pm = &osi_main;
+ osi_header_t * h = va_arg (*args, osi_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ osi_protocol_t p = h->protocol;
+ uword indent, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "osi header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "OSI %U", format_osi_protocol, p);
+
+ /* The truncation case returned above, so the payload formatter applies
+    when there is room left: '<', not '>'. Also guard against protocols
+    with no registered handler node. */
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ osi_protocol_info_t * pi = osi_get_protocol_info (pm, p);
+
+ if (pi && pi->node_index != ~0)
+ {
+ vlib_node_t * node = vlib_get_node (pm->vlib_main, pi->node_index);
+ if (node->format_buffer)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ node->format_buffer, (void *) (h + 1),
+ max_header_bytes - header_bytes);
+ }
+ }
+
+ return s;
+}
+
+u8 * format_osi_header (u8 * s, va_list * args)
+{
+ osi_header_t * h = va_arg (*args, osi_header_t *);
+ return format (s, "%U", format_osi_header_with_length, h, 0);
+}
+
+/* Returns osi protocol as an int in host byte order. */
+uword
+unformat_osi_protocol (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ osi_main_t * pm = &osi_main;
+ int p, i;
+
+ /* Numeric type. */
+ if (unformat (input, "0x%x", &p)
+ || unformat (input, "%d", &p))
+ {
+ if (p >= (1 << 8))
+ return 0;
+ *result = p;
+ return 1;
+ }
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ pm->protocol_info_by_name, &i))
+ {
+ osi_protocol_info_t * pi = vec_elt_at_index (pm->protocol_infos, i);
+ *result = pi->protocol;
+ return 1;
+ }
+
+ return 0;
+}
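+
+/* Example (editorial): given the protocols registered in osi.c, both
+ * "clnp" and "0x81" parse to OSI_PROTOCOL_clnp here. */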
+
+uword
+unformat_osi_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ osi_header_t _h, * h = &_h;
+ u8 p;
+
+ if (! unformat (input, "%U", unformat_osi_protocol, &p))
+ return 0;
+
+ h->protocol = p;
+
+ /* Add header to result (a distinct name avoids shadowing the
+    protocol value p above). */
+ {
+ void * hp;
+ u32 n_bytes = sizeof (h[0]);
+
+ vec_add2 (*result, hp, n_bytes);
+ memcpy (hp, h, n_bytes);
+ }
+
+ return 1;
+}
+
+static void add_protocol (osi_main_t * pm,
+ osi_protocol_t protocol,
+ char * protocol_name)
+{
+ osi_protocol_info_t * pi;
+ u32 i;
+
+ vec_add2 (pm->protocol_infos, pi, 1);
+ i = pi - pm->protocol_infos;
+
+ pi->name = protocol_name;
+ pi->protocol = protocol;
+ pi->next_index = pi->node_index = ~0;
+
+ hash_set (pm->protocol_info_by_protocol, protocol, i);
+ hash_set_mem (pm->protocol_info_by_name, pi->name, i);
+}
+
+static clib_error_t * osi_init (vlib_main_t * vm)
+{
+ osi_main_t * pm = &osi_main;
+
+ memset (pm, 0, sizeof (pm[0]));
+ pm->vlib_main = vm;
+
+ pm->protocol_info_by_name = hash_create_string (0, sizeof (uword));
+ pm->protocol_info_by_protocol = hash_create (0, sizeof (uword));
+
+#define _(f,n) add_protocol (pm, OSI_PROTOCOL_##f, #f);
+ foreach_osi_protocol;
+#undef _
+
+ return vlib_call_init_function (vm, osi_input_init);
+}
+
+VLIB_INIT_FUNCTION (osi_init);
+
diff --git a/vnet/vnet/osi/osi.h b/vnet/vnet/osi/osi.h
new file mode 100644
index 00000000000..e213b1ba327
--- /dev/null
+++ b/vnet/vnet/osi/osi.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * osi.h: OSI definitions
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_osi_h
+#define included_osi_h
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+
+#define foreach_osi_protocol \
+ _ (null, 0x0) \
+ _ (x_29, 0x01) \
+ _ (x_633, 0x03) \
+ _ (q_931, 0x08) \
+ _ (q_933, 0x08) \
+ _ (q_2931, 0x09) \
+ _ (q_2119, 0x0c) \
+ _ (snap, 0x80) \
+ _ (clnp, 0x81) \
+ _ (esis, 0x82) \
+ _ (isis, 0x83) \
+ _ (idrp, 0x85) \
+ _ (x25_esis, 0x8a) \
+ _ (iso10030, 0x8c) \
+ _ (iso11577, 0x8d) \
+ _ (ip6, 0x8e) \
+ _ (compressed, 0xb0) \
+ _ (sndcf, 0xc1) \
+ _ (ip4, 0xcc) \
+ _ (ppp, 0xcf)
+
+typedef enum {
+#define _(f,n) OSI_PROTOCOL_##f = n,
+ foreach_osi_protocol
+#undef _
+} osi_protocol_t;
+
+typedef struct {
+ u8 protocol;
+
+ u8 payload[0];
+} osi_header_t;
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+ /* OSI protocol (SAP type). */
+ osi_protocol_t protocol;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} osi_protocol_info_t;
+
+#define foreach_osi_error \
+ _ (NONE, "no error") \
+ _ (UNKNOWN_PROTOCOL, "unknown osi protocol")
+
+typedef enum {
+#define _(f,s) OSI_ERROR_##f,
+ foreach_osi_error
+#undef _
+ OSI_N_ERROR,
+} osi_error_t;
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ osi_protocol_info_t * protocol_infos;
+
+ /* Hash tables mapping name/protocol to protocol info index. */
+ uword * protocol_info_by_name, * protocol_info_by_protocol;
+
+ /* osi-input next index indexed by protocol. */
+ u8 input_next_by_protocol[256];
+} osi_main_t;
+
+always_inline osi_protocol_info_t *
+osi_get_protocol_info (osi_main_t * m, osi_protocol_t protocol)
+{
+ uword * p = hash_get (m->protocol_info_by_protocol, protocol);
+ return p ? vec_elt_at_index (m->protocol_infos, p[0]) : 0;
+}
+
+extern osi_main_t osi_main;
+
+/* Register given node index to take input for given osi type. */
+void
+osi_register_input_protocol (osi_protocol_t protocol,
+ u32 node_index);
+
+void osi_set_adjacency (vnet_rewrite_header_t * rw,
+ uword max_data_bytes,
+ osi_protocol_t protocol);
+
+format_function_t format_osi_protocol;
+format_function_t format_osi_header;
+format_function_t format_osi_header_with_length;
+
+/* Parse osi protocol as 0xXXXX or protocol name. */
+unformat_function_t unformat_osi_protocol;
+
+/* Parse osi header. */
+unformat_function_t unformat_osi_header;
+unformat_function_t unformat_pg_osi_header;
+
+always_inline void
+osi_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_osi_header_with_length;
+ n->unformat_buffer = unformat_osi_header;
+ pn->unformat_edit = unformat_pg_osi_header;
+}
+
+#endif /* included_osi_h */
diff --git a/vnet/vnet/osi/pg.c b/vnet/vnet/osi/pg.c
new file mode 100644
index 00000000000..c3eb1a6ed66
--- /dev/null
+++ b/vnet/vnet/osi/pg.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * osi_pg.c: packet generator osi interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/osi/osi.h>
+
+typedef struct {
+ pg_edit_t protocol;
+} pg_osi_header_t;
+
+static inline void
+pg_osi_header_init (pg_osi_header_t * e)
+{
+ pg_edit_init (&e->protocol, osi_header_t, protocol);
+}
+
+uword
+unformat_pg_osi_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_osi_header_t * h;
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (osi_header_t),
+ &group_index);
+ pg_osi_header_init (h);
+
+ error = 1;
+ if (! unformat (input, "%U",
+ unformat_pg_edit,
+ unformat_osi_protocol, &h->protocol))
+ goto done;
+
+ {
+ osi_main_t * pm = &osi_main;
+ osi_protocol_info_t * pi = 0;
+ pg_node_t * pg_node = 0;
+
+ if (h->protocol.type == PG_EDIT_FIXED)
+ {
+ u8 t = *h->protocol.values[PG_EDIT_LO];
+ pi = osi_get_protocol_info (pm, t);
+ if (pi && pi->node_index != ~0)
+ pg_node = pg_get_node (pi->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto done;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/pg/cli.c b/vnet/vnet/pg/cli.c
new file mode 100644
index 00000000000..495eac2d0b1
--- /dev/null
+++ b/vnet/vnet/pg/cli.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_cli.c: packet generator cli
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+
+#ifdef CLIB_UNIX
+#include <vnet/unix/pcap.h>
+#endif
+
+/* Root of all packet generator cli commands. */
+VLIB_CLI_COMMAND (vlib_cli_pg_command, static) = {
+ .path = "packet-generator",
+ .short_help = "Packet generator commands",
+};
+
+static clib_error_t *
+enable_disable_stream (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ pg_main_t * pg = &pg_main;
+ pg_stream_t * s;
+ int is_enable = cmd->function_arg != 0;
+ u32 stream_index = ~0;
+
+ if (unformat (input, "%U", unformat_eof))
+ ;
+ else if (unformat (input, "%U", unformat_hash_vec_string,
+ pg->stream_index_by_name, &stream_index))
+ ;
+ else
+ return clib_error_create ("unknown input `%U'",
+ format_unformat_error, input);
+
+ /* No stream specified: enable/disable all streams. */
+ if (stream_index == ~0)
+ pool_foreach (s, pg->streams, ({
+ pg_stream_enable_disable (pg, s, is_enable);
+ }));
+ else
+ {
+ /* enable/disable specified stream. */
+ s = pool_elt_at_index (pg->streams, stream_index);
+ pg_stream_enable_disable (pg, s, is_enable);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (enable_streams_cli, static) = {
+ .path = "packet-generator enable-stream",
+ .short_help = "Enable packet generator streams",
+ .function = enable_disable_stream,
+ .function_arg = 1, /* is_enable */
+};
+
+VLIB_CLI_COMMAND (disable_streams_cli, static) = {
+ .path = "packet-generator disable-stream",
+ .short_help = "Disable packet generator streams",
+ .function = enable_disable_stream,
+ .function_arg = 0, /* is_enable */
+};
+
+static u8 * format_pg_stream (u8 * s, va_list * va)
+{
+ pg_stream_t * t = va_arg (*va, pg_stream_t *);
+ u8 * v;
+
+ if (! t)
+ return format (s, "%=16s%=12s%=16s%s",
+ "Name", "Enabled", "Count", "Parameters");
+
+ s = format (s, "%-16v%=12s%16Ld",
+ t->name,
+ pg_stream_is_enabled (t) ? "Yes" : "No",
+ t->n_packets_generated);
+
+ v = 0;
+
+ v = format (v, "limit %Ld, ", t->n_packets_limit);
+
+ v = format (v, "rate %.2e pps, ", t->rate_packets_per_second);
+
+ v = format (v, "size %d%c%d, ",
+ t->min_packet_bytes,
+ t->packet_size_edit_type == PG_EDIT_RANDOM ? '+' : '-',
+ t->max_packet_bytes);
+
+ v = format (v, "buffer-size %d, ", t->buffer_bytes);
+
+ if (v)
+ {
+ s = format (s, " %v", v);
+ vec_free (v);
+ }
+
+ return s;
+}
+
+static clib_error_t *
+show_streams (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ pg_main_t * pg = &pg_main;
+ pg_stream_t * s;
+
+ if (pool_elts (pg->streams) == 0)
+ {
+ vlib_cli_output (vm, "no streams currently defined");
+ goto done;
+ }
+
+ vlib_cli_output (vm, "%U", format_pg_stream, 0);
+ pool_foreach (s, pg->streams, ({
+ vlib_cli_output (vm, "%U", format_pg_stream, s);
+ }));
+
+ done:
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_streams_cli, static) = {
+ .path = "show packet-generator",
+ .short_help = "Show packet generator streams",
+ .function = show_streams,
+};
+
+static clib_error_t *
+pg_pcap_read (pg_stream_t * s, char * file_name)
+{
+#ifndef CLIB_UNIX
+ return clib_error_return (0, "no pcap support");
+#else
+ pcap_main_t pm;
+ clib_error_t * error;
+ memset (&pm, 0, sizeof (pm));
+ pm.file_name = file_name;
+ error = pcap_read (&pm);
+ s->replay_packet_templates = pm.packets_read;
+ s->min_packet_bytes = pm.min_packet_bytes;
+ s->max_packet_bytes = pm.max_packet_bytes;
+ s->buffer_bytes = pm.max_packet_bytes;
+ /* For PCAP buffers we never re-use buffers. */
+ s->flags |= PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE;
+ return error;
+#endif /* CLIB_UNIX */
+}
+
+static uword
+unformat_pg_stream_parameter (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ f64 x;
+
+ if (unformat (input, "limit %f", &x))
+ s->n_packets_limit = x;
+
+ else if (unformat (input, "rate %f", &x))
+ s->rate_packets_per_second = x;
+
+ else if (unformat (input, "size %d-%d", &s->min_packet_bytes,
+ &s->max_packet_bytes))
+ s->packet_size_edit_type = PG_EDIT_INCREMENT;
+
+ else if (unformat (input, "size %d+%d", &s->min_packet_bytes,
+ &s->max_packet_bytes))
+ s->packet_size_edit_type = PG_EDIT_RANDOM;
+
+ else if (unformat (input, "buffer-size %d", &s->buffer_bytes))
+ ;
+
+ else
+ return 0;
+
+ return 1;
+}
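+
+/* Example (editorial sketch): "limit 100 rate 10000 size 64-1500" stops
+ * after 100 packets, sends 10000 packets/second, and increments the
+ * packet size from 64 to 1500 bytes; "size 64+1500" would randomize it. */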
+
+static clib_error_t *
+validate_stream (pg_stream_t * s)
+{
+ if (s->max_packet_bytes < s->min_packet_bytes)
+ return clib_error_create ("max-size < min-size");
+
+ if (s->buffer_bytes >= 4096 || s->buffer_bytes == 0)
+ return clib_error_create ("buffer-size must be positive and < 4096, given %d",
+ s->buffer_bytes);
+
+ if (s->rate_packets_per_second < 0)
+ return clib_error_create ("negative rate");
+
+ return 0;
+}
+
+static clib_error_t *
+new_stream (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ clib_error_t * error = 0;
+ u8 * tmp = 0;
+ u32 hw_if_index;
+ unformat_input_t sub_input = {0};
+ int sub_input_given = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ pg_main_t * pg = &pg_main;
+ pg_stream_t s = {0};
+ char * pcap_file_name;
+
+ s.sw_if_index[VLIB_RX] = s.sw_if_index[VLIB_TX] = ~0;
+ s.node_index = ~0;
+ s.max_packet_bytes = s.min_packet_bytes = 64;
+ s.buffer_bytes = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+ pcap_file_name = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "name %v", &tmp))
+ {
+ if (s.name)
+ vec_free (s.name);
+ s.name = tmp;
+ }
+
+ else if (unformat (input, "node %U",
+ unformat_vnet_hw_interface, vnm, &hw_if_index))
+ {
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ s.node_index = hi->output_node_index;
+ s.sw_if_index[VLIB_TX] = hi->sw_if_index;
+ }
+
+ else if (unformat (input, "node %U",
+ unformat_vlib_node, vm, &s.node_index))
+ ;
+
+ else if (unformat (input, "interface %U",
+ unformat_vnet_sw_interface, vnm, &s.sw_if_index[VLIB_RX]))
+ ;
+
+ else if (unformat (input, "pcap %s", &pcap_file_name))
+ ;
+
+ else if (! sub_input_given
+ && unformat (input, "data %U", unformat_input, &sub_input))
+ sub_input_given++;
+
+ else if (unformat_user (input, unformat_pg_stream_parameter, &s))
+ ;
+
+ else if (unformat (input, "no-recycle"))
+ s.flags |= PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE;
+
+ else
+ {
+ error = clib_error_create ("unknown input `%U'",
+ format_unformat_error, input);
+ goto done;
+ }
+ }
+
+ error = validate_stream (&s);
+ if (error)
+ goto done; /* frees the partially constructed stream */
+
+ if (! sub_input_given && ! pcap_file_name)
+ {
+ error = clib_error_create ("no packet data given");
+ goto done;
+ }
+
+ if (s.node_index == ~0)
+ {
+ error = clib_error_create ("output interface or node not given");
+ goto done;
+ }
+
+ {
+ pg_node_t * n;
+
+ if (s.node_index < vec_len (pg->nodes))
+ n = pg->nodes + s.node_index;
+ else
+ n = 0;
+
+ if (pcap_file_name != 0)
+ {
+ error = pg_pcap_read (&s, pcap_file_name);
+ if (error)
+ goto done;
+ vec_free (pcap_file_name);
+ }
+
+ else if (n && n->unformat_edit
+ && unformat_user (&sub_input, n->unformat_edit, &s))
+ ;
+
+ else if (! unformat_user (&sub_input, unformat_pg_payload, &s))
+ {
+ error = clib_error_create
+ ("failed to parse packet data from `%U'",
+ format_unformat_error, &sub_input);
+ goto done;
+ }
+ }
+
+ pg_stream_add (pg, &s);
+ return 0;
+
+ done:
+ pg_stream_free (&s);
+ unformat_free (&sub_input);
+ return error;
+}
+
+VLIB_CLI_COMMAND (new_stream_cli, static) = {
+ .path = "packet-generator new",
+ .function = new_stream,
+ .short_help = "Create packet generator stream",
+ .long_help =
+ "Create packet generator stream\n"
+ "\n"
+ "Arguments:\n"
+ "\n"
+ "name STRING sets stream name\n"
+ "interface STRING interface for stream output \n"
+ "node NODE-NAME node for stream output\n"
+ "data STRING specifies packet data\n",
+};
+
+static clib_error_t *
+del_stream (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ pg_main_t * pg = &pg_main;
+ u32 i;
+
+ if (! unformat (input, "%U",
+ &unformat_hash_vec_string, pg->stream_index_by_name, &i))
+ return clib_error_create ("expected stream name `%U'",
+ format_unformat_error, input);
+
+ pg_stream_del (pg, i);
+ return 0;
+}
+
+VLIB_CLI_COMMAND (del_stream_cli, static) = {
+ .path = "packet-generator delete",
+ .function = del_stream,
+ .short_help = "Delete stream with given name",
+};
+
+static clib_error_t *
+change_stream_parameters (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ pg_main_t * pg = &pg_main;
+ pg_stream_t * s, s_new;
+ u32 stream_index = ~0;
+ clib_error_t * error;
+
+ if (unformat (input, "%U", unformat_hash_vec_string,
+ pg->stream_index_by_name, &stream_index))
+ ;
+ else
+ return clib_error_create ("expecting stream name; got `%U'",
+ format_unformat_error, input);
+
+ s = pool_elt_at_index (pg->streams, stream_index);
+ s_new = s[0];
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat_user (input, unformat_pg_stream_parameter, &s_new))
+ ;
+
+ else
+ return clib_error_create ("unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ error = validate_stream (&s_new);
+ if (! error)
+ s[0] = s_new;
+
+ return error;
+}
+
+VLIB_CLI_COMMAND (change_stream_parameters_cli, static) = {
+ .path = "packet-generator configure",
+ .short_help = "Change packet generator stream parameters",
+ .function = change_stream_parameters,
+};
+
+/* Dummy init function so that we can be linked in. */
+static clib_error_t * pg_cli_init (vlib_main_t * vm)
+{ return 0; }
+
+VLIB_INIT_FUNCTION (pg_cli_init);
diff --git a/vnet/vnet/pg/edit.c b/vnet/vnet/pg/edit.c
new file mode 100644
index 00000000000..9c49cef8903
--- /dev/null
+++ b/vnet/vnet/pg/edit.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_edit.c: packet generator edits
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+
+static void
+pg_edit_set_value_helper (pg_edit_t * e, u64 value, u8 * result)
+{
+ int i, j, n_bits_left;
+ u8 * v, tmp[8];
+
+ v = tmp;
+
+ n_bits_left = e->n_bits;
+ i = 0;
+ j = e->lsb_bit_offset % BITS (v[0]);
+
+ if (n_bits_left > 0 && j != 0)
+ {
+ v[i] = (value & 0xff) << j;
+ value >>= BITS (v[0]) - j;
+ n_bits_left -= BITS (v[0]) - j;
+ i += 1;
+ }
+
+ while (n_bits_left > 0)
+ {
+ v[i] = value & 0xff;
+ value >>= 8;
+ n_bits_left -= 8;
+ i += 1;
+ }
+
+ /* Convert to network byte order. */
+ for (j = 0; j < i; j++)
+ result[j] = v[i - 1 - j];
+}
+
+void
+pg_edit_set_value (pg_edit_t * e, int hi_or_lo, u64 value)
+{
+ pg_edit_alloc_value (e, hi_or_lo);
+ pg_edit_set_value_helper (e, value, e->values[hi_or_lo]);
+}
+
+/* Parse an int either %d or 0x%x into network byte order. */
+uword unformat_pg_number (unformat_input_t * input, va_list * args)
+{
+ u8 * result = va_arg (*args, u8 *);
+ pg_edit_t * e = va_arg (*args, pg_edit_t *);
+ u64 value;
+
+ ASSERT (BITS (value) >= e->n_bits);
+
+ if (! unformat (input, "0x%X", sizeof (value), &value)
+ && ! unformat (input, "%D", sizeof (value), &value))
+ return 0;
+
+ /* Number given does not fit into bit field. */
+ if (e->n_bits < 64
+ && value >= (u64) 1 << (u64) e->n_bits)
+ return 0;
+
+ pg_edit_set_value_helper (e, value, result);
+ return 1;
+}
+
+uword
+unformat_pg_edit (unformat_input_t * input, va_list * args)
+{
+ unformat_function_t * f = va_arg (*args, unformat_function_t *);
+ pg_edit_t * e = va_arg (*args, pg_edit_t *);
+
+ pg_edit_alloc_value (e, PG_EDIT_LO);
+ if (! unformat_user (input, f, e->values[PG_EDIT_LO], e))
+ return 0;
+
+ pg_edit_alloc_value (e, PG_EDIT_HI);
+ if (unformat (input, "-%U", f, e->values[PG_EDIT_HI], e))
+ e->type = PG_EDIT_INCREMENT;
+ else if (unformat (input, "+%U", f, e->values[PG_EDIT_HI], e))
+ e->type = PG_EDIT_RANDOM;
+ else
+ e->type = PG_EDIT_FIXED;
+
+ return 1;
+}
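+
+/* Example (editorial): with unformat_pg_number as the value parser,
+ * "5" yields a fixed edit, "1-10" an edit incrementing from 1 to 10,
+ * and "1+10" an edit chosen at random in [1, 10]. */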
+
+uword
+unformat_pg_payload (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_main_t * pg = &pg_main;
+ vlib_main_t * vm = pg->vlib_main;
+ pg_edit_t * e;
+ u32 i, node_index, len, max_len;
+ u8 * v;
+
+ v = 0;
+
+ if (unformat (input, "incrementing %d", &len))
+ {
+ vec_resize (v, len);
+ for (i = 0; i < len; i++)
+ v[i] = i;
+ }
+ else if (unformat (input, "hex 0x%U", unformat_hex_string, &v))
+ ;
+
+ else if (unformat (input, "%U", unformat_vlib_node, vm, &node_index))
+ {
+ pg_node_t * pn = pg_get_node (node_index);
+ if (! pn->unformat_edit)
+ return 0;
+ return unformat (input, "%U", pn->unformat_edit, s);
+ }
+
+ else
+ return 0;
+
+ /* Length not including this payload. */
+ max_len = pg_edit_group_n_bytes (s, 0);
+ if (max_len + vec_len (v) >= s->max_packet_bytes)
+ {
+ if (s->max_packet_bytes >= max_len)
+ _vec_len (v) = s->max_packet_bytes - max_len;
+ else
+ _vec_len (v) = 0;
+ }
+
+ e = pg_create_edit_group (s, sizeof (e[0]), vec_len (v), 0);
+
+ e->type = PG_EDIT_FIXED;
+ e->n_bits = vec_len (v) * BITS (v[0]);
+
+ /* Least significant bit is at the end of the bitstream, since
+    everything is always big-endian. */
+ e->lsb_bit_offset = e->n_bits - BITS (v[0]);
+
+ e->values[PG_EDIT_LO] = v;
+
+ return 1;
+}
diff --git a/vnet/vnet/pg/edit.h b/vnet/vnet/pg/edit.h
new file mode 100644
index 00000000000..07c14a2a203
--- /dev/null
+++ b/vnet/vnet/pg/edit.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_edit.h: packet generator edits
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_packet_generator_pg_edit_h
+#define included_packet_generator_pg_edit_h
+
+#include <vppinfra/format.h>
+#include <vppinfra/vec.h>
+
+typedef enum {
+ /* Invalid type used to poison edits. */
+ PG_EDIT_INVALID_TYPE,
+
+ /* Value is fixed: does not change for all packets in sequence. */
+ PG_EDIT_FIXED,
+
+ /* Value v increments between low and high values v_low <= v <= v_high. */
+ PG_EDIT_INCREMENT,
+
+ /* Random value between low and high values v_low <= v <= v_high. */
+ PG_EDIT_RANDOM,
+
+ /* Unspecified value; will be specified by some edit function. */
+ PG_EDIT_UNSPECIFIED,
+} pg_edit_type_t;
+
+typedef struct {
+ pg_edit_type_t type;
+
+ /* Bit offset within packet where value is to be written.
+ Bits are written in network byte order: high bits first.
+ This is the bit offset of the least significant bit: i.e. the
+ highest numbered byte * 8 plus bit offset within that byte.
+ Negative offsets encode special edits. */
+ i32 lsb_bit_offset;
+
+ /* Special offset indicating this edit is for packet length. */
+#define PG_EDIT_PACKET_LENGTH (-1)
+
+ /* Number of bits in edit. */
+ u32 n_bits;
+
+ /* Low and high values for this edit. Network byte order. */
+ u8 * values[2];
+#define PG_EDIT_LO 0
+#define PG_EDIT_HI 1
+
+ /* Last value used for increment edit type. */
+ u64 last_increment_value;
+} pg_edit_t;
+
+always_inline void
+pg_edit_free (pg_edit_t * e)
+{
+ int i;
+ for (i = 0; i < ARRAY_LEN (e->values); i++)
+ vec_free (e->values[i]);
+}
+
+#define pg_edit_init_bitfield(e,type,field,field_offset,field_n_bits) \
+do { \
+ u32 _bo; \
+ \
+ ASSERT ((field_offset) < STRUCT_BITS_OF (type, field)); \
+ \
+ /* Start byte offset. */ \
+ _bo = STRUCT_OFFSET_OF (type, field); \
+ \
+ /* Adjust for big endian byte order. */ \
+ _bo += ((STRUCT_BITS_OF (type, field) \
+ - (field_offset) - 1) / BITS (u8)); \
+ \
+ (e)->lsb_bit_offset = _bo * BITS (u8) + ((field_offset) % BITS (u8)); \
+ (e)->n_bits = (field_n_bits); \
+} while (0)
+
+/* Initialize edit for byte aligned fields. */
+#define pg_edit_init(e,type,field) \
+ pg_edit_init_bitfield(e,type,field,0,STRUCT_BITS_OF(type,field))
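+
+/* Example (editorial): osi/pg.c in this patch initializes its single
+ * byte-aligned field with
+ *   pg_edit_init (&e->protocol, osi_header_t, protocol);
+ * which expands to pg_edit_init_bitfield (e, osi_header_t, protocol, 0, 8). */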
+
+static inline uword
+pg_edit_n_alloc_bytes (pg_edit_t * e)
+{
+ int i0, i1, n_bytes, n_bits_left;
+
+ i0 = e->lsb_bit_offset;
+ i1 = i0 % BITS (u8);
+
+ n_bytes = 0;
+ n_bits_left = e->n_bits;
+
+ if (n_bits_left > 0 && i1 != 0)
+ {
+ n_bytes++;
+ n_bits_left -= i1;
+ if (n_bits_left < 0)
+ n_bits_left = 0;
+ }
+
+ n_bytes += (n_bits_left / BITS (u8));
+ n_bytes += (n_bits_left % BITS (u8)) != 0;
+
+ return n_bytes;
+}
+
+static inline void
+pg_edit_alloc_value (pg_edit_t * e, int i)
+{ vec_validate (e->values[i], e->lsb_bit_offset / BITS (u8)); }
+
+extern void pg_edit_set_value (pg_edit_t * e, int hi_or_lo, u64 value);
+
+static inline void
+pg_edit_set_fixed (pg_edit_t * e, u64 value)
+{
+ e->type = PG_EDIT_FIXED;
+ pg_edit_set_value (e, PG_EDIT_LO, value);
+}
+
+static inline void
+pg_edit_copy_type_and_values (pg_edit_t * dst, pg_edit_t * src)
+{
+ int i;
+ dst->type = src->type;
+ src->type = PG_EDIT_INVALID_TYPE;
+ for (i = 0; i < ARRAY_LEN (dst->values); i++)
+ {
+ dst->values[i] = src->values[i];
+ src->values[i] = 0;
+ }
+}
+
+static inline u64
+pg_edit_get_value (pg_edit_t * e, int hi_or_lo)
+{
+ u64 r = 0;
+ int i, n;
+ u8 * v = e->values[hi_or_lo];
+
+ n = round_pow2 (e->n_bits, BITS (u8)) / BITS (u8);
+
+ ASSERT (n <= vec_len (v));
+ ASSERT (n <= sizeof (r));
+
+ for (i = 0; i < n; i++)
+ r = (r << BITS (v[i])) + v[i];
+
+ return r;
+}
+
+static inline uword
+pg_edit_is_fixed_with_value (pg_edit_t * e, u64 value)
+{
+ return (e->type == PG_EDIT_FIXED
+ && value == pg_edit_get_value (e, PG_EDIT_LO));
+}
+
+uword unformat_pg_edit (unformat_input_t * input, va_list * args);
+uword unformat_pg_payload (unformat_input_t * input, va_list * args);
+uword unformat_pg_number (unformat_input_t * input, va_list * args);
+uword unformat_pg_interface (unformat_input_t * input, va_list * args);
+
+#endif /* included_packet_generator_pg_edit_h */
diff --git a/vnet/vnet/pg/example.script b/vnet/vnet/pg/example.script
new file mode 100644
index 00000000000..0e29b9ecae6
--- /dev/null
+++ b/vnet/vnet/pg/example.script
@@ -0,0 +1,6 @@
+packet-generator new {
+ name x
+ limit 1
+ node ethernet-input
+ data { IP: 1.2.3 -> 4.5.6 incrementing 100 }
+}
diff --git a/vnet/vnet/pg/init.c b/vnet/vnet/pg/init.c
new file mode 100644
index 00000000000..f598050c28d
--- /dev/null
+++ b/vnet/vnet/pg/init.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_init.c: VLIB packet generator
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+
+/* Global main structure. */
+pg_main_t pg_main;
+
+static clib_error_t * pg_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+ pg_main_t * pg = &pg_main;
+ int i, j;
+
+ pg->vlib_main = vm;
+
+ if ((error = vlib_call_init_function (vm, vnet_main_init)))
+ goto done;
+
+ if ((error = vlib_call_init_function (vm, pg_cli_init)))
+ goto done;
+
+ /* Create/free interfaces so that they exist and can be
+ used as a destination interface for streams. Also, create
+ a fixed number of pg interfaces so that interface numbering can
+ be made to be deterministic (at least if <= 4 streams are ever used). */
+ for (i = 0; i < 4; i++)
+ {
+ j = pg_interface_find_free (pg, i);
+ ASSERT (j == i);
+ }
+
+ /* Free interfaces. */
+ for (i = j; i >= 0; i--)
+ vec_add1 (pg->free_interfaces, i);
+
+ done:
+ return error;
+}
+
+VLIB_INIT_FUNCTION (pg_init);
diff --git a/vnet/vnet/pg/input.c b/vnet/vnet/pg/input.c
new file mode 100644
index 00000000000..4ec61ca7b96
--- /dev/null
+++ b/vnet/vnet/pg/input.c
@@ -0,0 +1,1745 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_input.c: buffer generator input
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/vnet.h>
+
+#if DPDK==1
+#include <vnet/devices/dpdk/dpdk.h>
+#endif
+
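+/* Keep per-packet rte_mbuf length/offset metadata in sync with the vlib
+   buffer after the generator rewrites packet contents (DPDK builds only). */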
+static inline void
+pg_set_mbuf_metadata (pg_main_t * pg, u32 * buffers, u32 n_alloc)
+{
+#if DPDK == 1
+ vlib_main_t * vm = vlib_get_main();
+ vlib_buffer_t * b;
+ struct rte_mbuf * mb;
+ i16 delta;
+ u16 new_data_len;
+ u16 new_pkt_len;
+
+ int i;
+
+ for (i = 0; i < n_alloc; i++)
+ {
+ b = vlib_get_buffer (vm, buffers[i]);
+ mb = ((struct rte_mbuf *)b) - 1;
+
+ delta = vlib_buffer_length_in_chain (vm, b) - (i16) mb->pkt_len;
+ new_data_len = (u16)((i16) mb->data_len + delta);
+ new_pkt_len = (u16)((i16) mb->pkt_len + delta);
+
+ mb->data_len = new_data_len;
+ mb->pkt_len = new_pkt_len;
+ mb->data_off = (u16)((RTE_PKTMBUF_HEADROOM) + b->current_data);
+ }
+#endif
+}
+
+static int
+validate_buffer_data2 (vlib_buffer_t * b, pg_stream_t * s,
+ u32 data_offset, u32 n_bytes)
+{
+ u8 * bd, * pd, * pm;
+ u32 i;
+
+ bd = b->data;
+ pd = s->fixed_packet_data + data_offset;
+ pm = s->fixed_packet_data_mask + data_offset;
+
+ if (pd + n_bytes >= vec_end (s->fixed_packet_data))
+ n_bytes = (pd < vec_end (s->fixed_packet_data)
+ ? vec_end (s->fixed_packet_data) - pd
+ : 0);
+
+ for (i = 0; i < n_bytes; i++)
+ if ((bd[i] & pm[i]) != pd[i])
+ break;
+
+ if (i >= n_bytes)
+ return 1;
+
+ clib_warning ("buffer %U", format_vlib_buffer, b);
+ clib_warning ("differ at index %d", i);
+ clib_warning ("is %U", format_hex_bytes, bd, n_bytes);
+ clib_warning ("mask %U", format_hex_bytes, pm, n_bytes);
+ clib_warning ("expect %U", format_hex_bytes, pd, n_bytes);
+ return 0;
+}
+
+static int
+validate_buffer_data (vlib_buffer_t * b, pg_stream_t * s)
+{ return validate_buffer_data2 (b, s, 0, s->buffer_bytes); }
+
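+/*
+ * Unaligned store helpers used by all of the do_set_* loops below:
+ * write an n_bits wide value, optionally converted to network byte
+ * order first.  set_2 stores a pair per call; its is_increment
+ * argument widens the upper-bound assert by one, since an
+ * incrementing pair may transiently hold v_max + 1 before wrapping.
+ */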
+always_inline void
+set_1 (void * a0,
+ u64 v0,
+ u64 v_min, u64 v_max,
+ u32 n_bits,
+ u32 is_net_byte_order)
+{
+ ASSERT (v0 >= v_min && v0 <= v_max);
+ if (n_bits == BITS (u8))
+ {
+ ((u8 *) a0)[0] = v0;
+ }
+ else if (n_bits == BITS (u16))
+ {
+ if (is_net_byte_order)
+ v0 = clib_host_to_net_u16 (v0);
+ clib_mem_unaligned (a0, u16) = v0;
+ }
+ else if (n_bits == BITS (u32))
+ {
+ if (is_net_byte_order)
+ v0 = clib_host_to_net_u32 (v0);
+ clib_mem_unaligned (a0, u32) = v0;
+ }
+ else if (n_bits == BITS (u64))
+ {
+ if (is_net_byte_order)
+ v0 = clib_host_to_net_u64 (v0);
+ clib_mem_unaligned (a0, u64) = v0;
+ }
+}
+
+always_inline void
+set_2 (void * a0, void * a1,
+ u64 v0, u64 v1,
+ u64 v_min, u64 v_max,
+ u32 n_bits,
+ u32 is_net_byte_order,
+ u32 is_increment)
+{
+ ASSERT (v0 >= v_min && v0 <= v_max);
+ ASSERT (v1 >= v_min && v1 <= (v_max + is_increment));
+ if (n_bits == BITS (u8))
+ {
+ ((u8 *) a0)[0] = v0;
+ ((u8 *) a1)[0] = v1;
+ }
+ else if (n_bits == BITS (u16))
+ {
+ if (is_net_byte_order)
+ {
+ v0 = clib_host_to_net_u16 (v0);
+ v1 = clib_host_to_net_u16 (v1);
+ }
+ clib_mem_unaligned (a0, u16) = v0;
+ clib_mem_unaligned (a1, u16) = v1;
+ }
+ else if (n_bits == BITS (u32))
+ {
+ if (is_net_byte_order)
+ {
+ v0 = clib_host_to_net_u32 (v0);
+ v1 = clib_host_to_net_u32 (v1);
+ }
+ clib_mem_unaligned (a0, u32) = v0;
+ clib_mem_unaligned (a1, u32) = v1;
+ }
+ else if (n_bits == BITS (u64))
+ {
+ if (is_net_byte_order)
+ {
+ v0 = clib_host_to_net_u64 (v0);
+ v1 = clib_host_to_net_u64 (v1);
+ }
+ clib_mem_unaligned (a0, u64) = v0;
+ clib_mem_unaligned (a1, u64) = v1;
+ }
+}
+
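+/*
+ * Note the unrolling pattern used by this and the other do_* loops
+ * below: the loop requires 4 buffers to enter but consumes only 2 per
+ * iteration; b2 and b3 are looked up solely so the next iteration's
+ * edit addresses can be prefetched for write.
+ */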
+static_always_inline void
+do_set_fixed (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 n_bits,
+ u32 byte_offset,
+ u32 is_net_byte_order,
+ u64 v_min, u64 v_max)
+
+{
+ vlib_main_t * vm = pg->vlib_main;
+
+ while (n_buffers >= 4)
+ {
+ vlib_buffer_t * b0, * b1, * b2, * b3;
+ void * a0, * a1;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ b2 = vlib_get_buffer (vm, buffers[2]);
+ b3 = vlib_get_buffer (vm, buffers[3]);
+ buffers += 2;
+ n_buffers -= 2;
+
+ a0 = (void *) b0 + byte_offset;
+ a1 = (void *) b1 + byte_offset;
+ CLIB_PREFETCH ((void *) b2 + byte_offset, sizeof (v_min), WRITE);
+ CLIB_PREFETCH ((void *) b3 + byte_offset, sizeof (v_min), WRITE);
+
+ set_2 (a0, a1, v_min, v_min,
+ v_min, v_max,
+ n_bits, is_net_byte_order,
+ /* is_increment */ 0);
+
+ ASSERT (validate_buffer_data (b0, s));
+ ASSERT (validate_buffer_data (b1, s));
+ }
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b0;
+ void * a0;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers += 1;
+ n_buffers -= 1;
+
+ a0 = (void *) b0 + byte_offset;
+
+ set_1 (a0, v_min,
+ v_min, v_max,
+ n_bits, is_net_byte_order);
+
+ ASSERT (validate_buffer_data (b0, s));
+ }
+}
+
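+/*
+ * Write consecutive values v, v+1, ... into the buffers, wrapping
+ * back to v_min once v_max is passed.  The unlikely branch redoes a
+ * pair one store at a time when the wrap point lands inside it.  When
+ * want_sum is set, the sum of all values written is returned through
+ * sum_result (used to drive interface packet/byte counters).
+ */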
+static_always_inline u64
+do_set_increment (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 n_bits,
+ u32 byte_offset,
+ u32 is_net_byte_order,
+ u32 want_sum,
+ u64 * sum_result,
+ u64 v_min, u64 v_max,
+ u64 v)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u64 sum = 0;
+
+ ASSERT (v >= v_min && v <= v_max);
+
+ while (n_buffers >= 4)
+ {
+ vlib_buffer_t * b0, * b1, * b2, * b3;
+ void * a0, * a1;
+ u64 v_old;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ b2 = vlib_get_buffer (vm, buffers[2]);
+ b3 = vlib_get_buffer (vm, buffers[3]);
+ buffers += 2;
+ n_buffers -= 2;
+
+ a0 = (void *) b0 + byte_offset;
+ a1 = (void *) b1 + byte_offset;
+ CLIB_PREFETCH ((void *) b2 + byte_offset, sizeof (v_min), WRITE);
+ CLIB_PREFETCH ((void *) b3 + byte_offset, sizeof (v_min), WRITE);
+
+ v_old = v;
+ v = v_old + 2;
+ v = v > v_max ? v_min : v;
+ set_2 (a0, a1,
+ v_old + 0, v_old + 1,
+ v_min, v_max,
+ n_bits, is_net_byte_order,
+ /* is_increment */ 1);
+
+ if (want_sum)
+ sum += 2*v_old + 1;
+
+ if (PREDICT_FALSE (v_old + 1 > v_max))
+ {
+ if (want_sum)
+ sum -= 2*v_old + 1;
+
+ v = v_old;
+ set_1 (a0, v + 0, v_min, v_max, n_bits, is_net_byte_order);
+ if (want_sum)
+ sum += v;
+ v += 1;
+
+ v = v > v_max ? v_min : v;
+ set_1 (a1, v + 0, v_min, v_max, n_bits, is_net_byte_order);
+ if (want_sum)
+ sum += v;
+ v += 1;
+ }
+
+ ASSERT (validate_buffer_data (b0, s));
+ ASSERT (validate_buffer_data (b1, s));
+ }
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b0;
+ void * a0;
+ u64 v_old;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers += 1;
+ n_buffers -= 1;
+
+ a0 = (void *) b0 + byte_offset;
+
+ v_old = v;
+ if (want_sum)
+ sum += v_old;
+ v += 1;
+ v = v > v_max ? v_min : v;
+
+ ASSERT (v_old >= v_min && v_old <= v_max);
+ set_1 (a0, v_old, v_min, v_max, n_bits, is_net_byte_order);
+
+ ASSERT (validate_buffer_data (b0, s));
+ }
+
+ if (want_sum)
+ *sum_result = sum;
+
+ return v;
+}
+
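+/*
+ * Random values are drawn from the vlib random buffer, masked down to
+ * the power-of-2 size covering the range, then folded into
+ * [v_min, v_max] by conditional subtraction of v_diff.  Since the
+ * masked value is less than 2 * v_diff, subtracting at most twice
+ * suffices; v0/v1 carry across iterations, so the fold amounts to
+ * addition modulo v_diff.
+ */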
+static_always_inline void
+do_set_random (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 n_bits,
+ u32 byte_offset,
+ u32 is_net_byte_order,
+ u32 want_sum,
+ u64 * sum_result,
+ u64 v_min, u64 v_max)
+
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u64 v_diff = v_max - v_min + 1;
+ u64 r_mask = max_pow2 (v_diff) - 1;
+ u64 v0, v1;
+ u64 sum = 0;
+ void * random_data;
+
+ random_data = clib_random_buffer_get_data
+ (&vm->random_buffer, n_buffers * n_bits / BITS (u8));
+
+ v0 = v1 = v_min;
+
+ while (n_buffers >= 4)
+ {
+ vlib_buffer_t * b0, * b1, * b2, * b3;
+ void * a0, * a1;
+ u64 r0=0, r1=0; /* warnings be gone */
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ b2 = vlib_get_buffer (vm, buffers[2]);
+ b3 = vlib_get_buffer (vm, buffers[3]);
+ buffers += 2;
+ n_buffers -= 2;
+
+ a0 = (void *) b0 + byte_offset;
+ a1 = (void *) b1 + byte_offset;
+ CLIB_PREFETCH ((void *) b2 + byte_offset, sizeof (v_min), WRITE);
+ CLIB_PREFETCH ((void *) b3 + byte_offset, sizeof (v_min), WRITE);
+
+ switch (n_bits)
+ {
+#define _(n) \
+ case BITS (u##n): \
+ { \
+ u##n * r = random_data; \
+ r0 = r[0]; \
+ r1 = r[1]; \
+ random_data = r + 2; \
+ } \
+ break;
+
+ _ (8);
+ _ (16);
+ _ (32);
+ _ (64);
+
+#undef _
+ }
+
+      /* Add a power-of-2 sized random number, which may be out of range. */
+ v0 += r0 & r_mask;
+ v1 += r1 & r_mask;
+
+      /* Subtracting twice is enough to fold into the v_min .. v_max range. */
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+ v1 = v1 > v_max ? v1 - v_diff : v1;
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+ v1 = v1 > v_max ? v1 - v_diff : v1;
+
+ if (want_sum)
+ sum += v0 + v1;
+
+ set_2 (a0, a1,
+ v0, v1,
+ v_min, v_max,
+ n_bits, is_net_byte_order,
+ /* is_increment */ 0);
+
+ ASSERT (validate_buffer_data (b0, s));
+ ASSERT (validate_buffer_data (b1, s));
+ }
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b0;
+ void * a0;
+ u64 r0 = 0; /* warnings be gone */
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers += 1;
+ n_buffers -= 1;
+
+ a0 = (void *) b0 + byte_offset;
+
+ switch (n_bits)
+ {
+#define _(n) \
+ case BITS (u##n): \
+ { \
+ u##n * r = random_data; \
+ r0 = r[0]; \
+ random_data = r + 1; \
+ } \
+ break;
+
+ _ (8);
+ _ (16);
+ _ (32);
+ _ (64);
+
+#undef _
+ }
+
+      /* Add a power-of-2 sized random number, which may be out of range. */
+ v0 += r0 & r_mask;
+
+      /* Subtracting twice is enough to fold into the v_min .. v_max range. */
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+
+ if (want_sum)
+ sum += v0;
+
+ set_1 (a0, v0, v_min, v_max, n_bits, is_net_byte_order);
+
+ ASSERT (validate_buffer_data (b0, s));
+ }
+
+ if (want_sum)
+ *sum_result = sum;
+}
+
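+/*
+ * Read-modify-write helper for edits narrower than their containing
+ * field: load in host byte order, clear the bits covered by mask, OR
+ * in the new value shifted into position, and store back in network
+ * byte order.
+ */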
+#define _(i,t) \
+ clib_mem_unaligned (a##i, t) = \
+ clib_host_to_net_##t ((clib_net_to_host_mem_##t (a##i) &~ mask) \
+ | (v##i << shift))
+
+always_inline void
+setbits_1 (void * a0,
+ u64 v0,
+ u64 v_min, u64 v_max,
+ u32 max_bits,
+ u32 n_bits,
+ u64 mask,
+ u32 shift)
+{
+ ASSERT (v0 >= v_min && v0 <= v_max);
+ if (max_bits == BITS (u8))
+ ((u8 *) a0)[0] = (((u8 *) a0)[0] &~ mask) | (v0 << shift);
+
+ else if (max_bits == BITS (u16))
+ {
+ _ (0, u16);
+ }
+ else if (max_bits == BITS (u32))
+ {
+ _ (0, u32);
+ }
+ else if (max_bits == BITS (u64))
+ {
+ _ (0, u64);
+ }
+}
+
+always_inline void
+setbits_2 (void * a0, void * a1,
+ u64 v0, u64 v1,
+ u64 v_min, u64 v_max,
+ u32 max_bits,
+ u32 n_bits,
+ u64 mask,
+ u32 shift,
+ u32 is_increment)
+{
+ ASSERT (v0 >= v_min && v0 <= v_max);
+ ASSERT (v1 >= v_min && v1 <= v_max + is_increment);
+ if (max_bits == BITS (u8))
+ {
+ ((u8 *) a0)[0] = (((u8 *) a0)[0] &~ mask) | (v0 << shift);
+ ((u8 *) a1)[0] = (((u8 *) a1)[0] &~ mask) | (v1 << shift);
+ }
+
+ else if (max_bits == BITS (u16))
+ {
+ _ (0, u16);
+ _ (1, u16);
+ }
+ else if (max_bits == BITS (u32))
+ {
+ _ (0, u32);
+ _ (1, u32);
+ }
+ else if (max_bits == BITS (u64))
+ {
+ _ (0, u64);
+ _ (1, u64);
+ }
+}
+
+#undef _
+
+static_always_inline void
+do_setbits_fixed (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 max_bits,
+ u32 n_bits,
+ u32 byte_offset,
+ u64 v_min, u64 v_max,
+ u64 mask,
+ u32 shift)
+
+{
+ vlib_main_t * vm = pg->vlib_main;
+
+ while (n_buffers >= 4)
+ {
+ vlib_buffer_t * b0, * b1, * b2, * b3;
+ void * a0, * a1;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ b2 = vlib_get_buffer (vm, buffers[2]);
+ b3 = vlib_get_buffer (vm, buffers[3]);
+ buffers += 2;
+ n_buffers -= 2;
+
+ a0 = (void *) b0 + byte_offset;
+ a1 = (void *) b1 + byte_offset;
+ CLIB_PREFETCH ((void *) b2 + byte_offset, sizeof (v_min), WRITE);
+ CLIB_PREFETCH ((void *) b3 + byte_offset, sizeof (v_min), WRITE);
+
+ setbits_2 (a0, a1,
+ v_min, v_min,
+ v_min, v_max,
+ max_bits, n_bits, mask, shift,
+ /* is_increment */ 0);
+
+ ASSERT (validate_buffer_data (b0, s));
+ ASSERT (validate_buffer_data (b1, s));
+ }
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b0;
+ void * a0;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers += 1;
+ n_buffers -= 1;
+
+ a0 = (void *) b0 + byte_offset;
+
+ setbits_1 (a0, v_min, v_min, v_max, max_bits, n_bits, mask, shift);
+ ASSERT (validate_buffer_data (b0, s));
+ }
+}
+
+static_always_inline u64
+do_setbits_increment (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 max_bits,
+ u32 n_bits,
+ u32 byte_offset,
+ u64 v_min, u64 v_max,
+ u64 v,
+ u64 mask,
+ u32 shift)
+{
+ vlib_main_t * vm = pg->vlib_main;
+
+ ASSERT (v >= v_min && v <= v_max);
+
+ while (n_buffers >= 4)
+ {
+ vlib_buffer_t * b0, * b1, * b2, * b3;
+ void * a0, * a1;
+ u64 v_old;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ b2 = vlib_get_buffer (vm, buffers[2]);
+ b3 = vlib_get_buffer (vm, buffers[3]);
+ buffers += 2;
+ n_buffers -= 2;
+
+ a0 = (void *) b0 + byte_offset;
+ a1 = (void *) b1 + byte_offset;
+ CLIB_PREFETCH ((void *) b2 + byte_offset, sizeof (v_min), WRITE);
+ CLIB_PREFETCH ((void *) b3 + byte_offset, sizeof (v_min), WRITE);
+
+ v_old = v;
+ v = v_old + 2;
+ v = v > v_max ? v_min : v;
+ setbits_2 (a0, a1,
+ v_old + 0, v_old + 1,
+ v_min, v_max,
+ max_bits, n_bits, mask, shift,
+ /* is_increment */ 1);
+
+ if (PREDICT_FALSE (v_old + 1 > v_max))
+ {
+ v = v_old;
+ setbits_1 (a0, v + 0, v_min, v_max, max_bits, n_bits, mask, shift);
+ v += 1;
+
+ v = v > v_max ? v_min : v;
+ setbits_1 (a1, v + 0, v_min, v_max, max_bits, n_bits, mask, shift);
+ v += 1;
+ }
+ ASSERT (validate_buffer_data (b0, s));
+ ASSERT (validate_buffer_data (b1, s));
+ }
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b0;
+ void * a0;
+ u64 v_old;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers += 1;
+ n_buffers -= 1;
+
+ a0 = (void *) b0 + byte_offset;
+
+ v_old = v;
+ v = v_old + 1;
+ v = v > v_max ? v_min : v;
+
+ ASSERT (v_old >= v_min && v_old <= v_max);
+ setbits_1 (a0, v_old, v_min, v_max, max_bits, n_bits, mask, shift);
+
+ ASSERT (validate_buffer_data (b0, s));
+ }
+
+ return v;
+}
+
+static_always_inline void
+do_setbits_random (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 max_bits,
+ u32 n_bits,
+ u32 byte_offset,
+ u64 v_min, u64 v_max,
+ u64 mask,
+ u32 shift)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u64 v_diff = v_max - v_min + 1;
+ u64 r_mask = max_pow2 (v_diff) - 1;
+ u64 v0, v1;
+ void * random_data;
+
+ random_data = clib_random_buffer_get_data
+ (&vm->random_buffer, n_buffers * max_bits / BITS (u8));
+ v0 = v1 = v_min;
+
+ while (n_buffers >= 4)
+ {
+ vlib_buffer_t * b0, * b1, * b2, * b3;
+ void * a0, * a1;
+ u64 r0=0, r1=0; /* warnings be gone */
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ b2 = vlib_get_buffer (vm, buffers[2]);
+ b3 = vlib_get_buffer (vm, buffers[3]);
+ buffers += 2;
+ n_buffers -= 2;
+
+ a0 = (void *) b0 + byte_offset;
+ a1 = (void *) b1 + byte_offset;
+ CLIB_PREFETCH ((void *) b2 + byte_offset, sizeof (v_min), WRITE);
+ CLIB_PREFETCH ((void *) b3 + byte_offset, sizeof (v_min), WRITE);
+
+ switch (max_bits)
+ {
+#define _(n) \
+ case BITS (u##n): \
+ { \
+ u##n * r = random_data; \
+ r0 = r[0]; \
+ r1 = r[1]; \
+ random_data = r + 2; \
+ } \
+ break;
+
+ _ (8);
+ _ (16);
+ _ (32);
+ _ (64);
+
+#undef _
+ }
+
+      /* Add a power-of-2 sized random number, which may be out of range. */
+ v0 += r0 & r_mask;
+ v1 += r1 & r_mask;
+
+      /* Subtracting twice is enough to fold into the v_min .. v_max range. */
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+ v1 = v1 > v_max ? v1 - v_diff : v1;
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+ v1 = v1 > v_max ? v1 - v_diff : v1;
+
+ setbits_2 (a0, a1,
+ v0, v1,
+ v_min, v_max,
+ max_bits, n_bits, mask, shift,
+ /* is_increment */ 0);
+
+ ASSERT (validate_buffer_data (b0, s));
+ ASSERT (validate_buffer_data (b1, s));
+ }
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b0;
+ void * a0;
+ u64 r0 = 0; /* warnings be gone */
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ buffers += 1;
+ n_buffers -= 1;
+
+ a0 = (void *) b0 + byte_offset;
+
+ switch (max_bits)
+ {
+#define _(n) \
+ case BITS (u##n): \
+ { \
+ u##n * r = random_data; \
+ r0 = r[0]; \
+ random_data = r + 1; \
+ } \
+ break;
+
+ _ (8);
+ _ (16);
+ _ (32);
+ _ (64);
+
+#undef _
+ }
+
+      /* Add a power-of-2 sized random number, which may be out of range. */
+ v0 += r0 & r_mask;
+
+      /* Subtracting twice is enough to fold into the v_min .. v_max range. */
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+ v0 = v0 > v_max ? v0 - v_diff : v0;
+
+ setbits_1 (a0, v0, v_min, v_max, max_bits, n_bits, mask, shift);
+
+ ASSERT (validate_buffer_data (b0, s));
+ }
+}
+
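+/*
+ * Apply one non-fixed edit to a vector of buffers.  Edits that start
+ * and end on byte boundaries take the plain set_* fast path; anything
+ * else computes a mask and shift and goes through the setbits_*
+ * read-modify-write path, with the container width rounded up to the
+ * smallest power-of-2 number of bits (minimum 8).
+ */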
+static u64 do_it (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 lo_bit, u32 hi_bit,
+ u64 v_min, u64 v_max,
+ u64 v,
+ pg_edit_type_t edit_type)
+{
+ u32 max_bits, l0, l1, h1, start_bit;
+
+ if (v_min == v_max)
+ edit_type = PG_EDIT_FIXED;
+
+ l0 = lo_bit / BITS (u8);
+ l1 = lo_bit % BITS (u8);
+ h1 = hi_bit % BITS (u8);
+
+ start_bit = l0 * BITS (u8);
+
+ max_bits = hi_bit - start_bit;
+ ASSERT (max_bits <= 64);
+
+#define _(n) \
+ case (n): \
+ if (edit_type == PG_EDIT_INCREMENT) \
+ v = do_set_increment (pg, s, buffers, n_buffers, \
+ BITS (u##n), \
+ l0, \
+ /* is_net_byte_order */ 1, \
+ /* want sum */ 0, 0, \
+ v_min, v_max, \
+ v); \
+ else if (edit_type == PG_EDIT_RANDOM) \
+ do_set_random (pg, s, buffers, n_buffers, \
+ BITS (u##n), \
+ l0, \
+ /* is_net_byte_order */ 1, \
+ /* want sum */ 0, 0, \
+ v_min, v_max); \
+ else /* edit_type == PG_EDIT_FIXED */ \
+ do_set_fixed (pg, s, buffers, n_buffers, \
+ BITS (u##n), \
+ l0, \
+ /* is_net_byte_order */ 1, \
+ v_min, v_max); \
+ goto done;
+
+ if (l1 == 0 && h1 == 0)
+ {
+ switch (max_bits)
+ {
+ _ (8);
+ _ (16);
+ _ (32);
+ _ (64);
+ }
+ }
+
+#undef _
+
+ {
+ u64 mask;
+ u32 shift = l1;
+ u32 n_bits = max_bits;
+
+ max_bits = clib_max (max_pow2 (n_bits), 8);
+
+ mask = ((u64) 1 << (u64) n_bits) - 1;
+ mask &= ~(((u64) 1 << (u64) shift) - 1);
+
+ mask <<= max_bits - n_bits;
+ shift += max_bits - n_bits;
+
+ switch (max_bits)
+ {
+#define _(n) \
+ case (n): \
+ if (edit_type == PG_EDIT_INCREMENT) \
+ v = do_setbits_increment (pg, s, buffers, n_buffers, \
+ BITS (u##n), n_bits, \
+ l0, v_min, v_max, v, \
+ mask, shift); \
+ else if (edit_type == PG_EDIT_RANDOM) \
+ do_setbits_random (pg, s, buffers, n_buffers, \
+ BITS (u##n), n_bits, \
+ l0, v_min, v_max, \
+ mask, shift); \
+ else /* edit_type == PG_EDIT_FIXED */ \
+ do_setbits_fixed (pg, s, buffers, n_buffers, \
+ BITS (u##n), n_bits, \
+ l0, v_min, v_max, \
+ mask, shift); \
+ goto done;
+
+ _ (8);
+ _ (16);
+ _ (32);
+ _ (64);
+
+#undef _
+ }
+ }
+
+ done:
+ return v;
+}
+
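+/*
+ * Packet sizes are generated with the same machinery as any other
+ * edit, but target the vlib_buffer_t current_length field directly
+ * (host byte order).  The summed lengths feed the interface RX
+ * combined counter below; for fixed-size streams the sum is simply
+ * v_min * n_buffers.
+ */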
+static void
+pg_generate_set_lengths (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers)
+{
+ u64 v_min, v_max, length_sum;
+ pg_edit_type_t edit_type;
+
+ v_min = s->min_packet_bytes;
+ v_max = s->max_packet_bytes;
+ edit_type = s->packet_size_edit_type;
+
+ if (edit_type == PG_EDIT_INCREMENT)
+ s->last_increment_packet_size
+ = do_set_increment (pg, s, buffers, n_buffers,
+ 8 * STRUCT_SIZE_OF (vlib_buffer_t, current_length),
+ STRUCT_OFFSET_OF (vlib_buffer_t, current_length),
+ /* is_net_byte_order */ 0,
+ /* want sum */ 1, &length_sum,
+ v_min, v_max,
+ s->last_increment_packet_size);
+
+ else if (edit_type == PG_EDIT_RANDOM)
+ do_set_random (pg, s, buffers, n_buffers,
+ 8 * STRUCT_SIZE_OF (vlib_buffer_t, current_length),
+ STRUCT_OFFSET_OF (vlib_buffer_t, current_length),
+ /* is_net_byte_order */ 0,
+ /* want sum */ 1, &length_sum,
+ v_min, v_max);
+
+ else /* edit_type == PG_EDIT_FIXED */
+ {
+ do_set_fixed (pg, s, buffers, n_buffers,
+ 8 * STRUCT_SIZE_OF (vlib_buffer_t, current_length),
+ STRUCT_OFFSET_OF (vlib_buffer_t, current_length),
+ /* is_net_byte_order */ 0,
+ v_min, v_max);
+ length_sum = v_min * n_buffers;
+ }
+
+ {
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * si = vnet_get_sw_interface (vnm, s->sw_if_index[VLIB_RX]);
+
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ si->sw_if_index,
+ n_buffers,
+ length_sum);
+ }
+
+ pg_set_mbuf_metadata (pg, buffers, n_buffers);
+}
+
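+/*
+ * For chained packets, current_length initially holds the whole
+ * packet length.  Walk the chain handing out at most s->buffer_bytes
+ * per buffer, maintain VLIB_BUFFER_NEXT_PRESENT, and free any
+ * trailing buffers that end up with no bytes at all.
+ */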
+static void
+pg_generate_fix_multi_buffer_lengths (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ pg_buffer_index_t * pbi;
+ uword n_bytes_left;
+ static u32 * unused_buffers = 0;
+
+ while (n_buffers > 0)
+ {
+ vlib_buffer_t * b;
+ u32 bi;
+
+ bi = buffers[0];
+ b = vlib_get_buffer (vm, bi);
+
+      /* Current length here is the length of the whole packet. */
+ n_bytes_left = b->current_length;
+
+ pbi = s->buffer_indices;
+ while (1)
+ {
+ uword n = clib_min (n_bytes_left, s->buffer_bytes);
+
+ b->current_length = n;
+ n_bytes_left -= n;
+ if (n_bytes_left > 0)
+ b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ else
+ b->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+
+	  /* Collect buffers that received no bytes; they are freed below. */
+ if (n == 0)
+ vec_add1 (unused_buffers, bi);
+
+ pbi++;
+ if (pbi >= vec_end (s->buffer_indices))
+ break;
+
+ bi = b->next_buffer;
+ b = vlib_get_buffer (vm, bi);
+ }
+ ASSERT (n_bytes_left == 0);
+
+ buffers += 1;
+ n_buffers -= 1;
+ }
+
+ if (vec_len (unused_buffers) > 0)
+ {
+ vlib_buffer_free_no_next (vm, unused_buffers,
+ vec_len (unused_buffers));
+ _vec_len (unused_buffers) = 0;
+ }
+}
+
+static void
+pg_generate_edit (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers)
+{
+ pg_edit_t * e;
+
+ vec_foreach (e, s->non_fixed_edits)
+ {
+ switch (e->type)
+ {
+ case PG_EDIT_RANDOM:
+ case PG_EDIT_INCREMENT:
+ {
+ u32 lo_bit, hi_bit;
+ u64 v_min, v_max;
+
+ v_min = pg_edit_get_value (e, PG_EDIT_LO);
+ v_max = pg_edit_get_value (e, PG_EDIT_HI);
+
+ hi_bit = (BITS (u8) * STRUCT_OFFSET_OF (vlib_buffer_t, data)
+ + BITS (u8)
+ + e->lsb_bit_offset);
+ lo_bit = hi_bit - e->n_bits;
+
+ e->last_increment_value
+ = do_it (pg, s, buffers, n_buffers, lo_bit, hi_bit, v_min, v_max,
+ e->last_increment_value,
+ e->type);
+ }
+ break;
+
+ case PG_EDIT_UNSPECIFIED:
+ break;
+
+ default:
+ /* Should not be any fixed edits left. */
+ ASSERT (0);
+ break;
+ }
+ }
+
+  /* Call any edit functions, e.g. to complete IP lengths, checksums, ... */
+ {
+ int i;
+ for (i = vec_len (s->edit_groups) - 1; i >= 0; i--)
+ {
+ pg_edit_group_t * g = s->edit_groups + i;
+ if (g->edit_function)
+ g->edit_function (pg, s, g, buffers, n_buffers);
+ }
+ }
+}
+
+static void
+pg_set_next_buffer_pointers (pg_main_t * pg,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 * next_buffers,
+ u32 n_buffers)
+{
+ vlib_main_t * vm = pg->vlib_main;
+
+ while (n_buffers >= 4)
+ {
+ u32 ni0, ni1;
+ vlib_buffer_t * b0, * b1;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ b1 = vlib_get_buffer (vm, buffers[1]);
+ ni0 = next_buffers[0];
+ ni1 = next_buffers[1];
+
+ vlib_prefetch_buffer_with_index (vm, buffers[2], WRITE);
+ vlib_prefetch_buffer_with_index (vm, buffers[3], WRITE);
+
+ b0->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b1->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b0->next_buffer = ni0;
+ b1->next_buffer = ni1;
+
+ buffers += 2;
+ next_buffers += 2;
+ n_buffers -= 2;
+ }
+
+ while (n_buffers > 0)
+ {
+ u32 ni0;
+ vlib_buffer_t * b0;
+
+ b0 = vlib_get_buffer (vm, buffers[0]);
+ ni0 = next_buffers[0];
+ buffers += 1;
+ next_buffers += 1;
+ n_buffers -= 1;
+
+ b0->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b0->next_buffer = ni0;
+ }
+}
+
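+/*
+ * Replay mode: copy saved packet templates round-robin into the
+ * buffers.  The RX interface is stamped from the stream; TX is left
+ * unspecified (~0).  Copies are clamped so a short template's tail is
+ * not overrun.
+ */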
+static_always_inline void
+init_replay_buffers_inline (vlib_main_t * vm,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 data_offset,
+ u32 n_data)
+{
+ u32 n_left, * b, i, l;
+
+ n_left = n_buffers;
+ b = buffers;
+ i = s->current_replay_packet_index;
+ l = vec_len (s->replay_packet_templates);
+
+ while (n_left >= 1)
+ {
+ u32 bi0, n0;
+ vlib_buffer_t * b0;
+ u8 * d0;
+
+ bi0 = b[0];
+ b += 1;
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
+ /* was s->sw_if_index[VLIB_TX]; */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ d0 = vec_elt (s->replay_packet_templates, i);
+
+ n0 = n_data;
+ if (data_offset + n_data >= vec_len (d0))
+ n0 = vec_len (d0) > data_offset ? vec_len (d0) - data_offset : 0;
+
+ b0->current_length = n0;
+
+ memcpy (b0->data, d0 + data_offset, n0);
+ i = i + 1 == l ? 0 : i + 1;
+ }
+}
+
+static_always_inline void
+init_buffers_inline (vlib_main_t * vm,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers,
+ u32 data_offset,
+ u32 n_data,
+ u32 set_data)
+{
+ u32 n_left, * b;
+ u8 * data, * mask;
+
+ if (vec_len (s->replay_packet_templates) > 0)
+ return init_replay_buffers_inline (vm, s, buffers, n_buffers, data_offset, n_data);
+
+ data = s->fixed_packet_data + data_offset;
+ mask = s->fixed_packet_data_mask + data_offset;
+ if (data + n_data >= vec_end (s->fixed_packet_data))
+ n_data = (data < vec_end (s->fixed_packet_data)
+ ? vec_end (s->fixed_packet_data) - data
+ : 0);
+ if (n_data > 0)
+ {
+ ASSERT (data + n_data <= vec_end (s->fixed_packet_data));
+ ASSERT (mask + n_data <= vec_end (s->fixed_packet_data_mask));
+ }
+
+ n_left = n_buffers;
+ b = buffers;
+
+ while (n_left >= 4)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+
+ /* Prefetch next iteration. */
+ vlib_prefetch_buffer_with_index (vm, b[2], STORE);
+ vlib_prefetch_buffer_with_index (vm, b[3], STORE);
+
+ bi0 = b[0];
+ bi1 = b[1];
+ b += 2;
+ n_left -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] =
+ vnet_buffer (b1)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
+
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] =
+ vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ if (set_data)
+ {
+ memcpy (b0->data, data, n_data);
+ memcpy (b1->data, data, n_data);
+ }
+ else
+ {
+ ASSERT (validate_buffer_data2 (b0, s, data_offset, n_data));
+ ASSERT (validate_buffer_data2 (b1, s, data_offset, n_data));
+ }
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+
+ bi0 = b[0];
+ b += 1;
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = s->sw_if_index[VLIB_RX];
+ /* s->sw_if_index[VLIB_TX]; */
+ vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ if (set_data)
+ memcpy (b0->data, data, n_data);
+ else
+ ASSERT (validate_buffer_data2 (b0, s, data_offset, n_data));
+ }
+}
+
+static void pg_buffer_init (vlib_main_t * vm,
+ vlib_buffer_free_list_t * fl,
+ u32 * buffers,
+ u32 n_buffers)
+{
+ pg_main_t * pg = &pg_main;
+ pg_stream_t * s;
+ uword bi, si;
+
+ si = fl->buffer_init_function_opaque & pow2_mask (24);
+ bi = fl->buffer_init_function_opaque >> 24;
+
+ s = pool_elt_at_index (pg->streams, si);
+
+ init_buffers_inline (vm, s, buffers, n_buffers,
+ /* data_offset */ bi * s->buffer_bytes,
+ /* n_data */ s->buffer_bytes,
+ /* set_data */ 1);
+}
+
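+/*
+ * Fill one chain level's fifo with freshly allocated buffers.  The
+ * free-list init opaque packs the stream index in its low 24 bits and
+ * the chain (buffer) index above, which pg_buffer_init decodes so
+ * that recycled buffers re-populate their fixed data.  With DPDK,
+ * buffers never come back through pg free lists, so the data is
+ * rewritten on every allocation instead.
+ */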
+static u32
+pg_stream_fill_helper (pg_main_t * pg,
+ pg_stream_t * s,
+ pg_buffer_index_t * bi,
+ u32 * buffers,
+ u32 * next_buffers,
+ u32 n_alloc)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ vlib_buffer_free_list_t * f;
+ uword is_start_of_packet = bi == s->buffer_indices;
+ u32 n_allocated;
+
+ f = vlib_buffer_get_free_list (vm, bi->free_list_index);
+
+ /*
+ * Historically, the pg maintained its own free lists and
+ * device drivers tx paths would return pkts. With the DPDK,
+ * that doesn't happen.
+ */
+ if (DPDK == 0 && ! (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE))
+ f->buffer_init_function = pg_buffer_init;
+ f->buffer_init_function_opaque =
+ (s - pg->streams) | ((bi - s->buffer_indices) << 24);
+
+ if (is_start_of_packet)
+ vnet_buffer (&f->buffer_init_template)->sw_if_index[VLIB_RX]
+ = vnet_main.local_interface_sw_if_index;
+
+ n_allocated = vlib_buffer_alloc_from_free_list (vm,
+ buffers,
+ n_alloc,
+ bi->free_list_index);
+ if (n_allocated == 0)
+ return 0;
+
+ /*
+ * We can't assume we got all the buffers we asked for...
+ * This never worked until recently.
+ */
+ n_alloc = n_allocated;
+
+ /* Reinitialize buffers */
+ if (DPDK == 0 || CLIB_DEBUG > 0
+ || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE))
+ init_buffers_inline
+ (vm, s,
+ buffers,
+ n_alloc,
+ (bi - s->buffer_indices) * s->buffer_bytes /* data offset */,
+ s->buffer_bytes,
+ /* set_data */
+ DPDK == 1 || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE) != 0);
+
+ /* $$$ this doesn't work at the moment */
+ ASSERT(next_buffers == 0);
+ if (next_buffers)
+ pg_set_next_buffer_pointers (pg, s, buffers, next_buffers, n_alloc);
+
+ if (is_start_of_packet)
+ {
+ if (vec_len (s->replay_packet_templates) > 0)
+ {
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_interface_main_t * im = &vnm->interface_main;
+ vnet_sw_interface_t * si =
+ vnet_get_sw_interface (vnm, s->sw_if_index[VLIB_RX]);
+ u32 l = 0;
+ u32 i;
+ for (i = 0; i < n_alloc; i++)
+ l += vlib_buffer_index_length_in_chain (vm, buffers[i]);
+ vlib_increment_combined_counter (im->combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ si->sw_if_index,
+ n_alloc,
+ l);
+ s->current_replay_packet_index += n_alloc;
+ s->current_replay_packet_index %=
+ vec_len (s->replay_packet_templates);
+ }
+ else
+ {
+ pg_generate_set_lengths (pg, s, buffers, n_alloc);
+ if (vec_len (s->buffer_indices) > 1)
+ pg_generate_fix_multi_buffer_lengths (pg, s, buffers, n_alloc);
+
+ pg_generate_edit (pg, s, buffers, n_alloc);
+ }
+ }
+
+ return n_alloc;
+}
+
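+/*
+ * Top up every per-chain fifo so at least n_buffers packets are
+ * ready.  Levels are filled from the last buffer index back to the
+ * first so a level's new indices could serve as next_buffers for the
+ * level above (that hand-off is currently disabled in the helper).
+ * A tail that wraps the fifo ring is handled as two helper calls.
+ */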
+static u32
+pg_stream_fill (pg_main_t * pg, pg_stream_t * s, u32 n_buffers)
+{
+ pg_buffer_index_t * bi;
+ word i, n_in_fifo, n_alloc, n_free, n_added;
+ u32 * tail, * start, * end, * last_tail, * last_start;
+
+ bi = s->buffer_indices;
+
+ n_in_fifo = clib_fifo_elts (bi->buffer_fifo);
+ if (n_in_fifo >= n_buffers)
+ return n_in_fifo;
+
+ n_alloc = n_buffers - n_in_fifo;
+
+ /* Round up, but never generate more than limit. */
+ n_alloc = clib_max (VLIB_FRAME_SIZE, n_alloc);
+
+ if (s->n_packets_limit > 0
+ && s->n_packets_generated + n_in_fifo + n_alloc >= s->n_packets_limit)
+ {
+ n_alloc = s->n_packets_limit - s->n_packets_generated - n_in_fifo;
+ if (n_alloc < 0)
+ n_alloc = 0;
+ }
+
+ /* All buffer fifos should have the same size. */
+ if (CLIB_DEBUG > 0)
+ {
+ uword l = ~0, e;
+ vec_foreach (bi, s->buffer_indices)
+ {
+ e = clib_fifo_elts (bi->buffer_fifo);
+ if (bi == s->buffer_indices)
+ l = e;
+ ASSERT (l == e);
+ }
+ }
+
+ last_tail = last_start = 0;
+ n_added = n_alloc;
+
+ for (i = vec_len (s->buffer_indices) - 1; i >= 0; i--)
+ {
+ bi = vec_elt_at_index (s->buffer_indices, i);
+
+ n_free = clib_fifo_free_elts (bi->buffer_fifo);
+ if (n_free < n_alloc)
+ clib_fifo_resize (bi->buffer_fifo, n_alloc - n_free);
+
+ tail = clib_fifo_advance_tail (bi->buffer_fifo, n_alloc);
+ start = bi->buffer_fifo;
+ end = clib_fifo_end (bi->buffer_fifo);
+
+ if (tail + n_alloc <= end)
+ {
+ n_added = pg_stream_fill_helper (pg, s, bi, tail, last_tail, n_alloc);
+ }
+ else
+ {
+ u32 n = clib_min (end - tail, n_alloc);
+ n_added = pg_stream_fill_helper (pg, s, bi, tail, last_tail, n);
+
+ if (n_added == n && n_alloc > n_added)
+ {
+ n_added += pg_stream_fill_helper
+ (pg, s, bi, start, last_start, n_alloc - n_added);
+ }
+ }
+
+ if (PREDICT_FALSE (n_added < n_alloc))
+ tail = clib_fifo_advance_tail (bi->buffer_fifo, n_added - n_alloc);
+
+ last_tail = tail;
+ last_start = start;
+
+ /* Verify that pkts in the fifo are properly allocated */
+#if DPDK == 1
+ if (CLIB_DEBUG > 0)
+ {
+ u32 *bi0;
+ vlib_main_t * vm = vlib_get_main();
+ clib_fifo_foreach (bi0, bi->buffer_fifo,
+ ({
+ vlib_buffer_t * b;
+ struct rte_mbuf *mb;
+
+ b = vlib_get_buffer(vm, bi0[0]);
+ mb = (struct rte_mbuf *)b - 1;
+ ASSERT(rte_mbuf_refcnt_read(mb) == 1);
+ }));
+ }
+#endif
+ }
+
+ return n_in_fifo + n_added;
+}
+
+typedef struct {
+ u32 stream_index;
+
+ u32 packet_length;
+
+ /* Use pre data for packet data. */
+ vlib_buffer_t buffer;
+} pg_input_trace_t;
+
+static u8 * format_pg_input_trace (u8 * s, va_list * va)
+{
+ vlib_main_t * vm = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ pg_input_trace_t * t = va_arg (*va, pg_input_trace_t *);
+ pg_main_t * pg = &pg_main;
+ pg_stream_t * stream;
+ vlib_node_t * n;
+ uword indent = format_get_indent (s);
+
+ stream = 0;
+ if (! pool_is_free_index (pg->streams, t->stream_index))
+ stream = pool_elt_at_index (pg->streams, t->stream_index);
+
+ if (stream)
+ s = format (s, "stream %v", pg->streams[t->stream_index].name);
+ else
+ s = format (s, "stream %d", t->stream_index);
+
+ s = format (s, ", %d bytes", t->packet_length);
+
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ format_vlib_buffer, &t->buffer);
+
+ s = format (s, "\n%U",
+ format_white_space, indent);
+
+ n = 0;
+ if (stream)
+ n = vlib_get_node (vm, stream->node_index);
+
+ if (n && n->format_buffer)
+ s = format (s, "%U", n->format_buffer,
+ t->buffer.pre_data,
+ sizeof (t->buffer.pre_data));
+ else
+ s = format (s, "%U",
+ format_hex_bytes, t->buffer.pre_data,
+ ARRAY_LEN (t->buffer.pre_data));
+ return s;
+}
+
+static void
+pg_input_trace (pg_main_t * pg,
+ vlib_node_runtime_t * node,
+ pg_stream_t * s,
+ u32 * buffers,
+ u32 n_buffers)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 * b, n_left, stream_index, next_index;
+
+ n_left = n_buffers;
+ b = buffers;
+ stream_index = s - pg->streams;
+ next_index = s->next_index;
+
+ while (n_left >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ pg_input_trace_t * t0, * t1;
+
+ bi0 = b[0];
+ bi1 = b[1];
+ b += 2;
+ n_left -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ vlib_trace_buffer (vm, node, next_index, b0, /* follow_chain */ 1);
+ vlib_trace_buffer (vm, node, next_index, b1, /* follow_chain */ 1);
+
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+ t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0]));
+
+ t0->stream_index = stream_index;
+ t1->stream_index = stream_index;
+
+ t0->packet_length = vlib_buffer_length_in_chain (vm, b0);
+ t1->packet_length = vlib_buffer_length_in_chain (vm, b1);
+
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b1->pre_data));
+
+ memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));
+ memcpy (t1->buffer.pre_data, b1->data, sizeof (t1->buffer.pre_data));
+ }
+
+ while (n_left >= 1)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ pg_input_trace_t * t0;
+
+ bi0 = b[0];
+ b += 1;
+ n_left -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ vlib_trace_buffer (vm, node, next_index, b0, /* follow_chain */ 1);
+ t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
+
+ t0->stream_index = stream_index;
+ t0->packet_length = vlib_buffer_length_in_chain (vm, b0);
+ memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
+ memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));
+ }
+}
+
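+/*
+ * Copy buffer indices from the head of the first chain fifo straight
+ * into the next frame (two copies when the ring wraps), advance every
+ * chain fifo head in lock step, and trace the first n_trace packets.
+ */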
+static uword
+pg_generate_packets (vlib_node_runtime_t * node,
+ pg_main_t * pg,
+ pg_stream_t * s,
+ uword n_packets_to_generate)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ u32 * to_next, n_this_frame, n_left, n_trace, n_packets_in_fifo;
+ uword n_packets_generated;
+ pg_buffer_index_t * bi, * bi0;
+
+ bi0 = s->buffer_indices;
+
+ n_packets_in_fifo = pg_stream_fill (pg, s, n_packets_to_generate);
+ n_packets_to_generate = clib_min (n_packets_in_fifo, n_packets_to_generate);
+ n_packets_generated = 0;
+
+ while (n_packets_to_generate > 0)
+ {
+ u32 * head, * start, * end;
+
+ vlib_get_next_frame (vm, node, s->next_index, to_next, n_left);
+
+ n_this_frame = n_packets_to_generate;
+ if (n_this_frame > n_left)
+ n_this_frame = n_left;
+
+ start = bi0->buffer_fifo;
+ end = clib_fifo_end (bi0->buffer_fifo);
+ head = clib_fifo_head (bi0->buffer_fifo);
+
+ if (head + n_this_frame <= end)
+ vlib_copy_buffers (to_next, head, n_this_frame);
+ else
+ {
+ u32 n = end - head;
+ vlib_copy_buffers (to_next + 0, head, n);
+ vlib_copy_buffers (to_next + n, start, n_this_frame - n);
+ }
+
+ vec_foreach (bi, s->buffer_indices)
+ clib_fifo_advance_head (bi->buffer_fifo, n_this_frame);
+
+ n_trace = vlib_get_trace_count (vm, node);
+ if (n_trace > 0)
+ {
+ u32 n = clib_min (n_trace, n_this_frame);
+ pg_input_trace (pg, node, s, to_next, n);
+ vlib_set_trace_count (vm, node, n_trace - n);
+ }
+ n_packets_to_generate -= n_this_frame;
+ n_packets_generated += n_this_frame;
+ n_left -= n_this_frame;
+ vlib_put_next_frame (vm, node, s->next_index, n_left);
+ }
+
+ return n_packets_generated;
+}
+
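+/*
+ * Per-stream generation with a fractional credit rate limiter: each
+ * call earns dt * rate credits, the integer part is spent on packets
+ * this frame and only the fraction carries over.  Because the integer
+ * part is subtracted before the frame-size clamp, credit beyond one
+ * frame is deliberately discarded rather than accumulated.
+ */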
+static uword
+pg_input_stream (vlib_node_runtime_t * node,
+ pg_main_t * pg,
+ pg_stream_t * s)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ uword n_packets;
+ f64 time_now, dt;
+
+ if (s->n_packets_limit > 0
+ && s->n_packets_generated >= s->n_packets_limit)
+ {
+ pg_stream_enable_disable (pg, s, /* want_enabled */ 0);
+ return 0;
+ }
+
+ /* Apply rate limit. */
+ time_now = vlib_time_now (vm);
+ if (s->time_last_generate == 0)
+ s->time_last_generate = time_now;
+
+ dt = time_now - s->time_last_generate;
+ s->time_last_generate = time_now;
+
+ n_packets = VLIB_FRAME_SIZE;
+ if (s->rate_packets_per_second > 0)
+ {
+ s->packet_accumulator += dt * s->rate_packets_per_second;
+ n_packets = s->packet_accumulator;
+
+ /* Never allow accumulator to grow if we get behind. */
+ s->packet_accumulator -= n_packets;
+ }
+
+ /* Apply fixed limit. */
+ if (s->n_packets_limit > 0
+ && s->n_packets_generated + n_packets > s->n_packets_limit)
+ n_packets = s->n_packets_limit - s->n_packets_generated;
+
+ /* Generate up to one frame's worth of packets. */
+ if (n_packets > VLIB_FRAME_SIZE)
+ n_packets = VLIB_FRAME_SIZE;
+
+ if (n_packets > 0)
+ n_packets = pg_generate_packets (node, pg, s, n_packets);
+
+ s->n_packets_generated += n_packets;
+
+ return n_packets;
+}
+
+uword
+pg_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ uword i;
+ pg_main_t * pg = &pg_main;
+ uword n_packets = 0;
+
+ clib_bitmap_foreach (i, pg->enabled_streams, ({
+ n_packets += pg_input_stream (node, pg, vec_elt_at_index (pg->streams, i));
+ }));
+
+ return n_packets;
+}
+
+VLIB_REGISTER_NODE (pg_input_node) = {
+ .function = pg_input,
+ .name = "pg-input",
+ .type = VLIB_NODE_TYPE_INPUT,
+
+ .format_trace = format_pg_input_trace,
+
+ /* Input node will be left disabled until a stream is active. */
+ .state = VLIB_NODE_STATE_DISABLED,
+};
diff --git a/vnet/vnet/pg/output.c b/vnet/vnet/pg/output.c
new file mode 100644
index 00000000000..cc098da21c6
--- /dev/null
+++ b/vnet/vnet/pg/output.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_output.c: packet generator output
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+
+uword
+pg_output (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ uword n_buffers = frame->n_vectors;
+ vlib_buffer_free_no_next (vm, buffers, n_buffers);
+ return n_buffers;
+}
diff --git a/vnet/vnet/pg/pg.h b/vnet/vnet/pg/pg.h
new file mode 100644
index 00000000000..63bfb18abca
--- /dev/null
+++ b/vnet/vnet/pg/pg.h
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg.h: VLIB packet generator
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_pg_h
+#define included_vlib_pg_h
+
+#include <vlib/vlib.h> /* for VLIB_N_RX_TX */
+#include <vnet/pg/edit.h>
+#include <vppinfra/fifo.h> /* for buffer_fifo */
+
+struct pg_main_t;
+struct pg_stream_t;
+
+typedef struct pg_edit_group_t {
+ /* Edits in this group. */
+ pg_edit_t * edits;
+
+ /* Vector of non-fixed edits for this group. */
+ pg_edit_t * non_fixed_edits;
+
+ /* Fixed edits for this group. */
+ u8 * fixed_packet_data;
+ u8 * fixed_packet_data_mask;
+
+ /* Byte offset where packet data begins. */
+ u32 start_byte_offset;
+
+ /* Number of packet bytes for this edit group. */
+ u32 n_packet_bytes;
+
+ /* Function to perform miscellaneous edits (e.g. set IP checksum, ...). */
+ void (* edit_function) (struct pg_main_t * pg,
+ struct pg_stream_t * s,
+ struct pg_edit_group_t * g,
+ u32 * buffers,
+ u32 n_buffers);
+
+ /* Opaque data for edit function's use. */
+ uword edit_function_opaque;
+} pg_edit_group_t;
+
+/* Packets are made of multiple buffers chained together.
+ This struct keeps track of data per-chain index. */
+typedef struct {
+ /* Vector of buffer edits for this stream and buffer index. */
+ pg_edit_t * edits;
+
+ /* Buffers pre-initialized with fixed buffer data for this stream. */
+ u32 * buffer_fifo;
+
+ /* Buffer free list for this buffer index in stream. */
+ u32 free_list_index;
+} pg_buffer_index_t;
+
+typedef struct pg_stream_t {
+ /* Stream name. */
+ u8 * name;
+
+ u32 flags;
+
+ /* Stream is currently enabled. */
+#define PG_STREAM_FLAGS_IS_ENABLED (1 << 0)
+#define PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE (1 << 1)
+
+ /* Edit groups are created by each protocol level (e.g. ethernet,
+ ip4, tcp, ...). */
+ pg_edit_group_t * edit_groups;
+
+ pg_edit_type_t packet_size_edit_type;
+
+ /* Min/max packet size. */
+ u32 min_packet_bytes, max_packet_bytes;
+
+ /* Vector of non-fixed edits for this stream.
+ All fixed edits are performed and placed into fixed_packet_data. */
+ pg_edit_t * non_fixed_edits;
+
+ /* Packet data with all fixed edits performed.
+   All packets in the stream are initialized according to this data.
+ Mask specifies which bits of packet data are covered by fixed edits. */
+ u8 * fixed_packet_data, * fixed_packet_data_mask;
+
+ /* Size to use for buffers. 0 means use buffers big enough
+ for max_packet_bytes. */
+ u32 buffer_bytes;
+
+ /* Last packet length if packet size edit type is increment. */
+ u32 last_increment_packet_size;
+
+ /* Index into main interface pool for this stream. */
+ u32 pg_if_index;
+
+  /* Interface used to mark packets for this stream.  May be different
+     from the hw/sw index in the pg main interface pool.  They will be
+     different if this stream is being used to generate buffers as if
+     they were received on a non-pg interface.  For example, suppose you
+     are trying to test vlan code and you want to generate buffers that
+     appear to come from an ethernet interface. */
+ u32 sw_if_index[VLIB_N_RX_TX];
+
+ /* Node where stream's buffers get put. */
+ u32 node_index;
+
+ /* Output next index to reach output node from stream input node. */
+ u32 next_index;
+
+ /* Number of packets currently generated. */
+ u64 n_packets_generated;
+
+ /* Stream is disabled when packet limit is reached.
+ Zero means no packet limit. */
+ u64 n_packets_limit;
+
+ /* Rate for this stream in packets/second.
+ Zero means unlimited rate. */
+ f64 rate_packets_per_second;
+
+ f64 time_last_generate;
+
+ f64 packet_accumulator;
+
+ pg_buffer_index_t * buffer_indices;
+
+ u8 ** replay_packet_templates;
+ u32 current_replay_packet_index;
+} pg_stream_t;
+
+always_inline void
+pg_buffer_index_free (pg_buffer_index_t * bi)
+{
+ vec_free (bi->edits);
+ clib_fifo_free (bi->buffer_fifo);
+}
+
+always_inline void
+pg_edit_group_free (pg_edit_group_t * g)
+{
+ pg_edit_t * e;
+ vec_foreach (e, g->edits)
+ pg_edit_free (e);
+ vec_free (g->edits);
+ vec_free (g->fixed_packet_data);
+ vec_free (g->fixed_packet_data_mask);
+}
+
+always_inline void
+pg_stream_free (pg_stream_t * s)
+{
+ pg_edit_group_t * g;
+ pg_edit_t * e;
+ vec_foreach (e, s->non_fixed_edits)
+ pg_edit_free (e);
+ vec_free (s->non_fixed_edits);
+ vec_foreach (g, s->edit_groups)
+ pg_edit_group_free (g);
+ vec_free (s->edit_groups);
+ vec_free (s->fixed_packet_data);
+ vec_free (s->fixed_packet_data_mask);
+ vec_free (s->name);
+
+ {
+ pg_buffer_index_t * bi;
+ vec_foreach (bi, s->buffer_indices)
+ pg_buffer_index_free (bi);
+ vec_free (s->buffer_indices);
+ }
+}
+
+always_inline int
+pg_stream_is_enabled (pg_stream_t * s)
+{ return (s->flags & PG_STREAM_FLAGS_IS_ENABLED) != 0; }
+
+always_inline pg_edit_group_t *
+pg_stream_get_group (pg_stream_t * s, u32 group_index)
+{ return vec_elt_at_index (s->edit_groups, group_index); }
+
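+/* Each protocol layer reserves its edits with pg_create_edit_group;
+   n_edit_bytes must be a whole number of pg_edit_t's.  pg_add_edits
+   appends further edits to an existing group and grows its packet
+   byte count. */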
+always_inline void *
+pg_create_edit_group (pg_stream_t * s,
+ int n_edit_bytes,
+ int n_packet_bytes,
+ u32 * group_index)
+{
+ pg_edit_group_t * g;
+ int n_edits;
+
+ vec_add2 (s->edit_groups, g, 1);
+ if (group_index)
+ *group_index = g - s->edit_groups;
+
+ ASSERT (n_edit_bytes % sizeof (pg_edit_t) == 0);
+ n_edits = n_edit_bytes / sizeof (pg_edit_t);
+ vec_resize (g->edits, n_edits);
+
+ g->n_packet_bytes = n_packet_bytes;
+
+ return g->edits;
+}
+
+always_inline void *
+pg_add_edits (pg_stream_t * s, int n_edit_bytes, int n_packet_bytes,
+ u32 group_index)
+{
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ pg_edit_t * e;
+ int n_edits;
+ ASSERT (n_edit_bytes % sizeof (pg_edit_t) == 0);
+ n_edits = n_edit_bytes / sizeof (pg_edit_t);
+ vec_add2 (g->edits, e, n_edits);
+ g->n_packet_bytes += n_packet_bytes;
+ return e;
+}
+
+always_inline void *
+pg_get_edit_group (pg_stream_t * s, u32 group_index)
+{
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ return g->edits;
+}
+
+/* Number of bytes for all groups >= given group. */
+always_inline uword
+pg_edit_group_n_bytes (pg_stream_t * s, u32 group_index)
+{
+ pg_edit_group_t * g;
+ uword n_bytes = 0;
+
+ for (g = s->edit_groups + group_index; g < vec_end (s->edit_groups); g++)
+ n_bytes += g->n_packet_bytes;
+ return n_bytes;
+}
+
+always_inline void
+pg_free_edit_group (pg_stream_t * s)
+{
+ uword i = vec_len (s->edit_groups) - 1;
+ pg_edit_group_t * g = pg_stream_get_group (s, i);
+
+ pg_edit_group_free (g);
+ memset (g, 0, sizeof (g[0]));
+ _vec_len (s->edit_groups) = i;
+}
+
+typedef struct {
+ /* VLIB interface indices. */
+ u32 hw_if_index, sw_if_index;
+
+ /* Identifies stream for this interface. */
+ u32 stream_index;
+} pg_interface_t;
+
+/* Per VLIB node data. */
+typedef struct {
+ /* Parser function indexed by node index. */
+ unformat_function_t * unformat_edit;
+} pg_node_t;
+
+typedef struct pg_main_t {
+ /* Back pointer to main structure. */
+ vlib_main_t * vlib_main;
+
+ /* Pool of streams. */
+ pg_stream_t * streams;
+
+ /* Bitmap indicating which streams are currently enabled. */
+ uword * enabled_streams;
+
+ /* Hash mapping name -> stream index. */
+ uword * stream_index_by_name;
+
+ /* Vector of interfaces. */
+ pg_interface_t * interfaces;
+
+ /* Per VLIB node information. */
+ pg_node_t * nodes;
+
+ u32 * free_interfaces;
+} pg_main_t;
+
+/* Global main structure. */
+extern pg_main_t pg_main;
+
+/* Global node. */
+extern vlib_node_registration_t pg_input_node;
+
+/* Buffer generator input, output node functions. */
+vlib_node_function_t pg_input, pg_output;
+
+/* Stream add/delete. */
+void pg_stream_del (pg_main_t * pg, uword index);
+void pg_stream_add (pg_main_t * pg, pg_stream_t * s_init);
+
+/* Enable/disable stream. */
+void pg_stream_enable_disable (pg_main_t * pg, pg_stream_t * s, int is_enable);
+
+/* Find/create free packet-generator interface index. */
+u32 pg_interface_find_free (pg_main_t * pg, uword stream_index);
+
+always_inline pg_node_t *
+pg_get_node (uword node_index)
+{
+ pg_main_t * pg = &pg_main;
+ vec_validate (pg->nodes, node_index);
+ return pg->nodes + node_index;
+}
+
+void pg_edit_group_get_fixed_packet_data (pg_stream_t * s,
+ u32 group_index,
+ void * fixed_packet_data,
+ void * fixed_packet_data_mask);
+
+#endif /* included_vlib_pg_h */
diff --git a/vnet/vnet/pg/stream.c b/vnet/vnet/pg/stream.c
new file mode 100644
index 00000000000..1dd5624338f
--- /dev/null
+++ b/vnet/vnet/pg/stream.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pg_stream.c: packet generator streams
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+
+/* Mark stream active or inactive. */
+void pg_stream_enable_disable (pg_main_t * pg, pg_stream_t * s, int want_enabled)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ pg_interface_t * pi = vec_elt_at_index (pg->interfaces, s->pg_if_index);
+
+ want_enabled = want_enabled != 0;
+
+ if (pg_stream_is_enabled (s) == want_enabled)
+ /* No change necessary. */
+ return;
+
+ if (want_enabled)
+ s->n_packets_generated = 0;
+
+ /* Toggle enabled flag. */
+ s->flags ^= PG_STREAM_FLAGS_IS_ENABLED;
+
+ ASSERT (! pool_is_free (pg->streams, s));
+
+ pg->enabled_streams
+ = clib_bitmap_set (pg->enabled_streams, s - pg->streams, want_enabled);
+
+ vnet_hw_interface_set_flags (vnm, pi->hw_if_index,
+ (want_enabled
+ ? VNET_HW_INTERFACE_FLAG_LINK_UP
+ : 0));
+
+ vnet_sw_interface_set_flags (vnm, pi->sw_if_index,
+ (want_enabled
+ ? VNET_SW_INTERFACE_FLAG_ADMIN_UP
+ : 0));
+
+ vlib_node_set_state (pg->vlib_main,
+ pg_input_node.index,
+ (clib_bitmap_is_zero (pg->enabled_streams)
+ ? VLIB_NODE_STATE_DISABLED
+ : VLIB_NODE_STATE_POLLING));
+
+ s->packet_accumulator = 0;
+ s->time_last_generate = 0;
+}
+
+static u8 * format_pg_interface_name (u8 * s, va_list * args)
+{
+ pg_main_t * pg = &pg_main;
+ u32 if_index = va_arg (*args, u32);
+ pg_interface_t * pi;
+
+ pi = vec_elt_at_index (pg->interfaces, if_index);
+ s = format (s, "pg/stream-%d", pi->stream_index);
+
+ return s;
+}
+
+VNET_DEVICE_CLASS (pg_dev_class,static) = {
+ .name = "pg",
+ .tx_function = pg_output,
+ .format_device_name = format_pg_interface_name,
+};
+
+static uword pg_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ u16 * h = rewrite;
+
+ if (max_rewrite_bytes < sizeof (h[0]))
+ return 0;
+
+ h[0] = clib_host_to_net_u16 (l3_type);
+ return sizeof (h[0]);
+}
+
+VNET_HW_INTERFACE_CLASS (pg_interface_class,static) = {
+ .name = "Packet generator",
+ .set_rewrite = pg_set_rewrite,
+};
+
+u32 pg_interface_find_free (pg_main_t * pg, uword stream_index)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ pg_interface_t * pi;
+ vnet_hw_interface_t * hi;
+ u32 i, l;
+
+ if ((l = vec_len (pg->free_interfaces)) > 0)
+ {
+ i = pg->free_interfaces[l - 1];
+ _vec_len (pg->free_interfaces) = l - 1;
+ pi = vec_elt_at_index (pg->interfaces, i);
+ pi->stream_index = stream_index;
+ }
+ else
+ {
+ i = vec_len (pg->interfaces);
+ vec_add2 (pg->interfaces, pi, 1);
+
+ pi->stream_index = stream_index;
+ pi->hw_if_index = vnet_register_interface (vnm,
+ pg_dev_class.index, i,
+ pg_interface_class.index, stream_index);
+ hi = vnet_get_hw_interface (vnm, pi->hw_if_index);
+ pi->sw_if_index = hi->sw_if_index;
+ }
+
+ return i;
+}
+
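+/*
+ * Bake one edit into the group's fixed packet data.  Fixed edits are
+ * rendered bit-by-bit into fixed_packet_data with a parallel validity
+ * mask, handling ragged partial bytes at both ends.  Non-fixed edits
+ * are only primed (last_increment_value starts at the low bound) and,
+ * on commit, queued on non_fixed_edits for run-time application.
+ */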
+static void do_edit (pg_stream_t * stream,
+ pg_edit_group_t * g,
+ pg_edit_t * e,
+ uword want_commit)
+{
+ u32 i, i0, i1, mask, n_bits_left;
+ u8 * v, * s, * m;
+
+ i0 = e->lsb_bit_offset / BITS (u8);
+
+ /* Make space for edit in value and mask. */
+ vec_validate (g->fixed_packet_data, i0);
+ vec_validate (g->fixed_packet_data_mask, i0);
+
+ if (e->type != PG_EDIT_FIXED)
+ {
+ switch (e->type)
+ {
+ case PG_EDIT_RANDOM:
+ case PG_EDIT_INCREMENT:
+ e->last_increment_value = pg_edit_get_value (e, PG_EDIT_LO);
+ break;
+
+ default:
+ break;
+ }
+
+ if (want_commit)
+ {
+ ASSERT(e->type != PG_EDIT_INVALID_TYPE);
+ vec_add1 (g->non_fixed_edits, e[0]);
+ }
+ return;
+ }
+
+ s = g->fixed_packet_data;
+ m = g->fixed_packet_data_mask;
+
+ n_bits_left = e->n_bits;
+ i0 = e->lsb_bit_offset / BITS (u8);
+ i1 = e->lsb_bit_offset % BITS (u8);
+
+ v = e->values[PG_EDIT_LO];
+ i = pg_edit_n_alloc_bytes (e) - 1;
+
+  /* Odd low order bits? */
+ if (i1 != 0 && n_bits_left > 0)
+ {
+ u32 n = clib_min (n_bits_left, BITS (u8) - i1);
+
+ mask = pow2_mask (n) << i1;
+
+ ASSERT (i0 < vec_len (s));
+ ASSERT (i < vec_len (v));
+ ASSERT ((v[i] &~ mask) == 0);
+
+ s[i0] |= v[i] & mask;
+ m[i0] |= mask;
+
+ i0--;
+ i--;
+ n_bits_left -= n;
+ }
+
+ /* Even bytes. */
+ while (n_bits_left >= 8)
+ {
+ ASSERT (i0 < vec_len (s));
+ ASSERT (i < vec_len (v));
+
+ s[i0] = v[i];
+ m[i0] = ~0;
+
+ i0--;
+ i--;
+ n_bits_left -= 8;
+ }
+
+ /* Odd high order bits. */
+ if (n_bits_left > 0)
+ {
+ mask = pow2_mask (n_bits_left);
+
+ ASSERT (i0 < vec_len (s));
+ ASSERT (i < vec_len (v));
+ ASSERT ((v[i] &~ mask) == 0);
+
+ s[i0] |= v[i] & mask;
+ m[i0] |= mask;
+ }
+
+ if (want_commit)
+ pg_edit_free (e);
+}
+
+void pg_edit_group_get_fixed_packet_data (pg_stream_t * s,
+ u32 group_index,
+ void * packet_data,
+ void * packet_data_mask)
+{
+ pg_edit_group_t * g = pg_stream_get_group (s, group_index);
+ pg_edit_t * e;
+
+ vec_foreach (e, g->edits)
+ do_edit (s, g, e, /* want_commit */ 0);
+
+ memcpy (packet_data, g->fixed_packet_data, vec_len (g->fixed_packet_data));
+ memcpy (packet_data_mask, g->fixed_packet_data_mask, vec_len (g->fixed_packet_data_mask));
+}
+
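+/*
+ * Flatten the per-protocol edit groups into stream-level vectors:
+ * each group's fixed data and mask are appended in protocol order,
+ * and the bit offsets of surviving non-fixed edits are rebased by the
+ * group's start byte offset so they address the final packet layout.
+ */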
+static void perform_fixed_edits (pg_stream_t * s)
+{
+ pg_edit_group_t * g;
+ pg_edit_t * e;
+ word i;
+
+ for (i = vec_len (s->edit_groups) - 1; i >= 0; i--)
+ {
+ g = vec_elt_at_index (s->edit_groups, i);
+ vec_foreach (e, g->edits)
+ do_edit (s, g, e, /* want_commit */ 1);
+
+ /* All edits have either been performed or added to
+ g->non_fixed_edits. So, we can delete the vector. */
+ vec_free (g->edits);
+ }
+
+ vec_free (s->fixed_packet_data_mask);
+ vec_free (s->fixed_packet_data);
+ vec_foreach (g, s->edit_groups)
+ {
+ int i;
+ g->start_byte_offset = vec_len (s->fixed_packet_data);
+
+ /* Relocate and copy non-fixed edits from group to stream. */
+ vec_foreach (e, g->non_fixed_edits)
+ e->lsb_bit_offset += g->start_byte_offset * BITS (u8);
+
+ for (i = 0; i < vec_len (g->non_fixed_edits); i++)
+ ASSERT(g->non_fixed_edits[i].type != PG_EDIT_INVALID_TYPE);
+
+ vec_add (s->non_fixed_edits,
+ g->non_fixed_edits,
+ vec_len (g->non_fixed_edits));
+ vec_free (g->non_fixed_edits);
+
+ vec_add (s->fixed_packet_data,
+ g->fixed_packet_data,
+ vec_len (g->fixed_packet_data));
+ vec_add (s->fixed_packet_data_mask,
+ g->fixed_packet_data_mask,
+ vec_len (g->fixed_packet_data_mask));
+ }
+}
+
+void pg_stream_add (pg_main_t * pg, pg_stream_t * s_init)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ pg_stream_t * s;
+ uword * p;
+
+ if (! pg->stream_index_by_name)
+ pg->stream_index_by_name
+ = hash_create_vec (0, sizeof (s->name[0]), sizeof (uword));
+
+ /* Delete any old stream with the same name. */
+ if (s_init->name
+ && (p = hash_get_mem (pg->stream_index_by_name, s_init->name)))
+ {
+ pg_stream_del (pg, p[0]);
+ }
+
+ pool_get (pg->streams, s);
+ s[0] = s_init[0];
+
+ /* Give it a name. */
+ if (! s->name)
+ s->name = format (0, "stream%d", s - pg->streams);
+ else
+ s->name = vec_dup (s->name);
+
+ hash_set_mem (pg->stream_index_by_name, s->name, s - pg->streams);
+
+ /* Get fixed part of buffer data. */
+ perform_fixed_edits (s);
+
+ /* Determine packet size. */
+ switch (s->packet_size_edit_type)
+ {
+ case PG_EDIT_INCREMENT:
+ case PG_EDIT_RANDOM:
+ if (s->min_packet_bytes == s->max_packet_bytes)
+ s->packet_size_edit_type = PG_EDIT_FIXED;
+ break;
+
+ default:
+ /* Get packet size from fixed edits. */
+ s->packet_size_edit_type = PG_EDIT_FIXED;
+ if (! s->replay_packet_templates)
+ s->min_packet_bytes = s->max_packet_bytes = vec_len (s->fixed_packet_data);
+ break;
+ }
+
+ s->last_increment_packet_size = s->min_packet_bytes;
+
+ {
+ pg_buffer_index_t * bi;
+ int n;
+
+ if (! s->buffer_bytes)
+ s->buffer_bytes = s->max_packet_bytes;
+
+ s->buffer_bytes = vlib_buffer_round_size (s->buffer_bytes);
+
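+ /* Buffers per packet: ceil (max_packet_bytes / buffer_bytes), e.g.
+ (illustrative) 1500-byte packets in 512-byte buffers => n = 3. */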
+ n = s->max_packet_bytes / s->buffer_bytes;
+ n += (s->max_packet_bytes % s->buffer_bytes) != 0;
+
+ vec_resize (s->buffer_indices, n);
+
+ vec_foreach (bi, s->buffer_indices)
+ bi->free_list_index = vlib_buffer_create_free_list (vm, s->buffer_bytes,
+ "pg stream %d buffer #%d",
+ s - pg->streams,
+ 1 + (bi - s->buffer_indices));
+ }
+
+ /* Find an interface to use. */
+ s->pg_if_index = pg_interface_find_free (pg, s - pg->streams);
+
+ {
+ pg_interface_t * pi = vec_elt_at_index (pg->interfaces, s->pg_if_index);
+ vlib_rx_or_tx_t rx_or_tx;
+
+ vlib_foreach_rx_tx (rx_or_tx)
+ {
+ if (s->sw_if_index[rx_or_tx] == ~0)
+ s->sw_if_index[rx_or_tx] = pi->sw_if_index;
+ }
+ }
+
+ /* Connect the graph. */
+ s->next_index = vlib_node_add_next (vm, pg_input_node.index, s->node_index);
+}
+
+void pg_stream_del (pg_main_t * pg, uword index)
+{
+ vlib_main_t * vm = pg->vlib_main;
+ pg_stream_t * s;
+ pg_buffer_index_t * bi;
+
+ s = pool_elt_at_index (pg->streams, index);
+
+ pg_stream_enable_disable (pg, s, /* want_enabled */ 0);
+ vec_add1 (pg->free_interfaces, s->pg_if_index);
+ hash_unset_mem (pg->stream_index_by_name, s->name);
+
+ vec_foreach (bi, s->buffer_indices)
+ {
+ vlib_buffer_delete_free_list (vm, bi->free_list_index);
+ clib_fifo_free (bi->buffer_fifo);
+ }
+
+ pg_stream_free (s);
+ pool_put (pg->streams, s);
+}
+
diff --git a/vnet/vnet/pipeline.h b/vnet/vnet/pipeline.h
new file mode 100644
index 00000000000..5a0d4dcc616
--- /dev/null
+++ b/vnet/vnet/pipeline.h
@@ -0,0 +1,453 @@
+/*
+ * vnet/pipeline.h: software pipeline
+ *
+ * Copyright (c) 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Usage example.
+ *
+ * #define NSTAGES 3 or whatever
+ *
+ * <Define pipeline stages>
+ *
+ * #include <vnet/pipeline.h>
+ *
+ * static uword my_node_fn (vlib_main_t * vm,
+ * vlib_node_runtime_t * node,
+ * vlib_frame_t * frame)
+ * {
+ * return dispatch_pipeline (vm, node, frame);
+ * }
+ *
+ */
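+/*
+ * Illustrative sketch (hypothetical stage helpers, assuming NSTAGES == 3):
+ *
+ * #define stage0 generic_stage0 // see below
+ *
+ * static inline void stage1 (vlib_main_t * vm,
+ * vlib_node_runtime_t * node,
+ * u32 buffer_index)
+ * { <per-packet work on the data prefetched by stage0> }
+ *
+ * static inline u32 last_stage (vlib_main_t * vm,
+ * vlib_node_runtime_t * node,
+ * u32 buffer_index)
+ * { <classify the packet> return <next-node index>; }
+ */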
+
+#ifndef NSTAGES
+#error files which #include <vnet/pipeline.h> must define NSTAGES
+#endif
+
+#ifndef STAGE_INLINE
+#define STAGE_INLINE inline
+#endif
+
+/*
+ * A prefetch stride of 2 is quasi-equivalent to doubling the number
+ * of stages with every other pipeline stage empty.
+ */
+
+/*
+ * This is a typical first pipeline stage, which prefetches
+ * buffer metadata and the first line of pkt data.
+ * To use it:
+ * #define stage0 generic_stage0
+ */
+static STAGE_INLINE void generic_stage0 (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ u32 buffer_index)
+{
+ /* generic default stage 0 here */
+ vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+ vlib_prefetch_buffer_header (b, STORE);
+ CLIB_PREFETCH (b->data, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
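+/*
+ * Each dispatch_pipeline variant below runs in three phases:
+ * 1. a prologue which starts stage0 (and any earlier stages) until
+ * the pipeline is full or the frame is exhausted;
+ * 2. a steady state in which every stage runs each iteration and the
+ * oldest packet completes via last_stage() and is enqueued;
+ * 3. a drain loop which finishes the packets still in flight once no
+ * new input remains.
+ */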
+#if NSTAGES == 2
+
+static STAGE_INLINE uword
+dispatch_pipeline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index, next0;
+ int pi, pi_limit;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ pi_limit = clib_min (n_left_from, n_left_to_next);
+
+ for (pi = 0; pi < NSTAGES-1; pi++)
+ {
+ if(pi == pi_limit)
+ break;
+ stage0 (vm, node, from[pi]);
+ }
+
+ for (; pi < pi_limit; pi++)
+ {
+ stage0 (vm, node, from[pi]);
+ to_next[0] = from [pi - 1];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 1]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 1], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+
+ for (; pi < (pi_limit + (NSTAGES-1)); pi++)
+ {
+ if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
+ {
+ to_next[0] = from [pi - 1];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 1]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 1], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+ }
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ from += pi_limit;
+ }
+ return frame->n_vectors;
+}
+#endif
+
+#if NSTAGES == 3
+static STAGE_INLINE uword
+dispatch_pipeline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index, next0;
+ int pi, pi_limit;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ pi_limit = clib_min (n_left_from, n_left_to_next);
+
+ for (pi = 0; pi < NSTAGES-1; pi++)
+ {
+ if(pi == pi_limit)
+ break;
+ stage0 (vm, node, from[pi]);
+ if (pi-1 >= 0)
+ stage1 (vm, node, from[pi-1]);
+ }
+
+ for (; pi < pi_limit; pi++)
+ {
+ stage0 (vm, node, from[pi]);
+ stage1 (vm, node, from[pi-1]);
+ to_next[0] = from [pi - 2];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 2]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 2], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+
+
+ for (; pi < (pi_limit + (NSTAGES-1)); pi++)
+ {
+ if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
+ stage1 (vm, node, from[pi-1]);
+ if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
+ {
+ to_next[0] = from[pi - 2];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 2]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 2], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ from += pi_limit;
+ }
+ return frame->n_vectors;
+}
+#endif
+
+#if NSTAGES == 4
+static STAGE_INLINE uword
+dispatch_pipeline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index, next0;
+ int pi, pi_limit;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ pi_limit = clib_min (n_left_from, n_left_to_next);
+
+ for (pi = 0; pi < NSTAGES-1; pi++)
+ {
+ if(pi == pi_limit)
+ break;
+ stage0 (vm, node, from[pi]);
+ if (pi-1 >= 0)
+ stage1 (vm, node, from[pi-1]);
+ if (pi-2 >= 0)
+ stage2 (vm, node, from[pi-2]);
+ }
+
+ for (; pi < pi_limit; pi++)
+ {
+ stage0 (vm, node, from[pi]);
+ stage1 (vm, node, from[pi-1]);
+ stage2 (vm, node, from[pi-2]);
+ to_next[0] = from [pi - 3];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 3]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 3], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+
+
+ for (; pi < (pi_limit + (NSTAGES-1)); pi++)
+ {
+ if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
+ stage1 (vm, node, from[pi-1]);
+ if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
+ stage2 (vm, node, from[pi-2]);
+ if (((pi - 3) >= 0) && ((pi - 3) < pi_limit))
+ {
+ to_next[0] = from[pi - 3];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 3]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 3], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ from += pi_limit;
+ }
+ return frame->n_vectors;
+}
+#endif
+
+
+#if NSTAGES == 5
+static STAGE_INLINE uword
+dispatch_pipeline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index, next0;
+ int pi, pi_limit;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ pi_limit = clib_min (n_left_from, n_left_to_next);
+
+ for (pi = 0; pi < NSTAGES-1; pi++)
+ {
+ if(pi == pi_limit)
+ break;
+ stage0 (vm, node, from[pi]);
+ if (pi-1 >= 0)
+ stage1 (vm, node, from[pi-1]);
+ if (pi-2 >= 0)
+ stage2 (vm, node, from[pi-2]);
+ if (pi-3 >= 0)
+ stage3 (vm, node, from[pi-3]);
+ }
+
+ for (; pi < pi_limit; pi++)
+ {
+ stage0 (vm, node, from[pi]);
+ stage1 (vm, node, from[pi-1]);
+ stage2 (vm, node, from[pi-2]);
+ stage3 (vm, node, from[pi-3]);
+ to_next[0] = from [pi - 4];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 4]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 4], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+
+
+ for (; pi < (pi_limit + (NSTAGES-1)); pi++)
+ {
+ if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
+ stage1 (vm, node, from[pi-1]);
+ if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
+ stage2 (vm, node, from[pi - 2]);
+ if (((pi - 3) >= 0) && ((pi - 3) < pi_limit))
+ stage3 (vm, node, from[pi - 3]);
+ if (((pi - 4) >= 0) && ((pi - 4) < pi_limit))
+ {
+ to_next[0] = from[pi - 4];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 4]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 4], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ from += pi_limit;
+ }
+ return frame->n_vectors;
+}
+#endif
+
+#if NSTAGES == 6
+static STAGE_INLINE uword
+dispatch_pipeline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * from = vlib_frame_vector_args (frame);
+ u32 n_left_from, n_left_to_next, * to_next, next_index, next0;
+ int pi, pi_limit;
+
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ pi_limit = clib_min (n_left_from, n_left_to_next);
+
+ for (pi = 0; pi < NSTAGES-1; pi++)
+ {
+ if(pi == pi_limit)
+ break;
+ stage0 (vm, node, from[pi]);
+ if (pi-1 >= 0)
+ stage1 (vm, node, from[pi-1]);
+ if (pi-2 >= 0)
+ stage2 (vm, node, from[pi-2]);
+ if (pi-3 >= 0)
+ stage3 (vm, node, from[pi-3]);
+ if (pi-4 >= 0)
+ stage4 (vm, node, from[pi-4]);
+ }
+
+ for (; pi < pi_limit; pi++)
+ {
+ stage0 (vm, node, from[pi]);
+ stage1 (vm, node, from[pi-1]);
+ stage2 (vm, node, from[pi-2]);
+ stage3 (vm, node, from[pi-3]);
+ stage4 (vm, node, from[pi-4]);
+ to_next[0] = from [pi - 5];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 5]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 5], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+
+
+ for (; pi < (pi_limit + (NSTAGES-1)); pi++)
+ {
+ if (((pi - 1) >= 0) && ((pi - 1) < pi_limit))
+ stage1 (vm, node, from[pi-1]);
+ if (((pi - 2) >= 0) && ((pi - 2) < pi_limit))
+ stage2 (vm, node, from[pi - 2]);
+ if (((pi - 3) >= 0) && ((pi - 3) < pi_limit))
+ stage3 (vm, node, from[pi - 3]);
+ if (((pi - 4) >= 0) && ((pi - 4) < pi_limit))
+ stage4 (vm, node, from[pi - 4]);
+ if (((pi - 5) >= 0) && ((pi - 5) < pi_limit))
+ {
+ to_next[0] = from[pi - 5];
+ to_next++;
+ n_left_to_next--;
+ next0 = last_stage (vm, node, from [pi - 5]);
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ from[pi - 5], next0);
+ n_left_from--;
+ if ((int) n_left_to_next < 0 && n_left_from > 0)
+ vlib_get_next_frame (vm, node, next_index, to_next,
+ n_left_to_next);
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ from += pi_limit;
+ }
+ return frame->n_vectors;
+}
+#endif
diff --git a/vnet/vnet/plugin/p1.c b/vnet/vnet/plugin/p1.c
new file mode 100644
index 00000000000..6ede7938ee9
--- /dev/null
+++ b/vnet/vnet/plugin/p1.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * This file and in fact the entire directory shouldn't even exist.
+ * Vnet ought to be a dynamic library.
+ *
+ * Unfortunately, various things malfunction when we try to go there.
+ * Plugin DLLs end up with their own copies of critical
+ * data structures. None of these problems would be tough to fix,
+ * but there are quite a number of them.
+ */
+
+/*
+ * Make certain that plugin DLLs which reference the following functions
+ * can find them...
+ */
+
+#define foreach_plugin_reference \
+_(unformat_vnet_hw_interface) \
+_(unformat_vnet_sw_interface) \
+_(vnet_hw_interface_rx_redirect_to_node) \
+_(vnet_config_add_feature) \
+_(vnet_config_del_feature) \
+_(vnet_get_main) \
+_(_vlib_init_function_l2_init) \
+_(_vlib_init_function_pg_init) \
+_(_vlib_init_function_ip_main_init) \
+_(_vlib_init_function_ethernet_init) \
+_(_vlib_init_function_ethernet_arp_init) \
+_(l2input_intf_bitmap_enable) \
+_(ip4_main) \
+_(ip6_main) \
+_(format_ip4_address) \
+_(unformat_ip4_address) \
+_(ip4_address_compare) \
+_(ip6_address_compare) \
+_(format_ip6_address) \
+_(format_ip6_address_and_length) \
+_(udp_register_dst_port) \
+_(ethernet_register_input_type) \
+_(ethernet_set_flags) \
+_(vnet_register_sr_app_callback) \
+_(format_ip6_sr_header) \
+_(unformat_ip6_address) \
+_(find_ip6_fib_by_table_index_or_id) \
+_(format_ethernet_address) \
+_(unformat_ethernet_address) \
+_(unformat_ethernet_interface) \
+_(ethernet_register_l2_input) \
+_(ethernet_register_l3_redirect) \
+_(unformat_pg_payload) \
+_(format_ip4_address_and_length) \
+_(ip_incremental_checksum) \
+_(ethernet_sw_interface_set_l2_mode) \
+_(vnet_create_loopback_interface) \
+_(ethernet_set_rx_redirect) \
+_(ethernet_get_main) \
+_(ethernet_get_interface) \
+_(vnet_hw_interface_set_flags) \
+_(vnet_sw_interface_set_flags) \
+_(vnet_create_sw_interface) \
+_(vnet_delete_sw_interface) \
+_(pg_stream_add) \
+_(pg_stream_del) \
+_(pg_stream_enable_disable) \
+_(pg_main)
+
+#if DPDK > 0
+#define foreach_dpdk_plugin_reference \
+_(dpdk_set_next_node) \
+_(dpdk_worker_thread) \
+_(dpdk_io_thread) \
+_(dpdk_frame_queue_dequeue) \
+_(vlib_get_handoff_queue_elt) \
+_(dpdk_get_handoff_node_index) \
+_(dpdk_set_flowcontrol_callback) \
+_(dpdk_interface_tx_vector) \
+_(rte_calloc) \
+_(rte_free) \
+_(rte_malloc) \
+_(post_sw_interface_set_flags) \
+_(dpdk_get_admin_up_down_in_progress) \
+_(efd_config)
+#else
+#define foreach_dpdk_plugin_reference
+#endif
+
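+/* Declare each referenced symbol, then emit an array of their addresses,
+ so the linker cannot discard them from the vnet image. */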
+#define _(a) void a (void);
+foreach_plugin_reference
+foreach_dpdk_plugin_reference
+#undef _
+
+void *vnet_library_plugin_references[] =
+ {
+#define _(a) &a,
+ foreach_plugin_reference
+ foreach_dpdk_plugin_reference
+#undef _
+ };
+
+void vnet_library_plugin_reference(void) { }
diff --git a/vnet/vnet/plugin/plugin.h b/vnet/vnet/plugin/plugin.h
new file mode 100644
index 00000000000..a14a5932b50
--- /dev/null
+++ b/vnet/vnet/plugin/plugin.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_plugin_h
+#define included_vnet_plugin_h
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vppinfra/error.h>
+
+/* Pointers to Genuine Vnet data structures handed to plugin .dll's */
+typedef struct {
+ vnet_main_t * vnet_main;
+ ethernet_main_t * ethernet_main;
+} vnet_plugin_handoff_t;
+
+void * vnet_get_handoff_structure (void);
+
+#endif /* included_vnet_plugin_h */
diff --git a/vnet/vnet/policer/fix_types.h b/vnet/vnet/policer/fix_types.h
new file mode 100644
index 00000000000..cbb79e049fc
--- /dev/null
+++ b/vnet/vnet/policer/fix_types.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_fix_types_h__
+#define __included_fix_types_h__
+
+/* deal with various imported type choices */
+#define cerrno int
+#define trans_layer_rc int
+#define EOK 0
+#define CERR_IS_NOTOK(a) ((a) != EOK)
+#define PACKED
+#define OK_pushHW EOK
+#define Not_OK (-1)
+
+typedef unsigned char uint8_t;
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+#endif /* __included_fix_types_h__ */
diff --git a/vnet/vnet/policer/node_funcs.c b/vnet/vnet/policer/node_funcs.c
new file mode 100644
index 00000000000..3badcfdfd1f
--- /dev/null
+++ b/vnet/vnet/policer/node_funcs.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/policer/policer.h>
+
+/* Dispatch functions meant to be instantiated elsewhere */
+
+typedef struct {
+ u32 next_index;
+ u32 sw_if_index;
+ u32 policer_index;
+} vnet_policer_trace_t;
+
+/* packet trace format function */
+static u8 * format_policer_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ vnet_policer_trace_t * t = va_arg (*args, vnet_policer_trace_t *);
+
+ s = format (s, "VNET_POLICER: sw_if_index %d policer_index %d next %d",
+ t->sw_if_index, t->policer_index, t->next_index);
+ return s;
+}
+
+#define foreach_vnet_policer_error \
+_(TRANSMIT, "Packets Transmitted") \
+_(DROP, "Packets Dropped")
+
+typedef enum {
+#define _(sym,str) VNET_POLICER_ERROR_##sym,
+ foreach_vnet_policer_error
+#undef _
+ VNET_POLICER_N_ERROR,
+} vnet_policer_error_t;
+
+static char * vnet_policer_error_strings[] = {
+#define _(sym,string) string,
+ foreach_vnet_policer_error
+#undef _
+};
+
+static inline
+uword vnet_policer_inline (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame,
+ vnet_policer_index_t which)
+{
+ u32 n_left_from, * from, * to_next;
+ vnet_policer_next_t next_index;
+ vnet_policer_main_t * pm = &vnet_policer_main;
+ u64 time_in_policer_periods;
+ u32 transmitted = 0;
+
+ time_in_policer_periods =
+ clib_cpu_time_now() >> POLICER_TICKS_PER_PERIOD_SHIFT;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+ u32 pi0, pi1;
+ u32 len0, len1;
+ u32 col0, col1;
+ policer_read_response_type_st * pol0, * pol1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, LOAD);
+ vlib_prefetch_buffer_header (b3, LOAD);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = VNET_POLICER_NEXT_TRANSMIT;
+
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ next1 = VNET_POLICER_NEXT_TRANSMIT;
+
+
+ if (which == VNET_POLICER_INDEX_BY_SW_IF_INDEX)
+ {
+ pi0 = pm->policer_index_by_sw_if_index[sw_if_index0];
+ pi1 = pm->policer_index_by_sw_if_index[sw_if_index1];
+ }
+
+ if (which == VNET_POLICER_INDEX_BY_OPAQUE)
+ {
+ pi0 = vnet_buffer(b0)->policer.index;
+ pi1 = vnet_buffer(b1)->policer.index;
+ }
+
+ if (which == VNET_POLICER_INDEX_BY_EITHER)
+ {
+ pi0 = vnet_buffer(b0)->policer.index;
+ pi0 = (pi0 != ~0) ? pi0 :
+ pm->policer_index_by_sw_if_index [sw_if_index0];
+ pi1 = vnet_buffer(b1)->policer.index;
+ pi1 = (pi1 != ~0) ? pi1 :
+ pm->policer_index_by_sw_if_index [sw_if_index1];
+ }
+
+ len0 = vlib_buffer_length_in_chain (vm, b0);
+ pol0 = &pm->policers [pi0];
+ col0 = vnet_police_packet (pol0, len0,
+ POLICE_CONFORM /* no chaining */,
+ time_in_policer_periods);
+
+ len1 = vlib_buffer_length_in_chain (vm, b1);
+ pol1 = &pm->policers [pi1];
+ col1 = vnet_police_packet (pol1, len1,
+ POLICE_CONFORM /* no chaining */,
+ time_in_policer_periods);
+
+ if (PREDICT_FALSE(col0 > 0))
+ {
+ next0 = VNET_POLICER_NEXT_DROP;
+ b0->error = node->errors[VNET_POLICER_ERROR_DROP];
+ }
+ else
+ transmitted++;
+
+ if (PREDICT_FALSE(col1 > 0))
+ {
+ next1 = VNET_POLICER_NEXT_DROP;
+ b1->error = node->errors[VNET_POLICER_ERROR_DROP];
+ }
+ else
+ transmitted++;
+
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ vnet_policer_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->policer_index = pi0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ vnet_policer_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ t->policer_index = pi1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ u32 pi0;
+ u32 len0;
+ u32 col0;
+ policer_read_response_type_st * pol0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = VNET_POLICER_NEXT_TRANSMIT;
+
+ if (which == VNET_POLICER_INDEX_BY_SW_IF_INDEX)
+ pi0 = pm->policer_index_by_sw_if_index[sw_if_index0];
+
+ if (which == VNET_POLICER_INDEX_BY_OPAQUE)
+ pi0 = vnet_buffer(b0)->policer.index;
+
+ if (which == VNET_POLICER_INDEX_BY_EITHER)
+ {
+ pi0 = vnet_buffer(b0)->policer.index;
+ pi0 = (pi0 != ~0) ? pi0 :
+ pm->policer_index_by_sw_if_index [sw_if_index0];
+ }
+
+ len0 = vlib_buffer_length_in_chain (vm, b0);
+ pol0 = &pm->policers [pi0];
+ col0 = vnet_police_packet (pol0, len0,
+ POLICE_CONFORM /* no chaining */,
+ time_in_policer_periods);
+
+ if (PREDICT_FALSE(col0 > 0))
+ {
+ next0 = VNET_POLICER_NEXT_DROP;
+ b0->error = node->errors[VNET_POLICER_ERROR_DROP];
+ }
+ else
+ {
+ transmitted++;
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED)))
+ {
+ vnet_policer_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->policer_index = pi0;
+ }
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ vlib_node_increment_counter (vm, node->node_index,
+ VNET_POLICER_ERROR_TRANSMIT,
+ transmitted);
+ return frame->n_vectors;
+}
+
+uword vnet_policer_by_sw_if_index (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return vnet_policer_inline (vm, node, frame,
+ VNET_POLICER_INDEX_BY_SW_IF_INDEX);
+}
+
+uword vnet_policer_by_opaque (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return vnet_policer_inline (vm, node, frame,
+ VNET_POLICER_INDEX_BY_OPAQUE);
+}
+
+uword vnet_policer_by_either (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return vnet_policer_inline (vm, node, frame,
+ VNET_POLICER_INDEX_BY_EITHER);
+}
+
+void vnet_policer_node_funcs_reference (void) { }
+
+
+#define TEST_CODE 1
+
+#ifdef TEST_CODE
+
+VLIB_REGISTER_NODE (policer_by_sw_if_index_node, static) = {
+ .function = vnet_policer_by_sw_if_index,
+ .name = "policer-by-sw-if-index",
+ .vector_size = sizeof (u32),
+ .format_trace = format_policer_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(vnet_policer_error_strings),
+ .error_strings = vnet_policer_error_strings,
+
+ .n_next_nodes = VNET_POLICER_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [VNET_POLICER_NEXT_TRANSMIT] = "ethernet-input",
+ [VNET_POLICER_NEXT_DROP] = "error-drop",
+ },
+};
+
+
+int test_policer_add_del (u32 rx_sw_if_index, u8 *config_name,
+ int is_add)
+{
+ vnet_policer_main_t * pm = &vnet_policer_main;
+ policer_read_response_type_st * template;
+ policer_read_response_type_st * policer;
+ vnet_hw_interface_t * rxhi;
+ uword * p;
+
+ rxhi = vnet_get_sup_hw_interface (pm->vnet_main, rx_sw_if_index);
+
+ /* Make sure caller didn't pass a vlan subif, etc. */
+ if (rxhi->sw_if_index != rx_sw_if_index)
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+
+ if (is_add)
+ {
+
+ p = hash_get_mem (pm->policer_config_by_name, config_name);
+
+ if (p == 0)
+ return -2;
+
+ template = pool_elt_at_index (pm->policer_templates, p[0]);
+
+ vnet_hw_interface_rx_redirect_to_node
+ (pm->vnet_main,
+ rxhi->hw_if_index,
+ policer_by_sw_if_index_node.index);
+
+ pool_get_aligned (pm->policers, policer, CLIB_CACHE_LINE_BYTES);
+
+ policer[0] = template[0];
+
+ vec_validate (pm->policer_index_by_sw_if_index, rx_sw_if_index);
+ pm->policer_index_by_sw_if_index[rx_sw_if_index]
+ = policer - pm->policers;
+ }
+ else
+ {
+ u32 pi;
+ vnet_hw_interface_rx_redirect_to_node (pm->vnet_main,
+ rxhi->hw_if_index,
+ ~0 /* disable */);
+
+ pi = pm->policer_index_by_sw_if_index[rx_sw_if_index];
+ pm->policer_index_by_sw_if_index[rx_sw_if_index] = ~0;
+ pool_put_index (pm->policers, pi);
+ }
+
+ return 0;
+}
+
+static clib_error_t *
+test_policer_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_policer_main_t * pm = &vnet_policer_main;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ u32 rx_sw_if_index;
+ int rv;
+ u8 * config_name = 0;
+ int rx_set = 0;
+ int is_add = 1;
+ int is_show = 0;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "intfc %U", unformat_vnet_sw_interface,
+ pm->vnet_main, &rx_sw_if_index))
+ rx_set = 1;
+ else if (unformat (line_input, "show"))
+ is_show=1;
+ else if (unformat (line_input, "policer %s", &config_name))
+ ;
+ else if (unformat (line_input, "del"))
+ is_add = 0;
+ else break;
+ }
+
+ if (rx_set == 0)
+ return clib_error_return (0, "interface not set");
+
+ if (is_show)
+ {
+ u32 pi = pm->policer_index_by_sw_if_index[rx_sw_if_index];
+ policer_read_response_type_st * policer;
+ policer = pool_elt_at_index (pm->policers, pi);
+
+ vlib_cli_output (vm, "%U", format_policer_instance, policer);
+ return 0;
+ }
+
+ if (is_add && config_name == 0)
+ {
+ return clib_error_return (0, "policer config name required");
+ }
+
+ rv = test_policer_add_del (rx_sw_if_index, config_name, is_add);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ default:
+ return clib_error_return
+ (0, "WARNING: vnet_vnet_policer_add_del returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_policer_command, static) = {
+ .path = "test policer",
+ .short_help =
+ "test policer intfc <intfc> policer <policer-config-name> [del] [show]",
+ .function = test_policer_command_fn,
+};
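+
+/*
+ * Example session (interface and config names are illustrative):
+ *
+ * configure policer name p1 type 1r2c rate kbps round closest cir 10000 cb 100000
+ * test policer intfc GigabitEthernet2/0/0 policer p1
+ * test policer intfc GigabitEthernet2/0/0 show
+ * test policer intfc GigabitEthernet2/0/0 del
+ */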
+
+
+#endif /* TEST_CODE */
diff --git a/vnet/vnet/policer/police.h b/vnet/vnet/policer/police.h
new file mode 100644
index 00000000000..d2e369c655f
--- /dev/null
+++ b/vnet/vnet/policer/police.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __POLICE_H__
+#define __POLICE_H__
+
+typedef enum {
+ POLICE_CONFORM = 0,
+ POLICE_EXCEED = 1,
+ POLICE_VIOLATE = 2,
+} policer_result_e;
+
+// This is the hardware representation of the policer.
+// To be multithread-safe, the policer is accessed through a spin-lock
+// on the lock field. (For a policer update operation, 24B needs to be
+// modified and this would be a challenge to do with atomic instructions.)
+// The structure is padded so that no other data is put into the same
+// 64B cache-line. This reduces cache-thrashing between threads.
+//
+// A note on scale:
+// The HW TSC tick is roughly one CPU clock cycle.
+// This is shifted to create a larger period, with a goal of around 50 usec.
+// The period time will vary based on CPU clock speed.
+// CPU speeds of 1 GHz to 8 GHz are targeted.
+// The shift amount is a constant 17 bits, resulting in a period between
+// 16 usec (8 GHz CPU) and 131 usec (1 GHz CPU).
+// The token_per_period computation takes into account the clock speed.
+//
+// The 32-bit bucket/limit supports about 850ms of burst on a 40GE port,
+// or 340ms on a 100GE port. If a larger burst is configured, then the
+// programmed value is simply capped at 2^32-1. If we needed to support
+// more than that, the bucket and limit fields could be expanded.
+//
+// tokens_per_period should be > 1000 to support 0.1% granularity.
+// To support lower rates (which would not meet this requirement), the packet
+// length, bucket, and limit values can be scaled. The scale is a power of 2
+// so the multiplication can be implemented as a shift. The control plane
+// computes the shift amount to be the largest possible that still supports the
+// burst size. This makes the rate accuracy as high as possible.
+//
+// The 64-bit last_update_time supports a 4 GHz CPU without rollover for over 100 years.
+//
+// The lock field should be used for a spin-lock on the struct.
+
+#define POLICER_TICKS_PER_PERIOD_SHIFT 17
+#define POLICER_TICKS_PER_PERIOD (1 << POLICER_TICKS_PER_PERIOD_SHIFT)
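+
+// Worked example (illustrative, not normative): on a 2 GHz CPU one
+// period is 2^17 cycles = 65.536 usec. A 10 Gbps CIR is 1.25e9
+// bytes/sec, so tokens_per_period = 1.25e9 * 65.536e-6 ~= 81,920
+// tokens -- above the 1000-token granularity floor and below the 2^22
+// cap assumed by the overflow analysis in vnet_police_packet() below.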
+
+typedef struct {
+
+ uint32_t lock; // for exclusive access to the struct
+
+ uint32_t single_rate; // 1 = single rate policer, 0 = two rate policer
+ uint32_t color_aware; // for hierarchical policing
+ uint32_t scale; // power-of-2 shift amount for lower rates
+ uint32_t pad[2];
+
+ // Fields are marked as 2R if they are only used for a 2-rate policer,
+ // and MOD if they are modified as part of the update operation.
+ // 1 token = 1 byte.
+
+ uint32_t cir_tokens_per_period; // # of tokens for each period
+ uint32_t pir_tokens_per_period; // 2R
+
+ uint32_t current_limit;
+ uint32_t current_bucket; // MOD
+ uint32_t extended_limit;
+ uint32_t extended_bucket; // MOD
+
+ uint64_t last_update_time; // MOD
+ uint64_t pad64;
+
+} policer_read_response_type_st;
+
+static inline policer_result_e
+vnet_police_packet (policer_read_response_type_st *policer,
+ uint32_t packet_length,
+ policer_result_e packet_color,
+ uint64_t time)
+{
+ uint64_t n_periods;
+ uint64_t current_tokens, extended_tokens;
+ policer_result_e result;
+
+ // Scale packet length to support a wide range of speeds
+ packet_length = packet_length << policer->scale;
+
+ // Compute the number of policer periods that have passed since the last
+ // operation.
+ n_periods = time - policer->last_update_time;
+ policer->last_update_time = time;
+
+ // Since there is no background last-update-time adjustment, n_periods
+ // could grow large if the policer is idle for a long time. This could
+ // cause a 64-bit overflow when computing tokens_per_period * num_periods.
+ // It will overflow if log2(n_periods) + log2(tokens_per_period) > 64.
+ //
+ // To mitigate this, the policer configuration algorithm ensures that
+ // tokens_per_period is less than 2^22, i.e. it is a 22-bit value, not
+ // a 32-bit value. Overflow can thus only occur if log2(n_periods) >
+ // 64 - 22 = 42. 2^42 minimum-sized periods is 16 usec * 2^42, or about
+ // two years, so this can rarely occur. If overflow does happen, the
+ // only effect will be that
+ // fewer tokens than the max burst will be added to the bucket for this
+ // packet. This constraint on tokens_per_period lets the ucode omit
+ // code to dynamically check for or prevent the overflow.
+
+ if (policer->single_rate) {
+
+ // Compute number of tokens for this time period
+ current_tokens = policer->current_bucket + n_periods * policer->cir_tokens_per_period;
+ if (current_tokens > policer->current_limit) {
+ current_tokens = policer->current_limit;
+ }
+
+ extended_tokens = policer->extended_bucket + n_periods * policer->cir_tokens_per_period;
+ if (extended_tokens > policer->extended_limit) {
+ extended_tokens = policer->extended_limit;
+ }
+
+ // Determine color
+
+ if ((!policer->color_aware || (packet_color == POLICE_CONFORM)) && (current_tokens >= packet_length)) {
+ policer->current_bucket = current_tokens - packet_length;
+ policer->extended_bucket = extended_tokens - packet_length;
+ result = POLICE_CONFORM;
+ } else if ((!policer->color_aware || (packet_color != POLICE_VIOLATE)) && (extended_tokens >= packet_length)) {
+ policer->current_bucket = current_tokens;
+ policer->extended_bucket = extended_tokens - packet_length;
+ result = POLICE_EXCEED;
+ } else {
+ policer->current_bucket = current_tokens;
+ policer->extended_bucket = extended_tokens;
+ result = POLICE_VIOLATE;
+ }
+
+ } else {
+ // Two-rate policer
+
+ // Compute number of tokens for this time period
+ current_tokens = policer->current_bucket + n_periods * policer->cir_tokens_per_period;
+ extended_tokens = policer->extended_bucket + n_periods * policer->pir_tokens_per_period;
+ if (current_tokens > policer->current_limit) {
+ current_tokens = policer->current_limit;
+ }
+ if (extended_tokens > policer->extended_limit) {
+ extended_tokens = policer->extended_limit;
+ }
+
+ // Determine color
+
+ if ((policer->color_aware && (packet_color == POLICE_VIOLATE)) || (extended_tokens < packet_length)) {
+ policer->current_bucket = current_tokens;
+ policer->extended_bucket = extended_tokens;
+ result = POLICE_VIOLATE;
+ } else if ((policer->color_aware && (packet_color == POLICE_EXCEED)) || (current_tokens < packet_length)) {
+ policer->current_bucket = current_tokens;
+ policer->extended_bucket = extended_tokens - packet_length;
+ result = POLICE_EXCEED;
+ } else {
+ policer->current_bucket = current_tokens - packet_length;
+ policer->extended_bucket = extended_tokens - packet_length;
+ result = POLICE_CONFORM;
+ }
+ }
+ return result;
+}
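+
+// Typical call pattern, as in the policer graph node
+// (vnet/policer/node_funcs.c):
+//
+// u64 now = clib_cpu_time_now () >> POLICER_TICKS_PER_PERIOD_SHIFT;
+// policer_result_e col =
+// vnet_police_packet (pol, vlib_buffer_length_in_chain (vm, b0),
+// POLICE_CONFORM /* color-blind */, now);
+// if (col != POLICE_CONFORM)
+// { /* drop or re-mark the packet */ }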
+
+#endif // __POLICE_H__
diff --git a/vnet/vnet/policer/policer.c b/vnet/vnet/policer/policer.c
new file mode 100644
index 00000000000..310c5f59f20
--- /dev/null
+++ b/vnet/vnet/policer/policer.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/policer/policer.h>
+
+u8 * format_policer_instance (u8 * s, va_list * va)
+{
+ policer_read_response_type_st * i
+ = va_arg (*va, policer_read_response_type_st *);
+
+ s = format (s, "policer at %llx: %s rate, %s color-aware\n",
+ i, i->single_rate ? "single" : "dual",
+ i->color_aware ? "is" : "not");
+ s = format (s, "cir %u tok/period, pir %u tok/period, scale %u\n",
+ i->cir_tokens_per_period, i->pir_tokens_per_period,
+ i->scale);
+ s = format (s, "cur lim %u, cur bkt %u, ext lim %u, ext bkt %u\n",
+ i->current_limit,
+ i->current_bucket,
+ i->extended_limit,
+ i->extended_bucket);
+ s = format (s, "last update %llu\n", i->last_update_time);
+ return s;
+}
+
+static u8 * format_policer_round_type (u8 * s, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (c->rnd_type == SSE2_QOS_ROUND_TO_CLOSEST)
+ s = format(s, "closest");
+ else if (c->rnd_type == SSE2_QOS_ROUND_TO_UP)
+ s = format (s, "up");
+ else if (c->rnd_type == SSE2_QOS_ROUND_TO_DOWN)
+ s = format (s, "down");
+ else
+ s = format (s, "ILLEGAL");
+ return s;
+}
+
+
+static u8 * format_policer_rate_type (u8 * s, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (c->rate_type == SSE2_QOS_RATE_KBPS)
+ s = format (s, "kbps");
+ else if (c->rate_type == SSE2_QOS_RATE_PPS)
+ s = format(s, "pps");
+ else
+ s = format (s, "ILLEGAL");
+ return s;
+}
+
+static u8 * format_policer_type (u8 * s, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (c->rfc == SSE2_QOS_POLICER_TYPE_1R2C)
+ s = format (s, "1r2c");
+
+ else if (c->rfc == SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697)
+ s = format (s, "1r3c");
+
+ else if (c->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698)
+ s = format (s, "2r3c-2698");
+
+ else if (c->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115)
+ s = format (s, "2r3c-4115");
+
+ else if (c->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_MEF5CF1)
+ s = format (s, "2r3c-mef5cf1");
+ else
+ s = format (s, "ILLEGAL");
+ return s;
+}
+
+u8 * format_policer_config (u8 * s, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ s = format (s, "type %U cir %u eir %u cb %u eb %u\n",
+ format_policer_type, c,
+ c->rb.kbps.cir_kbps,
+ c->rb.kbps.eir_kbps,
+ c->rb.kbps.cb_bytes,
+ c->rb.kbps.eb_bytes);
+ s = format (s, "rate type %U, round type %U\n",
+ format_policer_rate_type, c,
+ format_policer_round_type, c);
+ return s;
+}
+
+static uword
+unformat_policer_type (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (!unformat (input, "type"))
+ return 0;
+
+ if (unformat (input, "1r2c"))
+ c->rfc = SSE2_QOS_POLICER_TYPE_1R2C;
+ else if (unformat (input, "1r3c"))
+ c->rfc = SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697;
+ else if (unformat (input, "2r3c-2698"))
+ c->rfc = SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698;
+ else if (unformat (input, "2r3c-4115"))
+ c->rfc = SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115;
+ else if (unformat (input, "2r3c-mef5cf1"))
+ c->rfc = SSE2_QOS_POLICER_TYPE_2R3C_RFC_MEF5CF1;
+ else
+ return 0;
+ return 1;
+}
+
+static uword
+unformat_policer_round_type (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (!unformat(input, "round"))
+ return 0;
+
+ if (unformat(input, "closest"))
+ c->rnd_type = SSE2_QOS_ROUND_TO_CLOSEST;
+ else if (unformat (input, "up"))
+ c->rnd_type = SSE2_QOS_ROUND_TO_UP;
+ else if (unformat (input, "down"))
+ c->rnd_type = SSE2_QOS_ROUND_TO_DOWN;
+ else
+ return 0;
+ return 1;
+}
+
+static uword
+unformat_policer_rate_type (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (!unformat(input, "rate"))
+ return 0;
+
+ if (unformat (input, "kbps"))
+ c->rate_type = SSE2_QOS_RATE_KBPS;
+ else if (unformat(input, "pps"))
+ c->rate_type = SSE2_QOS_RATE_PPS;
+ else
+ return 0;
+ return 1;
+}
+
+static uword
+unformat_policer_cir (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (unformat (input, "cir %u", &c->rb.kbps.cir_kbps))
+ return 1;
+ return 0;
+}
+
+static uword
+unformat_policer_eir (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (unformat (input, "eir %u", &c->rb.kbps.eir_kbps))
+ return 1;
+ return 0;
+}
+
+static uword
+unformat_policer_cb (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (unformat (input, "cb %u", &c->rb.kbps.cb_bytes))
+ return 1;
+ return 0;
+}
+
+static uword
+unformat_policer_eb (unformat_input_t * input, va_list * va)
+{
+ sse2_qos_pol_cfg_params_st * c
+ = va_arg (*va, sse2_qos_pol_cfg_params_st *);
+
+ if (unformat (input, "eb %u", &c->rb.kbps.eb_bytes))
+ return 1;
+ return 0;
+}
+
+
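+/* CLI keywords, parsed via the unformat_policer_<param> helpers above. */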
+#define foreach_config_param \
+_(eb) \
+_(cb) \
+_(eir) \
+_(cir) \
+_(rate_type) \
+_(round_type) \
+_(type)
+
+static clib_error_t *
+configure_policer_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_policer_main_t *pm = &vnet_policer_main;
+ sse2_qos_pol_cfg_params_st c;
+ policer_read_response_type_st test_policer;
+ unformat_input_t _line_input, * line_input = &_line_input;
+ int is_add = 1;
+ int rv;
+ u8 * name = 0;
+ uword * p;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ memset (&c, 0, sizeof (c));
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat(line_input, "name %s", &name))
+ ;
+
+#define _(a) else if (unformat (line_input, "%U", unformat_policer_##a, &c)) ;
+ foreach_config_param
+#undef _
+
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (is_add == 0)
+ {
+ p = hash_get_mem (pm->policer_config_by_name, name);
+ if (p == 0)
+ {
+ vec_free(name);
+ return clib_error_return (0, "No such policer configuration");
+ }
+ hash_unset_mem (pm->policer_config_by_name, name);
+ vec_free(name);
+ return 0;
+ }
+
+ /* Vet the configuration before adding it to the table */
+ rv = sse2_pol_logical_2_physical (&c, &test_policer);
+
+ if (rv == 0)
+ {
+ policer_read_response_type_st *pp;
+ sse2_qos_pol_cfg_params_st *cp;
+
+ pool_get (pm->configs, cp);
+ pool_get (pm->policer_templates, pp);
+
+ ASSERT (cp - pm->configs == pp - pm->policer_templates);
+
+ memcpy (cp, &c, sizeof (*cp));
+ memcpy (pp, &test_policer, sizeof (*pp));
+
+ hash_set_mem (pm->policer_config_by_name, name, cp - pm->configs);
+ }
+ else
+ {
+ vec_free (name);
+ return clib_error_return (0, "Config failed sanity check");
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (configure_policer_command, static) = {
+ .path = "configure policer",
+ .short_help = "configure policer name <name> <params> ",
+ .function = configure_policer_command_fn,
+};
+
+
+static clib_error_t *
+show_policer_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_policer_main_t *pm = &vnet_policer_main;
+ hash_pair_t * p;
+ u32 pool_index;
+ u8 * match_name = 0;
+ u8 * name;
+ sse2_qos_pol_cfg_params_st *config;
+ policer_read_response_type_st *templ;
+
+ (void) unformat (input, "name %s", &match_name);
+
+ hash_foreach_pair (p, pm->policer_config_by_name,
+ ({
+ name = (u8 *) p->key;
+ if (match_name == 0 || !strcmp((char *) name, (char *) match_name))
+ {
+ pool_index = p->value[0];
+ config = pool_elt_at_index (pm->configs, pool_index);
+ templ = pool_elt_at_index (pm->policer_templates, pool_index);
+ vlib_cli_output (vm, "Name \"%s\" %U ",
+ name, format_policer_config, config);
+ vlib_cli_output (vm, "Template %U",
+ format_policer_instance, templ);
+ vlib_cli_output (vm, "-----------");
+ }
+ }));
+ return 0;
+}
+
+
+VLIB_CLI_COMMAND (show_policer_command, static) = {
+ .path = "show policer",
+ .short_help = "show policer [name]",
+ .function = show_policer_command_fn,
+};
+
+clib_error_t *policer_init (vlib_main_t * vm)
+{
+ vnet_policer_main_t * pm = &vnet_policer_main;
+ void vnet_policer_node_funcs_reference(void);
+
+ vnet_policer_node_funcs_reference();
+
+ pm->vlib_main = vm;
+ pm->vnet_main = vnet_get_main();
+
+ pm->policer_config_by_name = hash_create_string (0, sizeof (uword));
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(policer_init);
+
+
diff --git a/vnet/vnet/policer/policer.h b/vnet/vnet/policer/policer.h
new file mode 100644
index 00000000000..f91521e2268
--- /dev/null
+++ b/vnet/vnet/policer/policer.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_policer_h__
+#define __included_policer_h__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+#include <vnet/policer/xlate.h>
+#include <vnet/policer/police.h>
+
+typedef struct {
+ /* policer pool, aligned */
+ policer_read_response_type_st * policers;
+
+ /* config + template h/w policer instance parallel pools */
+ sse2_qos_pol_cfg_params_st * configs;
+ policer_read_response_type_st * policer_templates;
+
+ /* Config by name hash */
+ uword * policer_config_by_name;
+
+ /* Policer by sw_if_index vector */
+ u32 * policer_index_by_sw_if_index;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} vnet_policer_main_t;
+
+vnet_policer_main_t vnet_policer_main;
+
+typedef enum {
+ VNET_POLICER_INDEX_BY_SW_IF_INDEX,
+ VNET_POLICER_INDEX_BY_OPAQUE,
+ VNET_POLICER_INDEX_BY_EITHER,
+} vnet_policer_index_t;
+
+typedef
+enum {
+ VNET_POLICER_NEXT_TRANSMIT,
+ VNET_POLICER_NEXT_DROP,
+ VNET_POLICER_N_NEXT,
+} vnet_policer_next_t;
+
+u8 * format_policer_instance (u8 * s, va_list * va);
+
+#endif /* __included_policer_h__ */
diff --git a/vnet/vnet/policer/xlate.c b/vnet/vnet/policer/xlate.c
new file mode 100644
index 00000000000..761e8214413
--- /dev/null
+++ b/vnet/vnet/policer/xlate.c
@@ -0,0 +1,1380 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <string.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+#include <vnet/policer/xlate.h>
+#include <vnet/policer/police.h>
+
+#define INTERNAL_SS 1
+
+/* debugs */
+#define SSE2_QOS_DEBUG_ERROR(msg, args...) \
+ fformat(stderr, msg "\n", ##args);
+
+#define SSE2_QOS_DEBUG_INFO(msg, args...) \
+ fformat(stderr, msg "\n", ##args);
+
+
+#define SSE2_QOS_TR_ERR(TpParms...)
+// {
+// }
+
+#define SSE2_QOS_TR_INFO(TpParms...)
+
+#ifndef MIN
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#endif
+
+#ifndef MAX
+#define MAX(x,y) (((x)>(y))?(x):(y))
+#endif
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_M40AH_OFFSET 0
+#define IPE_POLICER_FULL_WRITE_REQUEST_M40AH_MASK 8
+#define IPE_POLICER_FULL_WRITE_REQUEST_M40AH_SHIFT 24
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_TYPE_OFFSET 2
+#define IPE_POLICER_FULL_WRITE_REQUEST_TYPE_MASK 2
+#define IPE_POLICER_FULL_WRITE_REQUEST_TYPE_SHIFT 10
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_CMD_OFFSET 3
+#define IPE_POLICER_FULL_WRITE_REQUEST_CMD_MASK 2
+#define IPE_POLICER_FULL_WRITE_REQUEST_CMD_SHIFT 0
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_M40AL_OFFSET 4
+#define IPE_POLICER_FULL_WRITE_REQUEST_M40AL_MASK 32
+#define IPE_POLICER_FULL_WRITE_REQUEST_M40AL_SHIFT 0
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_RFC_OFFSET 8
+#define IPE_POLICER_FULL_WRITE_REQUEST_RFC_MASK 2
+#define IPE_POLICER_FULL_WRITE_REQUEST_RFC_SHIFT 30
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_AN_OFFSET 8
+#define IPE_POLICER_FULL_WRITE_REQUEST_AN_MASK 1
+#define IPE_POLICER_FULL_WRITE_REQUEST_AN_SHIFT 29
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_REXP_OFFSET 8
+#define IPE_POLICER_FULL_WRITE_REQUEST_REXP_MASK 4
+#define IPE_POLICER_FULL_WRITE_REQUEST_REXP_SHIFT 22
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_ARM_OFFSET 9
+#define IPE_POLICER_FULL_WRITE_REQUEST_ARM_MASK 11
+#define IPE_POLICER_FULL_WRITE_REQUEST_ARM_SHIFT 11
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_PRM_OFFSET 10
+#define IPE_POLICER_FULL_WRITE_REQUEST_PRM_MASK 11
+#define IPE_POLICER_FULL_WRITE_REQUEST_PRM_SHIFT 0
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_CBLE_OFFSET 12
+#define IPE_POLICER_FULL_WRITE_REQUEST_CBLE_MASK 5
+#define IPE_POLICER_FULL_WRITE_REQUEST_CBLE_SHIFT 27
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_CBLM_OFFSET 12
+#define IPE_POLICER_FULL_WRITE_REQUEST_CBLM_MASK 7
+#define IPE_POLICER_FULL_WRITE_REQUEST_CBLM_SHIFT 20
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_EBLE_OFFSET 13
+#define IPE_POLICER_FULL_WRITE_REQUEST_EBLE_MASK 5
+#define IPE_POLICER_FULL_WRITE_REQUEST_EBLE_SHIFT 15
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_EBLM_OFFSET 14
+#define IPE_POLICER_FULL_WRITE_REQUEST_EBLM_MASK 7
+#define IPE_POLICER_FULL_WRITE_REQUEST_EBLM_SHIFT 8
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_CB_OFFSET 16
+#define IPE_POLICER_FULL_WRITE_REQUEST_CB_MASK 31
+#define IPE_POLICER_FULL_WRITE_REQUEST_CB_SHIFT 0
+
+#define IPE_POLICER_FULL_WRITE_REQUEST_EB_OFFSET 20
+#define IPE_POLICER_FULL_WRITE_REQUEST_EB_MASK 31
+#define IPE_POLICER_FULL_WRITE_REQUEST_EB_SHIFT 0
+
+#define IPE_RFC_RFC2697 0x00000000
+#define IPE_RFC_RFC2698 0x00000001
+#define IPE_RFC_RFC4115 0x00000002
+#define IPE_RFC_MEF5CF1 0x00000003
+
+/* End of constants copied from sse_ipe_desc_fmt.h */
+
+/* Misc Policer specific definitions */
+#define SSE2_QOS_POLICER_FIXED_PKT_SIZE 256
+
+// TODO check what can be provided by hw macro based on ASIC
+#define SSE2_QOS_POL_TICKS_PER_SEC 1000LL /* 1 tick = 1 ms */
+
+/*
+ * Default burst, in ms (byte format)
+ */
+#define SSE2_QOS_POL_DEF_BURST_BYTE 100
+
+/*
+ * Minimum burst needs to be such that the largest packet size is accommodated
+ */
+// Do we need to get it from some lib?
+#define SSE2_QOS_POL_MIN_BURST_BYTE (9*1024)
+
+
+/*
+ * Flag to indicate if AN is employed or not
+ * 1 - TRUE, 0 - FALSE
+ */
+#define SSE2_QOS_POL_ALLOW_NEGATIVE 1
+
+// Various Macros to take care of policer calculations
+
+#define SSE2_QOS_POL_COMM_BKT_MAX \
+ (1<<IPE_POLICER_FULL_WRITE_REQUEST_CB_MASK)
+#define SSE2_QOS_POL_EXTD_BKT_MAX \
+ (1<<IPE_POLICER_FULL_WRITE_REQUEST_EB_MASK)
+
+#define SSE2_QOS_POL_RATE_EXP_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_REXP_MASK)
+#define SSE2_QOS_POL_RATE_EXP_MAX ((1<<SSE2_QOS_POL_RATE_EXP_SIZE) - 1)
+#define SSE2_QOS_POL_AVG_RATE_MANT_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_ARM_MASK)
+#define SSE2_QOS_POL_AVG_RATE_MANT_MAX \
+ ((1<< SSE2_QOS_POL_AVG_RATE_MANT_SIZE) - 1)
+#define SSE2_QOS_POL_AVG_RATE_MAX \
+ (SSE2_QOS_POL_AVG_RATE_MANT_MAX << \
+ SSE2_QOS_POL_RATE_EXP_MAX)
+
+#define SSE2_QOS_POL_PEAK_RATE_MANT_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_PRM_MASK)
+#define SSE2_QOS_POL_PEAK_RATE_MANT_MAX \
+ ((1<<SSE2_QOS_POL_PEAK_RATE_MANT_SIZE) - 1)
+#define SSE2_QOS_POL_PEAK_RATE_MAX \
+ (SSE2_QOS_POL_PEAK_RATE_MANT_MAX << \
+ SSE2_QOS_POL_RATE_EXP_MAX)
+
+#define SSE2_QOS_POL_COMM_BKT_LIMIT_MANT_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_CBLM_MASK)
+#define SSE2_QOS_POL_COMM_BKT_LIMIT_MANT_MAX \
+ ((1<<SSE2_QOS_POL_COMM_BKT_LIMIT_MANT_SIZE) - 1)
+#define SSE2_QOS_POL_COMM_BKT_LIMIT_EXP_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_CBLE_MASK)
+#define SSE2_QOS_POL_COMM_BKT_LIMIT_EXP_MAX \
+ ((1<<SSE2_QOS_POL_COMM_BKT_LIMIT_EXP_SIZE) - 1)
+#define SSE2_QOS_POL_COMM_BKT_LIMIT_MAX \
+ ((uint64_t)SSE2_QOS_POL_COMM_BKT_LIMIT_MANT_MAX << \
+ (uint64_t)SSE2_QOS_POL_COMM_BKT_LIMIT_EXP_MAX)
+
+#define SSE2_QOS_POL_EXTD_BKT_LIMIT_MANT_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_EBLM_MASK)
+#define SSE2_QOS_POL_EXTD_BKT_LIMIT_MANT_MAX \
+ ((1<<SSE2_QOS_POL_EXTD_BKT_LIMIT_MANT_SIZE) - 1)
+#define SSE2_QOS_POL_EXTD_BKT_LIMIT_EXP_SIZE \
+ (IPE_POLICER_FULL_WRITE_REQUEST_EBLE_MASK)
+#define SSE2_QOS_POL_EXTD_BKT_LIMIT_EXP_MAX \
+ ((1<<SSE2_QOS_POL_EXTD_BKT_LIMIT_EXP_SIZE) - 1)
+#define SSE2_QOS_POL_EXT_BKT_LIMIT_MAX \
+ ((uint64_t)SSE2_QOS_POL_EXTD_BKT_LIMIT_MANT_MAX << \
+ (uint64_t)SSE2_QOS_POL_EXTD_BKT_LIMIT_EXP_MAX)
+
+/*
+ * Rates determine the units of the bucket
+ *    256.114688 Gbps < Rate                          8 byte units
+ *    128.057344 Gbps < Rate <= 256.114688 Gbps       4 byte units
+ *     64.028672 Gbps < Rate <= 128.057344 Gbps       2 byte units
+ *                      Rate <=  64.028672 Gbps       1 byte units
+ *
+ * The code uses bytes per tick as opposed to Gigabits per second.
+ */
+#define RATE256 (256114688000LL / 8LL / SSE2_QOS_POL_TICKS_PER_SEC)
+#define RATE128 (128057344000LL / 8LL / SSE2_QOS_POL_TICKS_PER_SEC)
+#define RATE64 ( 64028672000LL / 8LL / SSE2_QOS_POL_TICKS_PER_SEC)
+
+#define RATE_OVER256_UNIT 8LL
+#define RATE_128TO256_UNIT 4LL
+#define RATE_64TO128_UNIT 2LL
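+
+/*
+ * Worked example (values follow from the defines above): with 1 ms ticks,
+ * RATE256 = 256114688000 / 8 / 1000 = 32014336 bytes/tick,
+ * RATE128 = 16007168 bytes/tick, and RATE64 = 8003584 bytes/tick.
+ * A hw rate of, say, 10000000 bytes/tick falls in the 64-128 Gbps band,
+ * so its bucket fields are kept in RATE_64TO128_UNIT (2 byte) units.
+ */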
+
+static cerrno
+sse2_qos_pol_round (uint64_t numerator,
+ uint64_t denominator,
+ uint64_t *rounded_value,
+ sse2_qos_round_type_en round_type)
+{
+ cerrno rc = EOK;
+
+ if (denominator == 0) {
+ SSE2_QOS_DEBUG_ERROR("Illegal denominator");
+ SSE2_QOS_TR_ERR(SSE2_QOSRM_TP_ERR_59);
+ return(EINVAL);
+ }
+
+ switch (round_type) {
+ case SSE2_QOS_ROUND_TO_CLOSEST:
+ *rounded_value = ((numerator + (denominator >> 1)) / denominator);
+ break;
+
+ case SSE2_QOS_ROUND_TO_UP:
+ *rounded_value = (numerator / denominator);
+ if ((*rounded_value * denominator) < numerator) {
+ *rounded_value += 1;
+ }
+ break;
+
+ case SSE2_QOS_ROUND_TO_DOWN:
+ *rounded_value = (numerator / denominator);
+ break;
+
+ case SSE2_QOS_ROUND_INVALID:
+ default:
+ SSE2_QOS_DEBUG_ERROR("Illegal round type");
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_60, round_type);
+ rc = EINVAL;
+ break;
+ }
+ return(rc);
+}
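+
+/*
+ * Usage sketch (illustrative values only):
+ *
+ *   uint64_t r;
+ *   (void)sse2_qos_pol_round(1001, 8, &r, SSE2_QOS_ROUND_TO_CLOSEST);
+ *   // r = (1001 + 4) / 8 = 125
+ *   (void)sse2_qos_pol_round(1001, 8, &r, SSE2_QOS_ROUND_TO_UP);
+ *   // r = 126, since 125 * 8 < 1001
+ */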
+
+
+static cerrno
+sse2_pol_validate_cfg_params (sse2_qos_pol_cfg_params_st *cfg)
+{
+ uint64_t numer, denom, rnd_value;
+ uint32_t cir_hw, eir_hw;
+ cerrno rc = EOK;
+
+ if ((cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698) &&
+ (cfg->rb.kbps.eir_kbps < cfg->rb.kbps.cir_kbps)) {
+ SSE2_QOS_DEBUG_ERROR("CIR (%u kbps) is greater than PIR (%u kbps)",
+ cfg->rb.kbps.cir_kbps, cfg->rb.kbps.eir_kbps);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_39, cfg->rb.kbps.cir_kbps,
+ cfg->rb.kbps.eir_kbps);
+ return(EINVAL);
+ }
+
+ /*
+ * convert rates to bytes-per-tick
+ */
+ numer = (uint64_t)(cfg->rb.kbps.cir_kbps);
+ denom = (uint64_t)(8 * SSE2_QOS_POL_TICKS_PER_SEC) / 1000;
+ rc = sse2_qos_pol_round(numer, denom, &rnd_value,
+ (sse2_qos_round_type_en) cfg->rnd_type);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Unable to convert CIR to bytes/tick format");
+ // Error traced
+ return(rc);
+ }
+ cir_hw = (uint32_t)rnd_value;
+
+ numer = (uint64_t)(cfg->rb.kbps.eir_kbps);
+ rc = sse2_qos_pol_round(numer, denom, &rnd_value,
+ (sse2_qos_round_type_en) cfg->rnd_type);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Unable to convert EIR to bytes/tick format");
+ // Error traced
+ return(rc);
+ }
+ eir_hw = (uint32_t)rnd_value;
+
+ if (cir_hw > SSE2_QOS_POL_AVG_RATE_MAX) {
+ SSE2_QOS_DEBUG_ERROR("hw cir (%u bytes/tick) is greater than the "
+ "max supported value (%u)", cir_hw,
+ SSE2_QOS_POL_AVG_RATE_MAX);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_84, cir_hw,
+ SSE2_QOS_POL_AVG_RATE_MAX);
+ return(EINVAL);
+ }
+
+ if (eir_hw > SSE2_QOS_POL_PEAK_RATE_MAX) {
+ SSE2_QOS_DEBUG_ERROR("hw eir (%u bytes/tick) is greater than the "
+ "max supported value (%u). Capping it to the max. "
+ "supported value", eir_hw, SSE2_QOS_POL_PEAK_RATE_MAX);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_85,
+ eir_hw, SSE2_QOS_POL_PEAK_RATE_MAX);
+ return(EINVAL);
+ }
+ /*
+ * CIR = 0, with bc != 0 is not allowed
+ */
+ if ((cfg->rb.kbps.cir_kbps == 0) && cfg->rb.kbps.cb_bytes) {
+ SSE2_QOS_DEBUG_ERROR("CIR = 0 with bc != 0");
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_55);
+ return(EINVAL);
+ }
+
+ if ((cfg->rb.kbps.eir_kbps == 0) &&
+ (cfg->rfc > SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697)) {
+ SSE2_QOS_DEBUG_ERROR("EIR = 0 for a 2R3C policer (rfc: %u)", cfg->rfc);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_23, cfg->rb.kbps.eir_kbps, cfg->rfc);
+ return(EINVAL);
+ }
+
+ if (cfg->rb.kbps.eir_kbps &&
+ (cfg->rfc < SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698)) {
+ SSE2_QOS_DEBUG_ERROR("EIR: %u kbps for a 1-rate policer (rfc: %u)",
+ cfg->rb.kbps.eir_kbps, cfg->rfc);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_23, cfg->rb.kbps.eir_kbps, cfg->rfc);
+ return(EINVAL);
+ }
+
+ if ((cfg->rfc == SSE2_QOS_POLICER_TYPE_1R2C) && cfg->rb.kbps.eb_bytes) {
+ SSE2_QOS_DEBUG_ERROR("For a 1R1B policer, EB burst cannot be > 0");
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_56);
+ return(EINVAL);
+ }
+
+ return(EOK);
+}
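+
+/*
+ * Worked example of the rate conversion above: with
+ * SSE2_QOS_POL_TICKS_PER_SEC = 1000, denom = (8 * 1000) / 1000 = 8, so
+ * bytes/tick = kbps / 8. A CIR of 1000000 kbps (1 Gbps) thus rounds to
+ * 125000 bytes/tick.
+ */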
+
+static void
+sse2_qos_convert_value_to_exp_mant_fmt (uint64_t value,
+ uint16_t max_exp_value,
+ uint16_t max_mant_value,
+ sse2_qos_round_type_en type,
+ uint8_t *exp,
+ uint32_t *mant)
+{
+ uint64_t rnd_value;
+ uint64_t temp_mant;
+ uint8_t temp_exp;
+
+ /*
+ * Select the lowest possible exp, and the largest possible mant
+ */
+ temp_exp = 0;
+ temp_mant = value;
+ while (temp_exp <= max_exp_value) {
+ if (temp_mant <= max_mant_value) {
+ break;
+ }
+
+ temp_exp++;
+ rnd_value = 0;
+ (void)sse2_qos_pol_round((uint64_t)value, (uint64_t)(1 << temp_exp),
+ &rnd_value, type);
+ temp_mant = rnd_value;
+ }
+
+ if (temp_exp > max_exp_value) {
+ /*
+ * CAP mant to its max value, and decrement exp
+ */
+ temp_exp--;
+ temp_mant = max_mant_value;
+ }
+
+ *exp = temp_exp;
+ *mant = (uint32_t)temp_mant;
+
+ SSE2_QOS_DEBUG_INFO("value: 0x%llx, mant: %u, exp: %u", value, *mant, *exp);
+ return;
+}
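+
+/*
+ * Worked example (illustrative limits): value = 10000,
+ * max_mant_value = 511, max_exp_value = 15, round-to-closest.
+ * exp = 5 is the first exponent for which round(10000 / 2^exp) = 313
+ * fits under 511, so the result is mant = 313, exp = 5,
+ * i.e. 313 << 5 = 10016, which approximates 10000.
+ */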
+
+static cerrno
+sse2_pol_convert_cfg_rates_to_hw (sse2_qos_pol_cfg_params_st *cfg,
+ sse2_qos_pol_hw_params_st *hw)
+{
+ cerrno rc = EOK;
+ uint32_t cir_hw, eir_hw, hi_mant, hi_rate, cir_rnded, eir_rnded, eir_kbps;
+ uint64_t numer, denom, rnd_value;
+ uint8_t exp;
+
+ /*
+ * convert rates to bytes-per-tick (tick is 1 ms).
+ * For rate conversion, the denominator is the same for CIR and EIR
+ */
+ denom = (uint64_t)((SSE2_QOS_POL_TICKS_PER_SEC * 8) / 1000);
+ numer = (uint64_t)(cfg->rb.kbps.cir_kbps);
+ rc = sse2_qos_pol_round(numer, denom, &rnd_value,
+ (sse2_qos_round_type_en) cfg->rnd_type);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Rounding error, rate: %d kbps, rounding_type: %d",
+ cfg->rb.kbps.cir_kbps, cfg->rnd_type);
+ // Error is traced
+ return(rc);
+ }
+ cir_hw = (uint32_t)rnd_value;
+
+ if (cfg->rb.kbps.cir_kbps && (cir_hw == 0)) {
+ /*
+ * After rounding, cir_hw = 0. Bump it up
+ */
+ cir_hw = 1;
+ }
+
+ if (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R2C) {
+ eir_kbps = 0;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697) {
+ eir_kbps = cfg->rb.kbps.cir_kbps;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115) {
+ eir_kbps = cfg->rb.kbps.eir_kbps - cfg->rb.kbps.cir_kbps;
+ } else {
+ eir_kbps = cfg->rb.kbps.eir_kbps;
+ }
+
+ numer = (uint64_t)eir_kbps;
+ rc = sse2_qos_pol_round(numer, denom, &rnd_value,
+ (sse2_qos_round_type_en) cfg->rnd_type);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Rounding error, rate: %d kbps, rounding_type: %d",
+ eir_kbps, cfg->rnd_type);
+ // Error is traced
+ return(rc);
+ }
+ eir_hw = (uint32_t)rnd_value;
+
+ if (eir_kbps && (eir_hw == 0)) {
+ /*
+ * After rounding, eir_hw = 0. Bump it up
+ */
+ eir_hw = 1;
+ }
+
+ SSE2_QOS_DEBUG_INFO("cir_hw: %u bytes/tick, eir_hw: %u bytes/tick", cir_hw,
+ eir_hw);
+
+ if (cir_hw > eir_hw) {
+ hi_rate = cir_hw;
+ } else {
+ hi_rate = eir_hw;
+ }
+
+ if ((cir_hw == 0) && (eir_hw == 0)) {
+ /*
+ * Both the rates are 0. Use exp = 15, and set the RFC to 4115. Also
+ * set AN = 0
+ */
+ exp = (uint8_t)SSE2_QOS_POL_RATE_EXP_MAX;
+ hi_mant = 0;
+ hw->rfc = IPE_RFC_RFC4115;
+ hw->allow_negative = 0;
+ } else {
+ sse2_qos_convert_value_to_exp_mant_fmt(hi_rate,
+ (uint16_t)SSE2_QOS_POL_RATE_EXP_MAX,
+ (uint16_t)SSE2_QOS_POL_AVG_RATE_MANT_MAX,
+ (sse2_qos_round_type_en) cfg->rnd_type,
+ &exp, &hi_mant);
+ }
+
+ denom = (1 << exp);
+ if (hi_rate == eir_hw) {
+ hw->peak_rate_man = (uint16_t)hi_mant;
+ rc = sse2_qos_pol_round((uint64_t)cir_hw, denom, &rnd_value,
+ (sse2_qos_round_type_en) cfg->rnd_type);
+ hw->avg_rate_man = (uint16_t)rnd_value;
+ } else {
+ hw->avg_rate_man = (uint16_t)hi_mant;
+ rc = sse2_qos_pol_round((uint64_t)eir_hw, denom, &rnd_value,
+ (sse2_qos_round_type_en) cfg->rnd_type);
+ hw->peak_rate_man = (uint16_t)rnd_value;
+ }
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Rounding error");
+ // Error is traced
+ return(rc);
+ }
+ hw->rate_exp = exp;
+
+ if ((hw->avg_rate_man == 0) && (cfg->rb.kbps.cir_kbps)) {
+ /*
+ * cir was reduced to 0 during rounding. Bump it up
+ */
+ hw->avg_rate_man = 1;
+ SSE2_QOS_DEBUG_INFO("CIR = 0 during rounding. Bump it up to %u "
+ "bytes/tick", (hw->avg_rate_man << hw->rate_exp));
+ }
+
+ if ((hw->peak_rate_man == 0) && eir_kbps) {
+ /*
+ * eir was reduced to 0 during rounding. Bump it up
+ */
+ hw->peak_rate_man = 1;
+ SSE2_QOS_DEBUG_INFO("EIR = 0 during rounding. Bump it up to %u "
+ "bytes/tick", (hw->peak_rate_man << hw->rate_exp));
+ }
+
+ cir_rnded = (hw->avg_rate_man << hw->rate_exp);
+ eir_rnded = (hw->peak_rate_man << hw->rate_exp);
+
+ SSE2_QOS_DEBUG_INFO("Configured(rounded) values, cir: %u "
+ "kbps (mant: %u, exp: %u, rate: %u bytes/tick)",
+ cfg->rb.kbps.cir_kbps, hw->avg_rate_man,
+ hw->rate_exp, cir_rnded);
+
+ SSE2_QOS_DEBUG_INFO("Configured(rounded) values, eir: %u "
+ "kbps (mant: %u, exp: %u, rate: %u bytes/tick)",
+ cfg->rb.kbps.eir_kbps, hw->peak_rate_man,
+ hw->rate_exp, eir_rnded);
+
+ return(rc);
+}
+
+/*****
+ * NAME
+ * sse2_pol_get_bkt_max
+ *
+ * PARAMETERS
+ * rate_hw - either the average rate or peak rate
+ * bkt_max - bit width in the current bucket or extended bucket
+ *
+ * RETURNS
+ * uint64_t - maximum token bytes for the current or extended bucket
+ *
+ * DESCRIPTION
+ * The current bucket or extended bucket fields are in units of either
+ * 1,2,4,8 bytes based on the average or peak rate respective to current
+ * or extended bucket.
+ *
+ * To get the actual maximum number of bytes that can be stored in the
+ * field, the value must be multiplied by the units of either 1,2,4,8
+ * bytes based on the rate.
+ *****/
+uint64_t
+sse2_pol_get_bkt_max (uint64_t rate_hw, uint64_t bkt_max)
+{
+ if (rate_hw <= RATE64) {
+ return (bkt_max - 1);
+ } else if (rate_hw <= RATE128) {
+ return ((bkt_max * RATE_64TO128_UNIT) - RATE_64TO128_UNIT);
+ } else if (rate_hw <= RATE256) {
+ return ((bkt_max * RATE_128TO256_UNIT) - RATE_128TO256_UNIT);
+ }
+ /* rate must be over 256 */
+ return ((bkt_max * RATE_OVER256_UNIT) - RATE_OVER256_UNIT);
+}
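+
+/*
+ * Example (rates per the RATE* defines above, field size hypothetical):
+ * for rate_hw = 10000000 bytes/tick (between RATE64 and RATE128) and a
+ * field with bkt_max = 4194304 entries, the capacity is
+ * 4194304 * 2 - 2 = 8388606 bytes.
+ */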
+
+/*****
+ * NAME
+ * sse2_pol_get_bkt_value
+ *
+ * PARAMETERS
+ * rate_hw - either the average rate or peak rate
+ * byte_value - bytes for this token bucket
+ *
+ * RETURNS
+ * uint64_t - unit value for the current or extended bucket field
+ *
+ * DESCRIPTION
+ * The current bucket or extended bucket fields are in units of either
+ * 1,2,4,8 bytes based on the average or peak rate respective to current
+ * or extended bucket.
+ *
+ * To get the units that can be stored in the field, the byte value must
+ * be divided by the units of either 1,2,4,8 bytes based on the rate.
+ *****/
+uint64_t
+sse2_pol_get_bkt_value (uint64_t rate_hw, uint64_t byte_value)
+{
+ if (rate_hw <= RATE64) {
+ return (byte_value);
+ } else if (rate_hw <= RATE128) {
+ return (byte_value / RATE_64TO128_UNIT);
+ } else if (rate_hw <= RATE256) {
+ return (byte_value / RATE_128TO256_UNIT);
+ }
+ /* rate must be over 256 */
+ return (byte_value / RATE_OVER256_UNIT);
+}
+
+static void
+sse2_pol_rnd_burst_byte_fmt (uint64_t cfg_burst,
+ uint16_t max_exp_value,
+ uint16_t max_mant_value,
+ uint32_t max_bkt_value,
+ uint32_t rate_hw,
+ uint8_t *exp,
+ uint32_t *mant,
+ uint32_t *bkt_value)
+{
+ uint64_t bkt_max = max_bkt_value;
+ uint64_t bkt_limit_max;
+ uint64_t rnd_burst;
+ uint64_t temp_bkt_value;
+
+ bkt_limit_max = ((uint64_t)max_mant_value << (uint64_t)max_exp_value);
+ bkt_max = sse2_pol_get_bkt_max(rate_hw, bkt_max);
+ bkt_max = MIN(bkt_max, bkt_limit_max);
+ if (!cfg_burst) {
+ /*
+ * If configured burst = 0, compute the burst to be 100ms at a given
+ * rate. Note that for rate_hw = 0, exp = mant = 0.
+ */
+ cfg_burst = (uint64_t)rate_hw * (uint64_t)SSE2_QOS_POL_DEF_BURST_BYTE;
+ }
+
+ if (cfg_burst > bkt_max) {
+ SSE2_QOS_DEBUG_ERROR("burst 0x%llx bytes is greater than the max. "
+ "supported value 0x%llx bytes. Capping it to the "
+ "max", cfg_burst, bkt_max);
+ SSE2_QOS_TR_INFO(SSE2_QOS_TP_INFO_38,
+ (uint)cfg_burst, (uint)bkt_max);
+ cfg_burst = bkt_max;
+ }
+
+ if (cfg_burst < SSE2_QOS_POL_MIN_BURST_BYTE) {
+ /*
+ * Bump the burst up to the min. supported value, so that the largest
+ * packet size is always accommodated
+ */
+ SSE2_QOS_DEBUG_INFO("burst 0x%llx bytes is less than the min "
+ "supported value %u bytes. Rounding it up to "
+ "the min", cfg_burst, SSE2_QOS_POL_MIN_BURST_BYTE);
+ SSE2_QOS_TR_INFO(SSE2_QOS_TP_INFO_39, (uint)cfg_burst,
+ SSE2_QOS_POL_MIN_BURST_BYTE);
+ cfg_burst = SSE2_QOS_POL_MIN_BURST_BYTE;
+ }
+
+ sse2_qos_convert_value_to_exp_mant_fmt(cfg_burst,
+ max_exp_value,
+ max_mant_value,
+ SSE2_QOS_ROUND_TO_DOWN,
+ exp,
+ mant);
+
+ /* Bucket value is based on rate. */
+ rnd_burst = ((uint64_t)(*mant) << (uint64_t)(*exp));
+ temp_bkt_value = sse2_pol_get_bkt_value(rate_hw, rnd_burst);
+ *bkt_value = (uint32_t)temp_bkt_value;
+}
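+
+/*
+ * Worked example of the default-burst path above: for rate_hw = 125000
+ * bytes/tick (1 Gbps) and cfg_burst = 0, the computed burst is
+ * 125000 * 100 = 12500000 bytes, i.e. 100 ms at line rate, which is then
+ * rounded down into exp/mant form.
+ */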
+
+static cerrno
+sse2_pol_convert_cfg_burst_to_hw (sse2_qos_pol_cfg_params_st *cfg,
+ sse2_qos_pol_hw_params_st *hw)
+{
+ uint8_t temp_exp;
+ uint32_t temp_mant, rate_hw;
+ uint64_t eb_bytes;
+ uint32_t bkt_value;
+
+ /*
+ * compute Committed Burst
+ */
+ SSE2_QOS_DEBUG_INFO("Compute commit burst ...");
+ rate_hw = (hw->avg_rate_man) << (hw->rate_exp);
+ sse2_pol_rnd_burst_byte_fmt(cfg->rb.kbps.cb_bytes,
+ (uint16_t)SSE2_QOS_POL_COMM_BKT_LIMIT_EXP_MAX,
+ (uint16_t)SSE2_QOS_POL_COMM_BKT_LIMIT_MANT_MAX,
+ (uint32_t)SSE2_QOS_POL_COMM_BKT_MAX,
+ rate_hw, &temp_exp, &temp_mant, &bkt_value);
+ SSE2_QOS_DEBUG_INFO("Committed burst, burst_limit: 0x%llx mant : %u, "
+ "exp: %u, rnded: 0x%llx cb:%u bytes",
+ cfg->rb.kbps.cb_bytes, temp_mant, temp_exp,
+ ((uint64_t)temp_mant << (uint64_t)temp_exp), bkt_value);
+
+ hw->comm_bkt_limit_exp = temp_exp;
+ hw->comm_bkt_limit_man = (uint8_t)temp_mant;
+ hw->comm_bkt = bkt_value;
+
+ /*
+ * compute Exceed Burst
+ */
+ SSE2_QOS_DEBUG_INFO("Compute exceed burst ...");
+
+ if (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R2C) {
+ /*
+ * For 1R2C, hw uses 2R3C (RFC-4115). As such, the Exceed Bucket
+ * params are set to 0. Recommendation is to use EB_exp = max_exp (=15)
+ * and EB_mant = 0
+ */
+ hw->extd_bkt_limit_exp = (uint8_t)SSE2_QOS_POL_EXTD_BKT_LIMIT_EXP_MAX;
+ hw->extd_bkt_limit_man = 0;
+ SSE2_QOS_DEBUG_INFO("Excess burst, burst: 0x%llx mant: %u, "
+ "exp: %u, rnded: 0x%llx bytes",
+ cfg->rb.kbps.eb_bytes, hw->extd_bkt_limit_man,
+ hw->extd_bkt_limit_exp,
+ ((uint64_t)hw->extd_bkt_limit_man <<
+ (uint64_t)hw->extd_bkt_limit_exp));
+ SSE2_QOS_TR_INFO(SSE2_QOS_TP_INFO_20, (uint)cfg->rb.kbps.eb_bytes,
+ hw->extd_bkt_limit_man, hw->extd_bkt_limit_exp);
+ return(EOK);
+ }
+
+ if (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697) {
+ eb_bytes = cfg->rb.kbps.cb_bytes + cfg->rb.kbps.eb_bytes;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115) {
+ eb_bytes = cfg->rb.kbps.eb_bytes - cfg->rb.kbps.cb_bytes;
+ } else {
+ eb_bytes = cfg->rb.kbps.eb_bytes;
+ }
+
+ rate_hw = (hw->peak_rate_man) << (hw->rate_exp);
+ sse2_pol_rnd_burst_byte_fmt(eb_bytes,
+ (uint16_t)SSE2_QOS_POL_EXTD_BKT_LIMIT_EXP_MAX,
+ (uint16_t)SSE2_QOS_POL_EXTD_BKT_LIMIT_MANT_MAX,
+ (uint32_t)SSE2_QOS_POL_EXTD_BKT_MAX,
+ rate_hw, &temp_exp, &temp_mant, &bkt_value);
+
+ SSE2_QOS_DEBUG_INFO("Excess burst, burst_limit: 0x%llx mant: %u, "
+ "exp: %u, rnded: 0x%llx eb:%u bytes",
+ cfg->rb.kbps.eb_bytes, temp_mant, temp_exp,
+ ((uint64_t)temp_mant << (uint64_t)temp_exp), bkt_value);
+
+ hw->extd_bkt_limit_exp = (uint8_t)temp_exp;
+ hw->extd_bkt_limit_man = (uint8_t)temp_mant;
+ hw->extd_bkt = bkt_value;
+
+ return(EOK);
+}
+
+
+/*
+ * Input: configured parameter values in 'cfg'.
+ * Output: h/w programmable parameter values in 'hw'.
+ * Return: success or failure code.
+ */
+static cerrno
+sse2_pol_convert_cfg_to_hw_params (sse2_qos_pol_cfg_params_st *cfg,
+ sse2_qos_pol_hw_params_st *hw)
+{
+ cerrno rc = EOK;
+
+ /*
+ * clear the hw_params
+ */
+ memset(hw, 0, sizeof(sse2_qos_pol_hw_params_st));
+
+ hw->allow_negative = SSE2_QOS_POL_ALLOW_NEGATIVE;
+
+ if ((cfg->rfc == SSE2_QOS_POLICER_TYPE_1R2C) ||
+ (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115)) {
+ hw->rfc = IPE_RFC_RFC4115;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697) {
+ hw->rfc = IPE_RFC_RFC2697;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698) {
+ hw->rfc = IPE_RFC_RFC2698;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_MEF5CF1) {
+ hw->rfc = IPE_RFC_MEF5CF1;
+ } else {
+ SSE2_QOS_DEBUG_ERROR("Invalid RFC type %d\n", cfg->rfc);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_61, cfg->rfc);
+ return(EINVAL);
+ }
+
+ rc = sse2_pol_convert_cfg_rates_to_hw(cfg, hw);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Unable to convert config rates to hw. Error: %d",
+ rc);
+ // Error is traced
+ return(rc);
+ }
+
+ rc = sse2_pol_convert_cfg_burst_to_hw(cfg, hw);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Unable to convert config burst to hw. Error: %d",
+ rc);
+ // Error is traced
+ return(rc);
+ }
+
+ return EOK;
+}
+
+
+uint32_t
+sse2_qos_convert_pps_to_kbps (uint32_t rate_pps)
+{
+ // sse2_qos_ship_inc_counter(SSE2_QOS_SHIP_COUNTER_TYPE_API_CNT,
+ // SSE2_QOS_SHIP_CNT_POL_CONV_PPS_TO_KBPS);
+
+ uint64_t numer, rnd_value = 0;
+
+ numer = (uint64_t)((uint64_t)rate_pps *
+ (uint64_t)SSE2_QOS_POLICER_FIXED_PKT_SIZE * 8LL);
+ (void)sse2_qos_pol_round(numer, 1000LL, &rnd_value,
+ SSE2_QOS_ROUND_TO_CLOSEST);
+
+ return ((uint32_t)rnd_value);
+}
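+
+/*
+ * Example: with the fixed 256-byte packet size assumed above,
+ * 1000 pps -> (1000 * 256 * 8) / 1000 = 2048 kbps.
+ */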
+
+uint32_t
+sse2_qos_convert_burst_ms_to_bytes (uint32_t burst_ms,
+ uint32_t rate_kbps)
+{
+ uint64_t numer, rnd_value = 0;
+
+ //sse2_qos_ship_inc_counter(SSE2_QOS_SHIP_COUNTER_TYPE_API_CNT,
+ // SSE2_QOS_SHIP_CNT_POL_CONV_BURST_MS_TO_BYTES);
+
+ numer = (uint64_t)((uint64_t)burst_ms * (uint64_t)rate_kbps);
+
+ (void)sse2_qos_pol_round(numer, 8LL, &rnd_value,
+ SSE2_QOS_ROUND_TO_CLOSEST);
+
+ return ((uint32_t)rnd_value);
+}
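+
+/*
+ * Example: a 100 ms burst at 2048 kbps -> (100 * 2048) / 8 = 25600 bytes.
+ */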
+
+
+/*
+ * Input: configured parameters in 'cfg'.
+ * Output: h/w parameters are returned in 'hw',
+ * Return: Status, success or failure code.
+ */
+trans_layer_rc
+sse2_pol_compute_hw_params (sse2_qos_pol_cfg_params_st *cfg,
+ sse2_qos_pol_hw_params_st *hw)
+{
+ cerrno rc = EOK;
+
+ if (!cfg || !hw) {
+ SSE2_QOS_DEBUG_ERROR("Illegal parameters");
+ return(Not_OK);
+ }
+
+ /*
+ * Validate the police config params being presented to RM
+ */
+ rc = sse2_pol_validate_cfg_params(cfg);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Config parameter validation failed. Error: %d",
+ rc);
+ // Error is traced
+ return(Not_OK);
+ }
+
+ /*
+ * first round configured values to h/w supported values. This func
+ * also determines whether 'tick' or 'byte' format
+ */
+ rc = sse2_pol_convert_cfg_to_hw_params(cfg, hw);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Unable to convert config params to hw params. "
+ "Error: %d", rc);
+ SSE2_QOS_TR_ERR(SSE2_QOS_TP_ERR_53, rc);
+ return(Not_OK);
+ }
+
+ return OK_pushHW;
+}
+
+
+#if defined (INTERNAL_SS) || defined (X86)
+
+// For initializing the x86 policer format
+
+/*
+ * Return the number of hardware TSC timer ticks per second for the dataplane.
+ * This is approximately, but not exactly, the clock speed.
+ */
+static uint64_t get_tsc_hz(void)
+{
+ f64 cpu_freq;
+
+ cpu_freq = os_cpu_clock_frequency();
+ return (uint64_t) cpu_freq;
+}
+
+/*
+ * Convert rates into bytes_per_period and scale.
+ * Return 0 if ok or 1 if error.
+ */
+static int
+compute_policer_params (uint64_t hz, // CPU speed in clocks per second
+ uint64_t cir_rate, // in bytes per second
+ uint64_t pir_rate, // in bytes per second
+ uint32_t *current_limit, // in bytes, output may scale the input
+ uint32_t *extended_limit, // in bytes, output may scale the input
+ uint32_t *cir_bytes_per_period,
+ uint32_t *pir_bytes_per_period,
+ uint32_t *scale)
+{
+ double period;
+ double internal_cir_bytes_per_period;
+ double internal_pir_bytes_per_period;
+ uint32_t max;
+ uint32_t scale_shift;
+ uint32_t scale_amount;
+ uint32_t orig_current_limit = *current_limit;
+
+ // Compute period. For 1 GHz to 8 GHz CPUs, the period will be in
+ // the range of 16 to 116 usec.
+ period = ((double) hz) / ((double) POLICER_TICKS_PER_PERIOD);
+
+ // Determine bytes per period for each rate
+ internal_cir_bytes_per_period = (double)cir_rate / period;
+ internal_pir_bytes_per_period = (double)pir_rate / period;
+
+ // Scale if possible. Scaling helps rate accuracy, but is constrained
+ // by the scaled rates and limits fitting in 32-bits.
+ // In addition, we need to ensure the scaled rate is no larger than
+ // 2^22 tokens per period. This allows the dataplane to ignore overflow
+ // in the tokens-per-period multiplication since it could only
+ // happen if the policer were idle for more than a year.
+ // This is not really a constraint because 100 Gbps at 1 GHz is only
+ // 1.6M tokens per period.
+#define MAX_RATE_SHIFT 10
+ max = MAX(*current_limit, *extended_limit);
+ max = MAX(max, (uint32_t)internal_cir_bytes_per_period << MAX_RATE_SHIFT);
+ max = MAX(max, (uint32_t)internal_pir_bytes_per_period << MAX_RATE_SHIFT);
+ scale_shift = __builtin_clz(max);
+
+ scale_amount = 1 << scale_shift;
+ *scale = scale_shift;
+
+ // Scale the limits
+ *current_limit = *current_limit << scale_shift;
+ *extended_limit = *extended_limit << scale_shift;
+
+ // Scale the rates
+ internal_cir_bytes_per_period = internal_cir_bytes_per_period * ((double)scale_amount);
+ internal_pir_bytes_per_period = internal_pir_bytes_per_period * ((double)scale_amount);
+
+ // Make sure the new rates are reasonable
+ // Only needed for very low rates with large bursts
+ if (internal_cir_bytes_per_period < 1.0) {
+ internal_cir_bytes_per_period = 1.0;
+ }
+ if (internal_pir_bytes_per_period < 1.0) {
+ internal_pir_bytes_per_period = 1.0;
+ }
+
+ *cir_bytes_per_period = (uint32_t)internal_cir_bytes_per_period;
+ *pir_bytes_per_period = (uint32_t)internal_pir_bytes_per_period;
+
+// #define PRINT_X86_POLICE_PARAMS
+#ifdef PRINT_X86_POLICE_PARAMS
+ {
+ uint64_t effective_BPS;
+
+ // This value is actually slightly conservative because it doesn't take
+ // into account the partial period at the end of a second. This really
+ // matters only for very low rates.
+ effective_BPS = (((uint64_t) (*cir_bytes_per_period * (uint64_t)period)) >> *scale );
+
+ printf("hz=%llu, cir_rate=%llu, limit=%u => "
+ "periods-per-sec=%d usec-per-period=%d => "
+ "scale=%d cir_BPP=%u, scaled_limit=%u => "
+ "effective BPS=%llu, accuracy=%f\n",
+ // input values
+ (unsigned long long)hz,
+ (unsigned long long)cir_rate,
+ orig_current_limit,
+ // computed values
+ (uint32_t)(period), // periods per second
+ (uint32_t)(1000.0 * 1000.0 / period), // in usec
+ *scale,
+ *cir_bytes_per_period,
+ *current_limit,
+ // accuracy
+ (unsigned long long)effective_BPS,
+ (double)cir_rate / (double)effective_BPS);
+ }
+#else
+ orig_current_limit = orig_current_limit; // Make compiler happy
+#endif
+
+ return 0; // ok
+}
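+
+/*
+ * Scaling example (hypothetical inputs): if the largest of the limits and
+ * shifted rates is max = 0x10000, then __builtin_clz(max) = 15, so the
+ * limits and bytes-per-period values are all multiplied by 2^15; the
+ * dataplane compensates for this scaling via the returned *scale when it
+ * accounts packet bytes against the buckets.
+ */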
+
+
+/*
+ * Input: configured parameters in 'cfg'.
+ * Output: h/w parameters are returned in 'hw',
+ * Return: Status, success or failure code.
+ */
+trans_layer_rc
+x86_pol_compute_hw_params (sse2_qos_pol_cfg_params_st *cfg,
+ policer_read_response_type_st *hw)
+{
+ const int BYTES_PER_KBIT = (1000 / 8);
+ uint64_t hz;
+ uint32_t cap;
+
+ if (!cfg || !hw) {
+ SSE2_QOS_DEBUG_ERROR("Illegal parameters");
+ return(Not_OK);
+ }
+
+ hz = get_tsc_hz();
+ hw->last_update_time = 0;
+
+ // Cap the bursts to 32-bits. This allows up to almost one second of
+ // burst on a 40GE interface, which should be fine for x86.
+ cap = (cfg->rb.kbps.cb_bytes > 0xFFFFFFFF) ? 0xFFFFFFFF : cfg->rb.kbps.cb_bytes;
+ hw->current_limit = cap;
+ cap = (cfg->rb.kbps.eb_bytes > 0xFFFFFFFF) ? 0xFFFFFFFF : cfg->rb.kbps.eb_bytes;
+ hw->extended_limit = cap;
+
+ if ((cfg->rb.kbps.cir_kbps == 0) && (cfg->rb.kbps.cb_bytes == 0) && (cfg->rb.kbps.eb_bytes == 0)) {
+ // This is an uninitialized, always-violate policer
+ hw->single_rate = 1;
+ hw->cir_tokens_per_period = 0;
+ return OK_pushHW;
+ }
+
+ if ((cfg->rfc == SSE2_QOS_POLICER_TYPE_1R2C) ||
+ (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697)) {
+ // Single-rate policer
+
+ hw->single_rate = 1;
+
+ if ((cfg->rfc == SSE2_QOS_POLICER_TYPE_1R2C) && cfg->rb.kbps.eb_bytes) {
+ SSE2_QOS_DEBUG_ERROR("Policer parameter validation failed -- 1R2C.");
+ return(Not_OK);
+ }
+
+ if ((cfg->rb.kbps.cir_kbps == 0) ||
+ (cfg->rb.kbps.eir_kbps != 0) ||
+ ((cfg->rb.kbps.cb_bytes == 0) && (cfg->rb.kbps.eb_bytes == 0))) {
+ SSE2_QOS_DEBUG_ERROR("Policer parameter validation failed -- 1R.");
+ return(Not_OK);
+ }
+
+ if (compute_policer_params(hz,
+ (uint64_t)cfg->rb.kbps.cir_kbps * BYTES_PER_KBIT,
+ 0,
+ &hw->current_limit,
+ &hw->extended_limit,
+ &hw->cir_tokens_per_period,
+ &hw->pir_tokens_per_period,
+ &hw->scale)) {
+ SSE2_QOS_DEBUG_ERROR("Policer parameter computation failed.");
+ return(Not_OK);
+ }
+
+ } else if ((cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698) ||
+ (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115)) {
+ // Two-rate policer
+
+ if (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115) {
+ hw->color_aware = 1;
+ }
+
+ if ((cfg->rb.kbps.cir_kbps == 0) ||
+ (cfg->rb.kbps.eir_kbps == 0) ||
+ (cfg->rb.kbps.eir_kbps < cfg->rb.kbps.cir_kbps) ||
+ (cfg->rb.kbps.cb_bytes == 0) ||
+ (cfg->rb.kbps.eb_bytes == 0)) {
+ SSE2_QOS_DEBUG_ERROR("Config parameter validation failed.");
+ return(Not_OK);
+ }
+
+ if (compute_policer_params(hz,
+ (uint64_t)cfg->rb.kbps.cir_kbps * BYTES_PER_KBIT,
+ (uint64_t)cfg->rb.kbps.eir_kbps * BYTES_PER_KBIT,
+ &hw->current_limit,
+ &hw->extended_limit,
+ &hw->cir_tokens_per_period,
+ &hw->pir_tokens_per_period,
+ &hw->scale)) {
+ SSE2_QOS_DEBUG_ERROR("Policer parameter computation failed.");
+ return(Not_OK);
+ }
+
+ } else {
+ SSE2_QOS_DEBUG_ERROR("Config parameter validation failed. RFC not supported");
+ return(Not_OK);
+ }
+
+ hw->current_bucket = hw->current_limit;
+ hw->extended_bucket = hw->extended_limit;
+
+ return OK_pushHW;
+}
+#endif
+
+
+/*
+ * Input: configured parameters in 'cfg'.
+ * Output: physical structure is returned in 'phys',
+ * Return: Status, success or failure code.
+ */
+trans_layer_rc
+sse2_pol_logical_2_physical (sse2_qos_pol_cfg_params_st *cfg,
+ policer_read_response_type_st *phys)
+{
+ trans_layer_rc rc;
+ sse2_qos_pol_hw_params_st pol_hw;
+ sse2_qos_pol_cfg_params_st kbps_cfg;
+
+ memset(phys, 0, sizeof(policer_read_response_type_st));
+ memset(&kbps_cfg, 0, sizeof(sse2_qos_pol_cfg_params_st));
+
+ if (!cfg) {
+ SSE2_QOS_DEBUG_ERROR("Illegal parameters");
+ return(Not_OK);
+ }
+
+ switch (cfg->rate_type) {
+ case SSE2_QOS_RATE_KBPS:
+ /* copy all the data into kbps_cfg */
+ kbps_cfg.rb.kbps.cir_kbps = cfg->rb.kbps.cir_kbps;
+ kbps_cfg.rb.kbps.eir_kbps = cfg->rb.kbps.eir_kbps;
+ kbps_cfg.rb.kbps.cb_bytes = cfg->rb.kbps.cb_bytes;
+ kbps_cfg.rb.kbps.eb_bytes = cfg->rb.kbps.eb_bytes;
+ break;
+ case SSE2_QOS_RATE_PPS:
+ kbps_cfg.rb.kbps.cir_kbps =
+ sse2_qos_convert_pps_to_kbps(cfg->rb.pps.cir_pps);
+ kbps_cfg.rb.kbps.eir_kbps =
+ sse2_qos_convert_pps_to_kbps(cfg->rb.pps.eir_pps);
+ kbps_cfg.rb.kbps.cb_bytes = sse2_qos_convert_burst_ms_to_bytes(
+ (uint32_t) cfg->rb.pps.cb_ms, kbps_cfg.rb.kbps.cir_kbps);
+ kbps_cfg.rb.kbps.eb_bytes = sse2_qos_convert_burst_ms_to_bytes(
+ (uint32_t) cfg->rb.pps.eb_ms, kbps_cfg.rb.kbps.eir_kbps);
+ break;
+ default:
+ SSE2_QOS_DEBUG_ERROR("Illegal rate type");
+ return(Not_OK);
+ }
+
+ /* rate type is now converted to kbps */
+ kbps_cfg.rate_type = SSE2_QOS_RATE_KBPS;
+ kbps_cfg.rnd_type = cfg->rnd_type;
+ kbps_cfg.rfc = cfg->rfc;
+
+#if !defined (INTERNAL_SS) && !defined (X86)
+ // convert logical into hw params which involves qos calculations
+ rc = sse2_pol_compute_hw_params(&kbps_cfg, &pol_hw);
+ if (rc == Not_OK) {
+ SSE2_QOS_DEBUG_ERROR("Unable to compute hw param. Error: %d", rc);
+ return (rc);
+ }
+
+ // convert hw params into the physical
+ phys->rfc = pol_hw.rfc;
+ phys->an = pol_hw.allow_negative;
+ phys->rexp = pol_hw.rate_exp;
+ phys->arm = pol_hw.avg_rate_man;
+ phys->prm = pol_hw.peak_rate_man;
+ phys->cble = pol_hw.comm_bkt_limit_exp;
+ phys->cblm = pol_hw.comm_bkt_limit_man;
+ phys->eble = pol_hw.extd_bkt_limit_exp;
+ phys->eblm = pol_hw.extd_bkt_limit_man;
+ phys->cb = pol_hw.comm_bkt;
+ phys->eb = pol_hw.extd_bkt;
+
+ /* for debugging purposes, the bucket token values can be overwritten */
+ if (cfg->overwrite_bucket) {
+ phys->cb = cfg->current_bucket;
+ phys->eb = cfg->extended_bucket;
+ }
+#else
+ // convert logical into hw params which involves qos calculations
+ rc = x86_pol_compute_hw_params(&kbps_cfg, phys);
+ if (rc == Not_OK) {
+ SSE2_QOS_DEBUG_ERROR("Unable to compute hw param. Error: %d", rc);
+ return (rc);
+ }
+
+ /* for debugging purposes, the bucket token values can be overwritten */
+ if (cfg->overwrite_bucket) {
+ phys->current_bucket = cfg->current_bucket;
+ phys->extended_bucket = cfg->extended_bucket;
+ }
+
+ // Touch to avoid compiler warning for X86
+ pol_hw.allow_negative = pol_hw.allow_negative;
+
+#endif // if !defined (INTERNAL_SS) && !defined (X86)
+
+ return OK_pushHW;
+}
+
+
+static void
+sse2_qos_convert_pol_bucket_to_hw_fmt (
+ policer_read_response_type_st *bkt,
+ sse2_qos_pol_hw_params_st *hw_fmt)
+{
+ memset(hw_fmt, 0, sizeof(sse2_qos_pol_hw_params_st));
+#if !defined (INTERNAL_SS) && !defined (X86)
+ hw_fmt->rfc = (uint8_t)bkt->rfc;
+ hw_fmt->allow_negative = (uint8_t)bkt->an;
+ hw_fmt->rate_exp = (uint8_t)bkt->rexp;
+ hw_fmt->avg_rate_man = (uint16_t)bkt->arm;
+ hw_fmt->peak_rate_man = (uint16_t)bkt->prm;
+ hw_fmt->comm_bkt_limit_man = (uint8_t)bkt->cblm;
+ hw_fmt->comm_bkt_limit_exp = (uint8_t)bkt->cble;
+ hw_fmt->extd_bkt_limit_man = (uint8_t)bkt->eblm;
+ hw_fmt->extd_bkt_limit_exp = (uint8_t)bkt->eble;
+ hw_fmt->extd_bkt = bkt->eb;
+ hw_fmt->comm_bkt = bkt->cb;
+#endif // if !defined (INTERNAL_SS) && !defined (X86)
+}
+
+/*
+ * Input: h/w programmable parameter values in 'hw'
+ * Output: configured parameter values in 'cfg'
+ * Return: Status, success or failure code.
+ */
+static cerrno
+sse2_pol_convert_hw_to_cfg_params (sse2_qos_pol_hw_params_st *hw,
+ sse2_qos_pol_cfg_params_st *cfg)
+{
+ uint64_t temp_rate;
+
+ if ((hw == NULL) || (cfg == NULL)) {
+ return EINVAL;
+ }
+
+ if ((hw->rfc == IPE_RFC_RFC4115) &&
+ !(hw->peak_rate_man << hw->rate_exp) &&
+ !(hw->extd_bkt_limit_man)) {
+ /*
+ * For a 1R2C, we set EIR = 0, EB = 0
+ */
+ cfg->rfc = SSE2_QOS_POLICER_TYPE_1R2C;
+ } else if (hw->rfc == IPE_RFC_RFC2697) {
+ cfg->rfc = SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697;
+ } else if (hw->rfc == IPE_RFC_RFC2698) {
+ cfg->rfc = SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698;
+ } else if (hw->rfc == IPE_RFC_RFC4115) {
+ cfg->rfc = SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115;
+ } else if (hw->rfc == IPE_RFC_MEF5CF1) {
+ cfg->rfc = SSE2_QOS_POLICER_TYPE_2R3C_RFC_MEF5CF1;
+ } else {
+ return EINVAL;
+ }
+
+ temp_rate = ((hw->avg_rate_man << hw->rate_exp) * 8LL *
+ SSE2_QOS_POL_TICKS_PER_SEC)/1000;
+ cfg->rb.kbps.cir_kbps = (uint32_t)temp_rate;
+
+ temp_rate = ((hw->peak_rate_man << hw->rate_exp) * 8LL *
+ SSE2_QOS_POL_TICKS_PER_SEC)/1000;
+ cfg->rb.kbps.eir_kbps = (uint32_t)temp_rate;
+
+ cfg->rb.kbps.cb_bytes = ((uint64_t)hw->comm_bkt_limit_man <<
+ (uint64_t)hw->comm_bkt_limit_exp);
+ cfg->rb.kbps.eb_bytes = ((uint64_t)hw->extd_bkt_limit_man <<
+ (uint64_t)hw->extd_bkt_limit_exp);
+
+ if (cfg->rfc == SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697) {
+ /*
+ * For 1R3C in the hardware, EB = sum(CB, EB). Also, EIR = CIR. Restore
+ * values such that the configured params don't reflect this adjustment
+ */
+ cfg->rb.kbps.eb_bytes = (cfg->rb.kbps.eb_bytes -
+ cfg->rb.kbps.cb_bytes);
+ cfg->rb.kbps.eir_kbps = 0;
+ } else if (cfg->rfc == SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115) {
+ /*
+ * For 4115, the hardware holds the excess rate and excess burst, while
+ * the configuration expresses peak rate and burst, so add CIR/CB back
+ * to recover EIR/EB
+ */
+ cfg->rb.kbps.eir_kbps += cfg->rb.kbps.cir_kbps;
+ cfg->rb.kbps.eb_bytes += cfg->rb.kbps.cb_bytes;
+ }
+ /* h/w conversion to cfg is in kbps */
+ cfg->rate_type = SSE2_QOS_RATE_KBPS;
+ cfg->overwrite_bucket = 0;
+ cfg->current_bucket = hw->comm_bkt;
+ cfg->extended_bucket = hw->extd_bkt;
+
+ SSE2_QOS_DEBUG_INFO("configured params, cir: %u kbps, eir: %u kbps, cb "
+ "burst: 0x%llx bytes, eb burst: 0x%llx bytes",
+ cfg->rb.kbps.cir_kbps, cfg->rb.kbps.eir_kbps,
+ cfg->rb.kbps.cb_bytes, cfg->rb.kbps.eb_bytes);
+ SSE2_QOS_TR_INFO(SSE2_QOS_TP_INFO_22, cfg->rb.kbps.cir_kbps,
+ cfg->rb.kbps.eir_kbps,
+ (uint)cfg->rb.kbps.cb_bytes, (uint)cfg->rb.kbps.eb_bytes);
+
+ return EOK;
+}
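+
+/*
+ * Worked example of the reverse rate conversion above: mant = 313,
+ * exp = 5 gives (313 << 5) = 10016 bytes/tick, and
+ * (10016 * 8 * 1000) / 1000 = 80128 kbps.
+ */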
+
+uint32_t
+sse2_qos_convert_kbps_to_pps (uint32_t rate_kbps)
+{
+ uint64_t numer, denom, rnd_value = 0;
+
+ // sse_qosrm_ship_inc_counter(SSE2_QOS_SHIP_COUNTER_TYPE_API_CNT,
+ // SSE2_QOS_SHIP_CNT_POL_CONV_KBPS_TO_PPS);
+
+ numer = (uint64_t)((uint64_t)rate_kbps * 1000LL);
+ denom = (uint64_t)((uint64_t)SSE2_QOS_POLICER_FIXED_PKT_SIZE * 8LL);
+
+ (void)sse2_qos_pol_round(numer, denom, &rnd_value,
+ SSE2_QOS_ROUND_TO_CLOSEST);
+
+ return((uint32_t)rnd_value);
+}
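+
+/*
+ * Example (inverse of sse2_qos_convert_pps_to_kbps): 2048 kbps ->
+ * (2048 * 1000) / (256 * 8) = 1000 pps.
+ */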
+
+uint32_t
+sse2_qos_convert_burst_bytes_to_ms (uint64_t burst_bytes,
+ uint32_t rate_kbps)
+{
+ uint64_t numer, denom, rnd_value = 0;
+
+ //sse_qosrm_ship_inc_counter(SSE2_QOS_SHIP_COUNTER_TYPE_API_CNT,
+ // SSE2_QOS_SHIP_CNT_POL_CONV_BYTES_TO_BURST_MS);
+
+ numer = burst_bytes * 8LL;
+ denom = (uint64_t)rate_kbps;
+
+ (void)sse2_qos_pol_round(numer, denom, &rnd_value,
+ SSE2_QOS_ROUND_TO_CLOSEST);
+
+ return((uint32_t)rnd_value);
+}
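+
+/*
+ * Example (inverse of sse2_qos_convert_burst_ms_to_bytes): 25600 bytes
+ * at 2048 kbps -> (25600 * 8) / 2048 = 100 ms.
+ */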
+
+/*
+ * Input: physical structure in 'phys', rate_type in cfg
+ * Output: configured parameters in 'cfg'.
+ * Return: Status, success or failure code.
+ */
+trans_layer_rc
+sse2_pol_physical_2_logical (policer_read_response_type_st *phys,
+ sse2_qos_pol_cfg_params_st *cfg)
+{
+ cerrno rc;
+ sse2_qos_pol_hw_params_st pol_hw;
+ sse2_qos_pol_cfg_params_st kbps_cfg;
+
+ memset(&pol_hw, 0, sizeof(sse2_qos_pol_hw_params_st));
+ memset(&kbps_cfg, 0, sizeof(sse2_qos_pol_cfg_params_st));
+
+ if (!phys) {
+ SSE2_QOS_DEBUG_ERROR("Illegal parameters");
+ return(Not_OK);
+ }
+
+ sse2_qos_convert_pol_bucket_to_hw_fmt (phys, &pol_hw);
+
+ rc = sse2_pol_convert_hw_to_cfg_params(&pol_hw, &kbps_cfg);
+ if (CERR_IS_NOTOK(rc)) {
+ SSE2_QOS_DEBUG_ERROR("Unable to convert hw params to config params. "
+ "Error: %d", rc);
+ return(Not_OK);
+ }
+
+ /* check what rate type is required */
+ switch (cfg->rate_type) {
+ case SSE2_QOS_RATE_KBPS:
+ /* copy all the data into kbps_cfg */
+ cfg->rb.kbps.cir_kbps = kbps_cfg.rb.kbps.cir_kbps;
+ cfg->rb.kbps.eir_kbps = kbps_cfg.rb.kbps.eir_kbps;
+ cfg->rb.kbps.cb_bytes = kbps_cfg.rb.kbps.cb_bytes;
+ cfg->rb.kbps.eb_bytes = kbps_cfg.rb.kbps.eb_bytes;
+ break;
+ case SSE2_QOS_RATE_PPS:
+ cfg->rb.pps.cir_pps =
+ sse2_qos_convert_kbps_to_pps(kbps_cfg.rb.kbps.cir_kbps);
+ cfg->rb.pps.eir_pps =
+ sse2_qos_convert_kbps_to_pps(kbps_cfg.rb.kbps.eir_kbps);
+ cfg->rb.pps.cb_ms = sse2_qos_convert_burst_bytes_to_ms(
+ kbps_cfg.rb.kbps.cb_bytes, kbps_cfg.rb.kbps.cir_kbps);
+ cfg->rb.pps.eb_ms = sse2_qos_convert_burst_bytes_to_ms(
+ kbps_cfg.rb.kbps.eb_bytes, kbps_cfg.rb.kbps.eir_kbps);
+ break;
+ default:
+ SSE2_QOS_DEBUG_ERROR("Illegal rate type");
+ return(Not_OK);
+ }
+
+ /* cfg->rate_type remains what it was */
+ cfg->rnd_type = kbps_cfg.rnd_type;
+ cfg->rfc = kbps_cfg.rfc;
+ cfg->overwrite_bucket = kbps_cfg.overwrite_bucket;
+ cfg->current_bucket = kbps_cfg.current_bucket;
+ cfg->extended_bucket = kbps_cfg.extended_bucket;
+
+ return OK_pushHW;
+}
diff --git a/vnet/vnet/policer/xlate.h b/vnet/vnet/policer/xlate.h
new file mode 100644
index 00000000000..b41666c897e
--- /dev/null
+++ b/vnet/vnet/policer/xlate.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*---------------------------------------------------------------------------
+ * from gdp_logical_qos.h
+ *---------------------------------------------------------------------------
+ */
+
+#ifndef __included_xlate_h__
+#define __included_xlate_h__
+
+#include <vnet/policer/fix_types.h>
+#include <vnet/policer/police.h>
+
+/*
+ * edt: * enum sse2_qos_policer_type_en
+ * Defines type of policer to be allocated
+ */
+typedef enum sse2_qos_policer_type_en_ {
+ SSE2_QOS_POLICER_TYPE_1R2C = 0,
+ SSE2_QOS_POLICER_TYPE_1R3C_RFC_2697 = 1,
+ SSE2_QOS_POLICER_TYPE_2R3C_RFC_2698 = 2,
+ SSE2_QOS_POLICER_TYPE_2R3C_RFC_4115 = 3,
+ SSE2_QOS_POLICER_TYPE_2R3C_RFC_MEF5CF1 = 4,
+ SSE2_QOS_POLICER_TYPE_MAX
+} sse2_qos_policer_type_en;
+
+/*
+ * edt: * enum
+ * Enum used to define type of rounding used when calculating policer values
+ */
+typedef enum {
+ SSE2_QOS_ROUND_TO_CLOSEST = 0,
+ SSE2_QOS_ROUND_TO_UP,
+ SSE2_QOS_ROUND_TO_DOWN,
+ SSE2_QOS_ROUND_INVALID
+} sse2_qos_round_type_en;
+
+/*
+ * edt: * enum
+ * Enum used to define type of rate for configuration, either pps or kbps.
+ * If kbps, then burst is in bytes, if pps, then burst is in ms.
+ *
+ * Default of zero is kbps, which is in line with how it is programmed
+ * in actual hardware. Note, however, that this is the reverse logic
+ * of the units_in_bits field in sse2_static_policer_parameters_st, which is
+ * in line with sse_punt_drop.h.
+ */
+typedef enum {
+ SSE2_QOS_RATE_KBPS = 0,
+ SSE2_QOS_RATE_PPS,
+ SSE2_QOS_RATE_INVALID
+} sse2_qos_rate_type_en;
+
+/*
+ * edt: * struct sse2_qos_pol_cfg_params_st
+ *
+ * Description:
+ * This structure is used to hold user configured policing parameters.
+ *
+ * element: cir_kbps
+ * CIR in kbps.
+ * element: eir_kbps
+ * EIR or PIR in kbps.
+ * element: cb_bytes
+ * Committed Burst in bytes.
+ * element: eb_bytes
+ * Excess or Peak Burst in bytes.
+ * element: cir_pps
+ * CIR in pps.
+ * element: eir_pps
+ * EIR or PIR in pps.
+ * element: cb_ms
+ * Committed Burst in milliseconds.
+ * element: eb_ms
+ * Excess or Peak Burst in milliseconds.
+ * element: rate_type
+ * Indicates the union if in kbps/bytes or pps/ms.
+ * element: rfc
+ * Policer algorithm - 1R2C, 1R3C (2697), 2R3C (2698) or 2R3C (4115). See
+ * sse2_qos_policer_type_en
+ * element: rnd_type
+ * Rounding type (see sse2_qos_round_type_en). Needed when policer values
+ * need to be rounded. Caller can decide on type of rounding used
+ */
+typedef struct sse2_qos_pol_cfg_params_st_ {
+ union {
+ struct {
+ uint32_t cir_kbps;
+ uint32_t eir_kbps;
+ uint64_t cb_bytes;
+ uint64_t eb_bytes;
+ } PACKED kbps;
+ struct {
+ uint32_t cir_pps;
+ uint32_t eir_pps;
+ uint64_t cb_ms;
+ uint64_t eb_ms;
+ } PACKED pps;
+ } PACKED rb; /* rate burst config */
+ uint8_t rate_type; /* sse2_qos_rate_type_en */
+ uint8_t rnd_type; /* sse2_qos_round_type_en */
+ uint8_t rfc; /* sse2_qos_policer_type_en */
+ uint8_t overwrite_bucket; /* for debugging purposes */
+ uint32_t current_bucket; /* for debugging purposes */
+ uint32_t extended_bucket; /* for debugging purposes */
+} sse2_qos_pol_cfg_params_st;
+
+
+typedef struct sse2_qos_pol_hw_params_st_ {
+ uint8_t rfc;
+ uint8_t allow_negative;
+ uint8_t rate_exp;
+ uint16_t avg_rate_man;
+ uint16_t peak_rate_man;
+ uint8_t comm_bkt_limit_exp;
+ uint8_t comm_bkt_limit_man;
+ uint8_t extd_bkt_limit_exp;
+ uint8_t extd_bkt_limit_man;
+ uint32_t comm_bkt;
+ uint32_t extd_bkt;
+} sse2_qos_pol_hw_params_st;
+
+
+trans_layer_rc
+sse2_pol_logical_2_physical (sse2_qos_pol_cfg_params_st *cfg,
+ policer_read_response_type_st *phys);
+
+
+#endif /* __included_xlate_h__ */
diff --git a/vnet/vnet/ppp/error.def b/vnet/vnet/ppp/error.def
new file mode 100644
index 00000000000..ba645408582
--- /dev/null
+++ b/vnet/vnet/ppp/error.def
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ppp_error.def: ppp errors
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ppp_error (NONE, "no error")
+ppp_error (UNKNOWN_PROTOCOL, "unknown ppp protocol")
+ppp_error (UNKNOWN_ADDRESS_CONTROL, "address, control != 0xff03")
diff --git a/vnet/vnet/ppp/node.c b/vnet/vnet/ppp/node.c
new file mode 100644
index 00000000000..4f813732df2
--- /dev/null
+++ b/vnet/vnet/ppp/node.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ppp_node.c: ppp packet processing
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ppp/ppp.h>
+#include <vppinfra/sparse_vec.h>
+
+#define foreach_ppp_input_next \
+ _ (PUNT, "error-punt") \
+ _ (DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) PPP_INPUT_NEXT_##s,
+ foreach_ppp_input_next
+#undef _
+ PPP_INPUT_N_NEXT,
+} ppp_input_next_t;
+
+typedef struct {
+ u8 packet_data[32];
+} ppp_input_trace_t;
+
+static u8 * format_ppp_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ ppp_input_trace_t * t = va_arg (*va, ppp_input_trace_t *);
+
+ s = format (s, "%U", format_ppp_header, t->packet_data);
+
+ return s;
+}
+
+typedef struct {
+ /* Sparse vector mapping ppp protocol in network byte order
+ to next index. */
+ u16 * next_by_protocol;
+
+ u32 * sparse_index_by_next_index;
+} ppp_input_runtime_t;
+
+static uword
+ppp_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ ppp_input_runtime_t * rt = (void *) node->runtime_data;
+ u32 n_left_from, next_index, i_next, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (ppp_input_trace_t));
+
+ next_index = node->cached_next_index;
+ i_next = vec_elt (rt->sparse_index_by_next_index, next_index);
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ ppp_header_t * h0, * h1;
+ u32 i0, i1, protocol0, protocol1, enqueue_code;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = (void *) (b0->data + b0->current_data);
+ h1 = (void *) (b1->data + b1->current_data);
+
+ b0->current_data += sizeof (h0[0]);
+ b1->current_data += sizeof (h1[0]);
+
+ b0->current_length -= sizeof (h0[0]);
+ b1->current_length -= sizeof (h1[0]);
+
+ /* Index sparse array with network byte order. */
+ protocol0 = h0->protocol;
+ protocol1 = h1->protocol;
+ sparse_vec_index2 (rt->next_by_protocol, protocol0, protocol1, &i0, &i1);
+
+ b0->error = node->errors[i0 == SPARSE_VEC_INVALID_INDEX ? PPP_ERROR_UNKNOWN_PROTOCOL : PPP_ERROR_NONE];
+ b1->error = node->errors[i1 == SPARSE_VEC_INVALID_INDEX ? PPP_ERROR_UNKNOWN_PROTOCOL : PPP_ERROR_NONE];
+
+ enqueue_code = (i0 != i_next) + 2*(i1 != i_next);
+
+ if (PREDICT_FALSE (enqueue_code != 0))
+ {
+ switch (enqueue_code)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = bi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i0), bi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i1), bi1);
+ break;
+
+ case 3:
+ /* A B B or A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i0), bi0);
+ vlib_set_next_frame_buffer (vm, node, vec_elt (rt->next_by_protocol, i1), bi1);
+ if (i0 == i1)
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ i_next = i1;
+ next_index = vec_elt (rt->next_by_protocol, i_next);
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ppp_header_t * h0;
+ u32 i0, protocol0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ b0->current_data += sizeof (h0[0]);
+ b0->current_length -= sizeof (h0[0]);
+
+ protocol0 = h0->protocol;
+ i0 = sparse_vec_index (rt->next_by_protocol, protocol0);
+
+ b0->error = node->errors[i0 == SPARSE_VEC_INVALID_INDEX ? PPP_ERROR_UNKNOWN_PROTOCOL : PPP_ERROR_NONE];
+
+ /* Sent packet to wrong next? */
+ if (PREDICT_FALSE (i0 != i_next))
+ {
+ /* Return old frame; remove incorrectly enqueued packet. */
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);
+
+ /* Send to correct next. */
+ i_next = i0;
+ next_index = vec_elt (rt->next_by_protocol, i_next);
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static char * ppp_error_strings[] = {
+#define ppp_error(n,s) s,
+#include "error.def"
+#undef ppp_error
+};
+
+VLIB_REGISTER_NODE (ppp_input_node) = {
+ .function = ppp_input,
+ .name = "ppp-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .runtime_data_bytes = sizeof (ppp_input_runtime_t),
+
+ .n_errors = PPP_N_ERROR,
+ .error_strings = ppp_error_strings,
+
+ .n_next_nodes = PPP_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [PPP_INPUT_NEXT_##s] = n,
+ foreach_ppp_input_next
+#undef _
+ },
+
+ .format_buffer = format_ppp_header_with_length,
+ .format_trace = format_ppp_input_trace,
+ .unformat_buffer = unformat_ppp_header,
+};
+
+static clib_error_t * ppp_input_init (vlib_main_t * vm)
+{
+ ppp_input_runtime_t * rt;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, ppp_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ ppp_setup_node (vm, ppp_input_node.index);
+
+ rt = vlib_node_get_runtime_data (vm, ppp_input_node.index);
+
+ rt->next_by_protocol = sparse_vec_new
+ (/* elt bytes */ sizeof (rt->next_by_protocol[0]),
+ /* bits in index */ BITS (((ppp_header_t *) 0)->protocol));
+
+ vec_validate (rt->sparse_index_by_next_index, PPP_INPUT_NEXT_DROP);
+ vec_validate (rt->sparse_index_by_next_index, PPP_INPUT_NEXT_PUNT);
+ rt->sparse_index_by_next_index[PPP_INPUT_NEXT_DROP]
+ = SPARSE_VEC_INVALID_INDEX;
+ rt->sparse_index_by_next_index[PPP_INPUT_NEXT_PUNT]
+ = SPARSE_VEC_INVALID_INDEX;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (ppp_input_init);
+
+void
+ppp_register_input_protocol (vlib_main_t * vm,
+ ppp_protocol_t protocol,
+ u32 node_index)
+{
+ ppp_main_t * em = &ppp_main;
+ ppp_protocol_info_t * pi;
+ ppp_input_runtime_t * rt;
+ u16 * n;
+ u32 i;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, ppp_input_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ pi = ppp_get_protocol_info (em, protocol);
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ ppp_input_node.index,
+ node_index);
+
+ /* Setup ppp protocol -> next index sparse vector mapping. */
+ rt = vlib_node_get_runtime_data (vm, ppp_input_node.index);
+ n = sparse_vec_validate (rt->next_by_protocol, clib_host_to_net_u16 (protocol));
+ n[0] = pi->next_index;
+
+ /* Rebuild next index -> sparse index inverse mapping when sparse vector
+ is updated. */
+ vec_validate (rt->sparse_index_by_next_index, pi->next_index);
+ for (i = 1; i < vec_len (rt->next_by_protocol); i++)
+ rt->sparse_index_by_next_index[rt->next_by_protocol[i]] = i;
+}
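+
+/*
+ * Typical usage (caller and node name illustrative): an L3 protocol
+ * registers its input node once at init time, e.g.
+ *
+ *   ppp_register_input_protocol (vm, PPP_PROTOCOL_ip4,
+ *                                ip4_input_node.index);
+ *
+ * after which ppp-input dispatches matching packets directly to that node.
+ */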
diff --git a/vnet/vnet/ppp/packet.h b/vnet/vnet/ppp/packet.h
new file mode 100644
index 00000000000..da034daecfa
--- /dev/null
+++ b/vnet/vnet/ppp/packet.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_ppp_packet_h
+#define included_vnet_ppp_packet_h
+
+/*
+ * PPP packet format
+ *
+ * Copyright (c) 2009 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+See http://www.iana.org/assignments/ppp-numbers.
+
+The Point-to-Point Protocol (PPP) Data Link Layer [146,147,175]
+contains a 16-bit Protocol field to identify the encapsulated
+protocol. The Protocol field is consistent with the ISO 3309 (HDLC)
+extension mechanism for Address fields. All Protocols MUST be
+assigned such that the least significant bit of the most significant
+octet equals "0", and the least significant bit of the least
+significant octet equals "1".
+*/
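+
+/*
+ * A hypothetical validity check capturing the rule above (not used in
+ * this file; shown for illustration only):
+ *
+ *   static inline int
+ *   ppp_protocol_number_is_wellformed (u16 p)
+ *   { return (((p >> 8) & 1) == 0) && ((p & 1) == 1); }
+ *
+ * e.g. 0x0021 (ip4): MS octet 0x00 has LSB 0, LS octet 0x21 has LSB 1.
+ */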
+
+#define foreach_ppp_protocol \
+_ (0x0001, padding) \
+_ (0x0003, rohc_small_cid) \
+_ (0x0005, rohc_large_cid) \
+_ (0x0021, ip4) \
+_ (0x0023, osi) \
+_ (0x0025, xerox_ns_idp) \
+_ (0x0027, decnet) \
+_ (0x0029, appletalk) \
+_ (0x002b, ipx) \
+_ (0x002d, vj_compressed_tcp) \
+_ (0x002f, vj_uncompressed_tcp) \
+_ (0x0031, bpdu) \
+_ (0x0033, streams) \
+_ (0x0035, vines) \
+_ (0x0039, appletalk_eddp) \
+_ (0x003b, appletalk_smart_buffered) \
+_ (0x003d, multilink) \
+_ (0x003f, netbios_framing) \
+_ (0x0041, cisco) \
+_ (0x0043, timeplex) \
+_ (0x0045, fujitsu_lblb) \
+_ (0x0047, dca_remote_lan) \
+_ (0x0049, sdtp) \
+_ (0x004b, sna_over_802_2) \
+_ (0x004d, sna) \
+_ (0x004f, ip6_header_compression) \
+_ (0x0051, knx) \
+_ (0x0053, encryption) \
+_ (0x0055, link_encryption) \
+_ (0x0057, ip6) \
+_ (0x0059, ppp_mux) \
+_ (0x005b, vendor_specific_a) \
+_ (0x0061, rtp_iphc_full_header) \
+_ (0x0063, rtp_iphc_compressed_tcp) \
+_ (0x0065, rtp_iphc_compressed_non_tcp) \
+_ (0x0067, rtp_iphc_compressed_udp_8) \
+_ (0x0069, rtp_iphc_compressed_rtp_8) \
+_ (0x006f, stampede) \
+_ (0x0073, mp_plus) \
+_ (0x007d, control) \
+_ (0x00c1, ntcits_ipi) \
+_ (0x00cf, ppp_nlpid) \
+_ (0x00fb, multilink_compression) \
+_ (0x00fd, compressed_datagram) \
+_ (0x0201, 802_1d_hello) \
+_ (0x0203, ibm_source_routing) \
+_ (0x0205, dec_lanbridge) \
+_ (0x0207, cdp) \
+_ (0x0209, netcs) \
+_ (0x020b, stp) \
+_ (0x020d, edp) \
+_ (0x0211, oscp_a) \
+_ (0x0213, oscp_b) \
+_ (0x0231, luxcom) \
+_ (0x0233, sigma) \
+_ (0x0235, apple_client_server) \
+_ (0x0281, mpls_unicast) \
+_ (0x0283, mpls_multicast) \
+_ (0x0285, ieee_p1284_4) \
+_ (0x0287, tetra) \
+_ (0x0289, multichannel_flow_treatment) \
+_ (0x2063, rtp_iphc_compressed_tcp_no_delta) \
+_ (0x2065, rtp_iphc_context_state) \
+_ (0x2067, rtp_iphc_compressed_udp_16) \
+_ (0x2069, rtp_iphc_compressed_rtp_16) \
+_ (0x4001, cray) \
+_ (0x4003, cdpd) \
+_ (0x4005, expand) \
+_ (0x4007, odsicp) \
+_ (0x4009, docsis_dll) \
+_ (0x400B, cetacean) \
+_ (0x4021, lzs) \
+_ (0x4023, reftek) \
+_ (0x4025, fibre_channel) \
+_ (0x4027, emit) \
+_ (0x405b, vendor_specific_b) \
+_ (0xc021, lcp) \
+_ (0xc023, pap) \
+_ (0xc025, link_quality_report) \
+_ (0xc027, shiva_password) \
+_ (0xc029, cbcp) \
+_ (0xc02b, bacp) \
+_ (0xc02d, bap) \
+_ (0xc05b, vendor_specific_password) \
+_ (0xc081, container_control) \
+_ (0xc223, chap) \
+_ (0xc225, rsa) \
+_ (0xc227, extensible_authentication) \
+_ (0xc229, mitsubishi_security_info) \
+_ (0xc26f, stampede_authorization) \
+_ (0xc281, proprietary_authentication_a) \
+_ (0xc283, proprietary_authentication_b) \
+_ (0xc481, proprietary_node_id_authentication)
+
+typedef enum {
+#define _(n,f) PPP_PROTOCOL_##f = n,
+ foreach_ppp_protocol
+#undef _
+} ppp_protocol_t;
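+
+/* The enum expands to, e.g., PPP_PROTOCOL_ip4 = 0x0021 and
+PPP_PROTOCOL_lcp = 0xc021. These values are in host byte order, so they
+are compared against clib_net_to_host_u16 (h->protocol), never against
+the raw wire field. */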
+
+/* PPP Link Control Protocol (LCP) and Internet Protocol Control Protocol (IPCP) Codes
+
+The Point-to-Point Protocol (PPP) Link Control Protocol (LCP),
+the Compression Control Protocol (CCP), the Internet Protocol
+Control Protocol (IPCP), and other control protocols contain an
+8 bit Code field that identifies the type of packet. */
+
+#define foreach_ppp_lcp_code \
+_ (0, vendor_specific) \
+_ (1, configure_request) \
+_ (2, configure_ack) \
+_ (3, configure_nak) \
+_ (4, configure_reject) \
+_ (5, terminate_request) \
+_ (6, terminate_ack) \
+_ (7, code_reject) \
+_ (8, protocol_reject) \
+_ (9, echo_request) \
+_ (10, echo_reply) \
+_ (11, discard_request) \
+_ (12, identification) \
+_ (13, time_remaining) \
+_ (14, reset_request) \
+_ (15, reset_reply)
+
+typedef struct {
+ /* Set to 0xff 0x03 */
+ u8 address, control;
+
+ /* Layer 3 protocol for this packet. */
+ u16 protocol;
+} ppp_header_t;
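+
+/* On the wire, an IPv4 packet on PPP therefore starts with the four
+bytes 0xff 0x03 0x00 0x21 (address, control, then the protocol in
+network byte order), immediately followed by the IPv4 header. */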
+
+#endif /* included_vnet_ppp_packet_h */
diff --git a/vnet/vnet/ppp/pg.c b/vnet/vnet/ppp/pg.c
new file mode 100644
index 00000000000..2662bdc1999
--- /dev/null
+++ b/vnet/vnet/ppp/pg.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ppp_pg.c: packet generator ppp interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ppp/ppp.h>
+
+typedef struct {
+ pg_edit_t address;
+ pg_edit_t control;
+ pg_edit_t protocol;
+} pg_ppp_header_t;
+
+static inline void
+pg_ppp_header_init (pg_ppp_header_t * e)
+{
+ pg_edit_init (&e->address, ppp_header_t, address);
+ pg_edit_init (&e->control, ppp_header_t, control);
+ pg_edit_init (&e->protocol, ppp_header_t, protocol);
+}
+
+uword
+unformat_pg_ppp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_ppp_header_t * h;
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (ppp_header_t),
+ &group_index);
+ pg_ppp_header_init (h);
+
+ pg_edit_set_fixed (&h->address, 0xff);
+ pg_edit_set_fixed (&h->control, 0x03);
+
+ error = 1;
+ if (! unformat (input, "%U",
+ unformat_pg_edit,
+ unformat_ppp_protocol_net_byte_order, &h->protocol))
+ goto done;
+
+ {
+ ppp_main_t * pm = &ppp_main;
+ ppp_protocol_info_t * pi = 0;
+ pg_node_t * pg_node = 0;
+
+ if (h->protocol.type == PG_EDIT_FIXED)
+ {
+ u16 t = *(u16 *) h->protocol.values[PG_EDIT_LO];
+ pi = ppp_get_protocol_info (pm, clib_net_to_host_u16 (t));
+ if (pi && pi->node_index != ~0)
+ pg_node = pg_get_node (pi->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto done;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/ppp/ppp.c b/vnet/vnet/ppp/ppp.c
new file mode 100644
index 00000000000..1537c9a50c3
--- /dev/null
+++ b/vnet/vnet/ppp/ppp.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ppp.c: ppp support
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ppp/ppp.h>
+
+/* Global main structure. */
+ppp_main_t ppp_main;
+
+u8 * format_ppp_protocol (u8 * s, va_list * args)
+{
+ ppp_protocol_t p = va_arg (*args, u32);
+ ppp_main_t * pm = &ppp_main;
+ ppp_protocol_info_t * pi = ppp_get_protocol_info (pm, p);
+
+ if (pi)
+ s = format (s, "%s", pi->name);
+ else
+ s = format (s, "0x%04x", p);
+
+ return s;
+}
+
+u8 * format_ppp_header_with_length (u8 * s, va_list * args)
+{
+ ppp_main_t * pm = &ppp_main;
+ ppp_header_t * h = va_arg (*args, ppp_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ ppp_protocol_t p = clib_net_to_host_u16 (h->protocol);
+ uword indent, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "ppp header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "PPP %U", format_ppp_protocol, p);
+
+ if (h->address != 0xff)
+ s = format (s, ", address 0x%02x", h->address);
+ if (h->control != 0x03)
+ s = format (s, ", control 0x%02x", h->control);
+
+  /* If there is room beyond the ppp header, decode the payload. */
+  if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+    {
+      ppp_protocol_info_t * pi = ppp_get_protocol_info (pm, p);
+      vlib_node_t * node = pi ? vlib_get_node (pm->vlib_main, pi->node_index) : 0;
+      if (node && node->format_buffer)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ node->format_buffer, (void *) (h + 1),
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+u8 * format_ppp_header (u8 * s, va_list * args)
+{
+ ppp_header_t * h = va_arg (*args, ppp_header_t *);
+ return format (s, "%U", format_ppp_header_with_length, h, 0);
+}
+
+/* Returns ppp protocol as an int in host byte order. */
+uword
+unformat_ppp_protocol_host_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ ppp_main_t * pm = &ppp_main;
+ int p, i;
+
+ /* Numeric type. */
+ if (unformat (input, "0x%x", &p)
+ || unformat (input, "%d", &p))
+ {
+ if (p >= (1 << 16))
+ return 0;
+ *result = p;
+ return 1;
+ }
+
+ /* Named type. */
+ if (unformat_user (input, unformat_vlib_number_by_name,
+ pm->protocol_info_by_name, &i))
+ {
+ ppp_protocol_info_t * pi = vec_elt_at_index (pm->protocol_infos, i);
+ *result = pi->protocol;
+ return 1;
+ }
+
+ return 0;
+}
+
+uword
+unformat_ppp_protocol_net_byte_order (unformat_input_t * input,
+ va_list * args)
+{
+ u16 * result = va_arg (*args, u16 *);
+ if (! unformat_user (input, unformat_ppp_protocol_host_byte_order, result))
+ return 0;
+ *result = clib_host_to_net_u16 ((u16) *result);
+ return 1;
+}
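+
+/* Illustration: with the protocol table above, "0x21", "33" and "ip4"
+   all parse to 0x0021 in host byte order; the _net_byte_order variant
+   stores clib_host_to_net_u16 (0x0021), which reads back as 0x2100 on a
+   little-endian host. */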
+
+uword
+unformat_ppp_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ ppp_header_t _h, * h = &_h;
+ u16 p;
+
+ if (! unformat (input, "%U",
+ unformat_ppp_protocol_host_byte_order, &p))
+ return 0;
+
+ h->address = 0xff;
+ h->control = 0x03;
+ h->protocol = clib_host_to_net_u16 (p);
+
+ /* Add header to result. */
+ {
+ void * p;
+ u32 n_bytes = sizeof (h[0]);
+
+ vec_add2 (*result, p, n_bytes);
+ memcpy (p, h, n_bytes);
+ }
+
+ return 1;
+}
+
+static uword ppp_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ ppp_header_t * h = rewrite;
+ ppp_protocol_t protocol;
+
+ if (max_rewrite_bytes < sizeof (h[0]))
+ return 0;
+
+ switch (l3_type) {
+#define _(a,b) case VNET_L3_PACKET_TYPE_##a: protocol = PPP_PROTOCOL_##b; break
+ _ (IP4, ip4);
+ _ (IP6, ip6);
+ _ (MPLS_UNICAST, mpls_unicast);
+ _ (MPLS_MULTICAST, mpls_multicast);
+#undef _
+ default:
+ return 0;
+ }
+
+ h->address = 0xff;
+ h->control = 0x03;
+ h->protocol = clib_host_to_net_u16 (protocol);
+
+ return sizeof (h[0]);
+}
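+
+/* Sketch of the result for l3_type == VNET_L3_PACKET_TYPE_IP4: the four
+   bytes ff 03 00 21 are written at the start of the rewrite area and
+   sizeof (ppp_header_t) == 4 is returned. Unsupported l3 types return 0
+   so the caller can reject the adjacency. */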
+
+VNET_HW_INTERFACE_CLASS (ppp_hw_interface_class) = {
+ .name = "PPP",
+ .format_header = format_ppp_header_with_length,
+ .unformat_header = unformat_ppp_header,
+ .set_rewrite = ppp_set_rewrite,
+};
+
+static void add_protocol (ppp_main_t * pm,
+ ppp_protocol_t protocol,
+ char * protocol_name)
+{
+ ppp_protocol_info_t * pi;
+ u32 i;
+
+ vec_add2 (pm->protocol_infos, pi, 1);
+ i = pi - pm->protocol_infos;
+
+ pi->name = protocol_name;
+ pi->protocol = protocol;
+ pi->next_index = pi->node_index = ~0;
+
+ hash_set (pm->protocol_info_by_protocol, protocol, i);
+ hash_set_mem (pm->protocol_info_by_name, pi->name, i);
+}
+
+static clib_error_t * ppp_init (vlib_main_t * vm)
+{
+ ppp_main_t * pm = &ppp_main;
+
+ memset (pm, 0, sizeof (pm[0]));
+ pm->vlib_main = vm;
+
+ pm->protocol_info_by_name = hash_create_string (0, sizeof (uword));
+ pm->protocol_info_by_protocol = hash_create (0, sizeof (uword));
+
+#define _(n,s) add_protocol (pm, PPP_PROTOCOL_##s, #s);
+ foreach_ppp_protocol;
+#undef _
+
+ return vlib_call_init_function (vm, ppp_input_init);
+}
+
+VLIB_INIT_FUNCTION (ppp_init);
+
+ppp_main_t * ppp_get_main (vlib_main_t * vm)
+{
+ vlib_call_init_function (vm, ppp_init);
+ return &ppp_main;
+}
+
diff --git a/vnet/vnet/ppp/ppp.h b/vnet/vnet/ppp/ppp.h
new file mode 100644
index 00000000000..e512df7f108
--- /dev/null
+++ b/vnet/vnet/ppp/ppp.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ppp.h: types/functions for ppp.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_ppp_h
+#define included_ppp_h
+
+#include <vnet/vnet.h>
+#include <vnet/ppp/packet.h>
+#include <vnet/pg/pg.h>
+
+vnet_hw_interface_class_t ppp_hw_interface_class;
+
+typedef enum {
+#define ppp_error(n,s) PPP_ERROR_##n,
+#include <vnet/ppp/error.def>
+#undef ppp_error
+ PPP_N_ERROR,
+} ppp_error_t;
+
+typedef struct {
+ /* Name (a c string). */
+ char * name;
+
+ /* PPP protocol type in host byte order. */
+ ppp_protocol_t protocol;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* Next index for this type. */
+ u32 next_index;
+} ppp_protocol_info_t;
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ ppp_protocol_info_t * protocol_infos;
+
+ /* Hash tables mapping name/protocol to protocol info index. */
+ uword * protocol_info_by_name, * protocol_info_by_protocol;
+} ppp_main_t;
+
+always_inline ppp_protocol_info_t *
+ppp_get_protocol_info (ppp_main_t * em, ppp_protocol_t protocol)
+{
+ uword * p = hash_get (em->protocol_info_by_protocol, protocol);
+ return p ? vec_elt_at_index (em->protocol_infos, p[0]) : 0;
+}
+
+extern ppp_main_t ppp_main;
+
+/* Register given node index to take input for given ppp type. */
+void
+ppp_register_input_type (vlib_main_t * vm,
+ ppp_protocol_t protocol,
+ u32 node_index);
+
+void ppp_set_adjacency (vnet_rewrite_header_t * rw,
+ uword max_data_bytes,
+ ppp_protocol_t protocol);
+
+format_function_t format_ppp_protocol;
+format_function_t format_ppp_header;
+format_function_t format_ppp_header_with_length;
+
+/* Parse ppp protocol as 0xXXXX or protocol name.
+ In either host or network byte order. */
+unformat_function_t unformat_ppp_protocol_host_byte_order;
+unformat_function_t unformat_ppp_protocol_net_byte_order;
+
+/* Parse ppp header. */
+unformat_function_t unformat_ppp_header;
+unformat_function_t unformat_pg_ppp_header;
+
+always_inline void
+ppp_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_ppp_header_with_length;
+ n->unformat_buffer = unformat_ppp_header;
+ pn->unformat_edit = unformat_pg_ppp_header;
+}
+
+void
+ppp_register_input_protocol (vlib_main_t * vm,
+ ppp_protocol_t protocol,
+ u32 node_index);
+
+#endif /* included_ppp_h */
diff --git a/vnet/vnet/replication.c b/vnet/vnet/replication.c
new file mode 100644
index 00000000000..6842684984d
--- /dev/null
+++ b/vnet/vnet/replication.c
@@ -0,0 +1,276 @@
+/*
+ * replication.c : packet replication
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/replication.h>
+
+
+replication_main_t replication_main;
+
+
+replication_context_t *
+replication_prep (vlib_main_t * vm,
+ vlib_buffer_t * b0,
+ u32 recycle_node_index,
+ u32 l2_packet)
+{
+ replication_main_t * rm = &replication_main;
+ replication_context_t * ctx;
+ uword cpu_number = vm->cpu_index;
+ ip4_header_t * ip;
+ u32 ctx_id;
+
+  // Allocate a context. Reserve context 0 on first use so that a
+  // clone_count of 0 (the buffer default) never names a live context.
+ if (PREDICT_FALSE(rm->contexts[cpu_number] == 0))
+ pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES);
+
+ pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES);
+ ctx_id = ctx - rm->contexts[cpu_number];
+
+ // Save state from vlib buffer
+ ctx->saved_clone_count = b0->clone_count;
+ ctx->saved_free_list_index = b0->free_list_index;
+ ctx->current_data = b0->current_data;
+
+ // Set up vlib buffer hooks
+ b0->clone_count = ctx_id;
+ b0->free_list_index = rm->recycle_list_index;
+
+ // Save feature state
+ ctx->recycle_node_index = recycle_node_index;
+
+ // Save vnet state
+ memcpy (ctx->vnet_buffer, vnet_buffer(b0), sizeof(vnet_buffer_opaque_t));
+
+ // Save packet contents
+ ctx->l2_packet = l2_packet;
+ ip = (ip4_header_t *)vlib_buffer_get_current (b0);
+ if (l2_packet) {
+ // Save ethernet header
+ ctx->l2_header[0] = ((u64 *)ip)[0];
+ ctx->l2_header[1] = ((u64 *)ip)[1];
+ ctx->l2_header[2] = ((u64 *)ip)[2];
+ // set ip to the true ip header
+ ip = (ip4_header_t *)(((u8 *)ip) + vnet_buffer(b0)->l2.l2_len);
+ }
+
+ // Copy L3 fields.
+  // We need to save TOS for both ip4 and ip6 packets. Conveniently, the
+  // TOS / traffic class bits fall within the first two bytes of both headers.
+ ctx->ip_tos = *((u16 *)(ip));
+
+ // Save the ip4 checksum as well. We just blindly save the corresponding two
+ // bytes even for ip6 packets.
+ ctx->ip4_checksum = ip->checksum;
+
+ return ctx;
+}
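+
+// Sketch of intended use by a replication feature (my_recycle_node and
+// my_replica_set_id are hypothetical, not part of this patch):
+//
+//   ctx = replication_prep (vm, b0, my_recycle_node.index, 0 /* l3 */);
+//   ctx->feature_replicas = my_replica_set_id;
+//   ctx->feature_counter = 0;
+//
+// The feature then transmits the buffer once per replica. Because the
+// buffer's free list was pointed at the recycle list, "freeing" it hands
+// it to replication_recycle_callback below, which returns it to the
+// recycle node for the next pass.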
+
+
+replication_context_t *
+replication_recycle (vlib_main_t * vm,
+ vlib_buffer_t * b0,
+ u32 is_last)
+{
+ replication_main_t * rm = &replication_main;
+ replication_context_t * ctx;
+ uword cpu_number = vm->cpu_index;
+ ip4_header_t * ip;
+
+ // Get access to the replication context
+ ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->clone_count);
+
+ // Restore vnet buffer state
+ memcpy (vnet_buffer(b0), ctx->vnet_buffer, sizeof(vnet_buffer_opaque_t));
+
+ // Restore the packet start (current_data) and length
+ vlib_buffer_advance(b0, ctx->current_data - b0->current_data);
+
+ // Restore packet contents
+ ip = (ip4_header_t *)vlib_buffer_get_current (b0);
+ if (ctx->l2_packet) {
+ // Restore ethernet header
+ ((u64 *)ip)[0] = ctx->l2_header[0];
+ ((u64 *)ip)[1] = ctx->l2_header[1];
+ ((u64 *)ip)[2] = ctx->l2_header[2];
+ // set ip to the true ip header
+ ip = (ip4_header_t *)(((u8 *)ip) + vnet_buffer(b0)->l2.l2_len);
+ }
+
+ // Restore L3 fields
+ *((u16 *)(ip)) = ctx->ip_tos;
+ ip->checksum = ctx->ip4_checksum;
+
+ if (is_last) {
+ // This is the last replication in the list.
+ // Restore original buffer free functionality.
+ b0->clone_count = ctx->saved_clone_count;
+ b0->free_list_index = ctx->saved_free_list_index;
+
+ // Free context back to its pool
+ pool_put (rm->contexts[cpu_number], ctx);
+ }
+
+ return ctx;
+}
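+
+// Example: for a set of N replicas the recycle node calls
+// replication_recycle (vm, b0, 0) after each of the first N-1 passes and
+// replication_recycle (vm, b0, 1) after the last one, at which point the
+// original clone_count and free list are restored and the context is
+// returned to its pool.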
+
+
+
+/*
+ * Fish packets back from the recycle queue/freelist and hand them to the
+ * feature's recycle node; chained buffers are skipped so that each packet
+ * is enqueued exactly once (un-flattening what the freelist flattened).
+ */
+static void replication_recycle_callback (vlib_main_t *vm,
+ vlib_buffer_free_list_t * fl)
+{
+ vlib_frame_t * f = 0;
+ u32 n_left_from;
+ u32 n_left_to_next = 0;
+ u32 n_this_frame = 0;
+ u32 * from;
+ u32 * to_next = 0;
+ u32 bi0, pi0;
+ vlib_buffer_t *b0;
+ vlib_buffer_t *bnext0;
+ int i;
+ replication_main_t * rm = &replication_main;
+ replication_context_t * ctx;
+ u32 feature_node_index = 0;
+ uword cpu_number = vm->cpu_index;
+
+ // All buffers in the list are destined to the same recycle node.
+ // Pull the recycle node index from the first buffer.
+ // Note: this could be sped up if the node index were stuffed into
+ // the freelist itself.
+ if (vec_len (fl->aligned_buffers) > 0) {
+ bi0 = fl->aligned_buffers[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ ctx = pool_elt_at_index (rm->contexts[cpu_number],
+ b0->clone_count);
+ feature_node_index = ctx->recycle_node_index;
+ } else if (vec_len (fl->unaligned_buffers) > 0) {
+ bi0 = fl->unaligned_buffers[0];
+ b0 = vlib_get_buffer (vm, bi0);
+ ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->clone_count);
+ feature_node_index = ctx->recycle_node_index;
+ }
+
+ /* aligned, unaligned buffers */
+ for (i = 0; i < 2; i++)
+ {
+ if (i == 0)
+ {
+ from = fl->aligned_buffers;
+ n_left_from = vec_len (from);
+ }
+ else
+ {
+ from = fl->unaligned_buffers;
+ n_left_from = vec_len (from);
+ }
+
+ while (n_left_from > 0)
+ {
+ if (PREDICT_FALSE(n_left_to_next == 0))
+ {
+ if (f)
+ {
+ f->n_vectors = n_this_frame;
+ vlib_put_frame_to_node (vm, feature_node_index, f);
+ }
+
+ f = vlib_get_frame_to_node (vm, feature_node_index);
+ to_next = vlib_frame_vector_args (f);
+ n_left_to_next = VLIB_FRAME_SIZE;
+ n_this_frame = 0;
+ }
+
+ bi0 = from[0];
+ if (PREDICT_TRUE(n_left_from > 1))
+ {
+ pi0 = from[1];
+ vlib_prefetch_buffer_with_index(vm,pi0,LOAD);
+ }
+
+ bnext0 = b0 = vlib_get_buffer (vm, bi0);
+
+ // Mark that this buffer was just recycled
+ b0->flags |= VLIB_BUFFER_IS_RECYCLED;
+
+ // If buffer is traced, mark frame as traced
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ f->flags |= VLIB_FRAME_TRACE;
+
+ while (bnext0->flags & VLIB_BUFFER_NEXT_PRESENT)
+ {
+ from += 1;
+ n_left_from -= 1;
+ bnext0 = vlib_get_buffer (vm, bnext0->next_buffer);
+ }
+ to_next[0] = bi0;
+
+ from++;
+ to_next++;
+ n_this_frame++;
+ n_left_to_next--;
+ n_left_from--;
+ }
+ }
+
+ vec_reset_length (fl->aligned_buffers);
+ vec_reset_length (fl->unaligned_buffers);
+
+ if (f)
+ {
+ ASSERT(n_this_frame);
+ f->n_vectors = n_this_frame;
+ vlib_put_frame_to_node (vm, feature_node_index, f);
+ }
+}
+
+
+
+clib_error_t *replication_init (vlib_main_t *vm)
+{
+ replication_main_t * rm = &replication_main;
+ vlib_buffer_main_t * bm = vm->buffer_main;
+ vlib_buffer_free_list_t * fl;
+ __attribute__((unused)) replication_context_t * ctx;
+ vlib_thread_main_t * tm = vlib_get_thread_main();
+
+ rm->vlib_main = vm;
+ rm->vnet_main = vnet_get_main();
+ rm->recycle_list_index =
+ vlib_buffer_create_free_list (vm, 1024 /* fictional */,
+ "replication-recycle");
+
+ fl = pool_elt_at_index (bm->buffer_free_list_pool,
+ rm->recycle_list_index);
+
+ fl->buffers_added_to_freelist_function = replication_recycle_callback;
+
+ // Verify the replication context is the expected size
+ ASSERT(sizeof(replication_context_t) == 128); // 2 cache lines
+
+ vec_validate (rm->contexts, tm->n_vlib_mains - 1);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (replication_init);
diff --git a/vnet/vnet/replication.h b/vnet/vnet/replication.h
new file mode 100644
index 00000000000..9de5717f4d3
--- /dev/null
+++ b/vnet/vnet/replication.h
@@ -0,0 +1,123 @@
+/*
+ * replication.h : packet replication
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_replication_h
+#define included_replication_h
+
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/replication.h>
+
+
+typedef struct {
+
+ // The entire vnet buffer header restored for each replica
+ u8 vnet_buffer[32]; // 16B aligned to allow vector unit copy
+ u8 reserved[32]; // space for future expansion of vnet buffer header
+
+ // feature state used during this replication
+ u64 feature_replicas; // feature's id for its set of replicas
+ u32 feature_counter; // feature's current index into set of replicas
+ u32 recycle_node_index; // feature's recycle node index
+
+ // data saved from the start of replication and restored at the end of replication
+ u32 saved_clone_count; // from vlib buffer
+ u32 saved_free_list_index; // from vlib buffer
+
+ // data saved from the original packet and restored for each replica
+ u64 l2_header[3]; // 24B (must be at least 22B for l2 packets)
+ u16 ip_tos; // v4 and v6
+ u16 ip4_checksum; // needed for v4 only
+
+ // data saved from the vlib buffer header and restored for each replica
+ i16 current_data; // offset of first byte of packet in packet data
+ u8 pad[6]; // to 64B
+ u8 l2_packet; // flag for l2 vs l3 packet data
+
+} replication_context_t; // 128B
+
+
+typedef struct {
+
+ u32 recycle_list_index;
+
+ // per-thread pools of replication contexts
+ replication_context_t ** contexts;
+
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+
+} replication_main_t;
+
+
+extern replication_main_t replication_main;
+
+
+// Return 1 if this buffer just came from the replication recycle handler.
+always_inline u32
+replication_is_recycled (vlib_buffer_t * b0)
+{
+ return b0->flags & VLIB_BUFFER_IS_RECYCLED;
+}
+
+// Clear the recycle flag. If buffer came from the replication recycle
+// handler, this flag must be cleared before the packet is transmitted again.
+always_inline void
+replication_clear_recycled (vlib_buffer_t * b0)
+{
+ b0->flags &= ~VLIB_BUFFER_IS_RECYCLED;
+}
+
+// Return the active replication context if this buffer has
+// been recycled, otherwise return 0. (Note that this essentially
+// restricts access to the replication context to the replication
+// feature's prep and recycle nodes.)
+always_inline replication_context_t *
+replication_get_ctx (vlib_buffer_t * b0)
+{
+ replication_main_t * rm = &replication_main;
+
+ return replication_is_recycled (b0) ?
+ pool_elt_at_index (rm->contexts[os_get_cpu_number()], b0->clone_count) :
+ 0;
+}
+
+// Prefetch the replication context for this buffer, if it exists
+always_inline void
+replication_prefetch_ctx (vlib_buffer_t * b0)
+{
+ replication_context_t *ctx = replication_get_ctx (b0);
+
+ if (ctx) {
+ CLIB_PREFETCH (ctx, (2*CLIB_CACHE_LINE_BYTES), STORE);
+ }
+}
+
+replication_context_t *
+replication_prep (vlib_main_t * vm,
+ vlib_buffer_t * b0,
+ u32 recycle_node_index,
+ u32 l2_packet);
+
+replication_context_t *
+replication_recycle (vlib_main_t * vm,
+ vlib_buffer_t * b0,
+ u32 is_last);
+
+
+#endif
diff --git a/vnet/vnet/rewrite.c b/vnet/vnet/rewrite.c
new file mode 100644
index 00000000000..6c33a2ebcd8
--- /dev/null
+++ b/vnet/vnet/rewrite.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * rewrite.c: packet rewrite
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/ip/lookup.h>
+
+void vnet_rewrite_copy_slow_path (vnet_rewrite_data_t * p0,
+ vnet_rewrite_data_t * rw0,
+ word n_left,
+ uword most_likely_size)
+{
+ uword n_done = round_pow2 (most_likely_size, sizeof (rw0[0])) / sizeof (rw0[0]);
+
+ p0 -= n_done;
+ rw0 -= n_done;
+
+  /* As we enter the cleanup loop, p0 and rw0 point to the last chunk written
+     by the fast path. Hence the constant 1, which the inline function
+     vnet_rewrite_copy_one renders as p0[-1] = rw0[-1]. */
+
+ while (n_left > 0)
+ {
+ vnet_rewrite_copy_one (p0, rw0, 1);
+ p0--;
+ rw0--;
+ n_left--;
+ }
+}
+
+u8 * format_vnet_rewrite (u8 * s, va_list * args)
+{
+ vlib_main_t * vm = va_arg (*args, vlib_main_t *);
+ vnet_rewrite_header_t * rw = va_arg (*args, vnet_rewrite_header_t *);
+ u32 max_data_bytes = va_arg (*args, u32);
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_node_t * next;
+ uword indent;
+
+ next = vlib_get_next_node (vm, rw->node_index, rw->next_index);
+
+ indent = format_get_indent (s);
+
+ if (rw->sw_if_index != ~0)
+ {
+ vnet_sw_interface_t * si;
+ si = vnet_get_sw_interface (vnm, rw->sw_if_index);
+ s = format (s, "%U", format_vnet_sw_interface_name, vnm, si);
+ }
+ else
+ s = format (s, "%v", next->name);
+
+ /* Format rewrite string. */
+ if (rw->data_bytes > 0)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ next->format_buffer ? next->format_buffer : format_hex_bytes,
+ rw->data + max_data_bytes - rw->data_bytes,
+ rw->data_bytes);
+
+ return s;
+}
+
+u8 * format_vnet_rewrite_header (u8 * s, va_list * args)
+{
+ vlib_main_t * vm = va_arg (*args, vlib_main_t *);
+ vnet_rewrite_header_t * rw = va_arg (*args, vnet_rewrite_header_t *);
+ u8 * packet_data = va_arg (*args, u8 *);
+ u32 packet_data_bytes = va_arg (*args, u32);
+ vlib_node_t * next;
+
+ next = vlib_get_next_node (vm, rw->node_index, rw->next_index);
+
+ /* Format rewrite string. */
+ s = format (s, "%U",
+ next->format_buffer ? next->format_buffer : format_hex_bytes,
+ packet_data, packet_data_bytes);
+
+ return s;
+}
+
+uword unformat_vnet_rewrite (unformat_input_t * input, va_list * args)
+{
+ vlib_main_t * vm = va_arg (*args, vlib_main_t *);
+ vnet_rewrite_header_t * rw = va_arg (*args, vnet_rewrite_header_t *);
+ u32 max_data_bytes = va_arg (*args, u32);
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_node_t * next;
+ u32 next_index, sw_if_index, max_packet_bytes, error;
+ u8 * rw_data;
+
+ rw_data = 0;
+ sw_if_index = ~0;
+ max_packet_bytes = ~0;
+ error = 1;
+
+ /* Parse sw interface. */
+ if (unformat (input, "%U",
+ unformat_vnet_sw_interface, vnm, &sw_if_index))
+ {
+ vnet_hw_interface_t * hi;
+
+ hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+
+ next_index = hi->output_node_index;
+ max_packet_bytes = hi->max_l3_packet_bytes[VLIB_RX];
+ }
+
+ else if (unformat (input, "%U",
+ unformat_vlib_node, vm, &next_index))
+ ;
+
+ else
+ goto done;
+
+ next = vlib_get_node (vm, next_index);
+
+ if (next->unformat_buffer
+ && unformat_user (input, next->unformat_buffer, &rw_data))
+ ;
+
+ else if (unformat_user (input, unformat_hex_string, &rw_data)
+ || unformat (input, "0x%U", unformat_hex_string, &rw_data))
+ ;
+
+ else
+ goto done;
+
+ /* Re-write does not fit. */
+ if (vec_len (rw_data) >= max_data_bytes)
+ goto done;
+
+ {
+ u32 tmp;
+
+ if (unformat (input, "mtu %d", &tmp)
+ && tmp < (1 << BITS (rw->max_l3_packet_bytes)))
+ max_packet_bytes = tmp;
+ }
+
+ error = 0;
+ rw->sw_if_index = sw_if_index;
+ rw->max_l3_packet_bytes = max_packet_bytes;
+ rw->next_index = vlib_node_add_next (vm, rw->node_index, next_index);
+ vnet_rewrite_set_data_internal (rw, max_data_bytes, rw_data, vec_len (rw_data));
+
+ done:
+ vec_free (rw_data);
+ return error == 0;
+}
+
+void vnet_rewrite_for_sw_interface (vnet_main_t * vnm,
+ vnet_l3_packet_type_t packet_type,
+ u32 sw_if_index,
+ u32 node_index,
+ void * dst_address,
+ vnet_rewrite_header_t * rw,
+ u32 max_rewrite_bytes)
+{
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ vnet_hw_interface_class_t * hc = vnet_get_hw_interface_class (vnm, hw->hw_class_index);
+ static u8 * rw_tmp = 0;
+ uword n_rw_tmp;
+
+ rw->sw_if_index = sw_if_index;
+ rw->node_index = node_index;
+ rw->next_index = vlib_node_add_next (vnm->vlib_main, node_index, hw->output_node_index);
+ rw->max_l3_packet_bytes = hw->max_l3_packet_bytes[VLIB_TX];
+
+ ASSERT (max_rewrite_bytes > 0);
+ vec_reset_length (rw_tmp);
+ vec_validate (rw_tmp, max_rewrite_bytes - 1);
+
+ ASSERT (hc->set_rewrite);
+ n_rw_tmp = hc->set_rewrite (vnm, sw_if_index, packet_type, dst_address, rw_tmp, max_rewrite_bytes);
+
+ ASSERT (n_rw_tmp >= 0 && n_rw_tmp < max_rewrite_bytes);
+ vnet_rewrite_set_data_internal (rw, max_rewrite_bytes, rw_tmp, n_rw_tmp);
+}
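+
+/* Example: for a PPP interface hc->set_rewrite is ppp_set_rewrite(), so
+   an ip4 adjacency ends up with the 4-byte string ff 03 00 21
+   right-justified in rw->data by vnet_rewrite_set_data_internal. */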
+
+void vnet_rewrite_for_tunnel (vnet_main_t * vnm,
+ u32 tx_sw_if_index,
+ u32 rewrite_node_index,
+ u32 post_rewrite_node_index,
+ vnet_rewrite_header_t * rw,
+ u8 *rewrite_data,
+ u32 rewrite_length)
+{
+ ip_adjacency_t * adj = 0;
+ /*
+ * Installed into vnet_buffer(b)->sw_if_index[VLIB_TX] e.g.
+ * by ip4_rewrite_inline. If the post-rewrite node injects into
+ * ipX-forward, this will be interpreted as a FIB number.
+ */
+ rw->sw_if_index = tx_sw_if_index;
+ rw->node_index = rewrite_node_index;
+ rw->next_index = vlib_node_add_next (vnm->vlib_main, rewrite_node_index,
+ post_rewrite_node_index);
+ rw->max_l3_packet_bytes = (u16) ~0; /* we can't know at this point */
+
+ ASSERT (rewrite_length < sizeof (adj->rewrite_data));
+ /* Leave room for ethernet + VLAN tag */
+ vnet_rewrite_set_data_internal (rw, sizeof(adj->rewrite_data),
+ rewrite_data, rewrite_length);
+}
+
+void serialize_vnet_rewrite (serialize_main_t * m, va_list * va)
+{
+ vnet_rewrite_header_t * rw = va_arg (*va, vnet_rewrite_header_t *);
+ u32 max_data_bytes = va_arg (*va, u32);
+ u8 * p;
+
+ serialize_integer (m, rw->sw_if_index, sizeof (rw->sw_if_index));
+ serialize_integer (m, rw->data_bytes, sizeof (rw->data_bytes));
+ serialize_integer (m, rw->max_l3_packet_bytes, sizeof (rw->max_l3_packet_bytes));
+ p = serialize_get (m, rw->data_bytes);
+ memcpy (p, vnet_rewrite_get_data_internal (rw, max_data_bytes), rw->data_bytes);
+}
+
+void unserialize_vnet_rewrite (serialize_main_t * m, va_list * va)
+{
+ vnet_rewrite_header_t * rw = va_arg (*va, vnet_rewrite_header_t *);
+ u32 max_data_bytes = va_arg (*va, u32);
+ u8 * p;
+
+ /* It is up to user to fill these in. */
+ rw->node_index = ~0;
+ rw->next_index = ~0;
+
+ unserialize_integer (m, &rw->sw_if_index, sizeof (rw->sw_if_index));
+ unserialize_integer (m, &rw->data_bytes, sizeof (rw->data_bytes));
+ unserialize_integer (m, &rw->max_l3_packet_bytes, sizeof (rw->max_l3_packet_bytes));
+ p = unserialize_get (m, rw->data_bytes);
+ memcpy (vnet_rewrite_get_data_internal (rw, max_data_bytes), p, rw->data_bytes);
+}
diff --git a/vnet/vnet/rewrite.h b/vnet/vnet/rewrite.h
new file mode 100644
index 00000000000..2e5b7f43948
--- /dev/null
+++ b/vnet/vnet/rewrite.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * rewrite.h: packet rewrite
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_rewrite_h
+#define included_vnet_rewrite_h
+
+#include <vlib/vlib.h>
+#include <vnet/l3_types.h>
+
+/* Consider using vector types for speed? */
+typedef uword vnet_rewrite_data_t;
+
+typedef CLIB_PACKED (struct {
+ /* Interface to mark re-written packets with. */
+ u32 sw_if_index;
+
+ /* Packet processing node where rewrite happens. */
+ u32 node_index;
+
+ /* Next node to feed after packet rewrite is done. */
+ u16 next_index;
+
+ /* Number of bytes in rewrite data. */
+ u16 data_bytes;
+
+ /* Max packet size layer 3 (MTU) for output interface.
+ Used for MTU check after packet rewrite. */
+ u16 max_l3_packet_bytes;
+
+ /* Rewrite string starting at end and going backwards. */
+ u8 data[0];
+}) vnet_rewrite_header_t;
+
+/*
+ Helper macro for declaring rewrite string w/ given max-size.
+
+ Typical usage:
+ typedef struct {
+ // User data.
+ int a, b;
+
+ // Total adjacency is 64 bytes.
+    vnet_declare_rewrite (64 - 2*sizeof(int)) rw;
+ } my_adjacency_t;
+*/
+#define vnet_declare_rewrite(total_bytes) \
+struct { \
+ vnet_rewrite_header_t rewrite_header; \
+ \
+ u8 rewrite_data[(total_bytes) - sizeof (vnet_rewrite_header_t)]; \
+}
+
+always_inline void
+vnet_rewrite_set_data_internal (vnet_rewrite_header_t * rw,
+ int max_size,
+ void * data,
+ int data_bytes)
+{
+ /* Sanity check values carefully for this memset operation*/
+ ASSERT ((max_size > 0) && (max_size < VLIB_BUFFER_PRE_DATA_SIZE));
+ ASSERT ((data_bytes >= 0) && (data_bytes < max_size));
+
+ rw->data_bytes = data_bytes;
+ memcpy (rw->data + max_size - data_bytes, data, data_bytes);
+ memset (rw->data, 0xfe, max_size - data_bytes);
+}
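+
+/* Layout illustration, assuming max_size == 8 and the 4-byte PPP rewrite
+   ff 03 00 21: rw->data holds fe fe fe fe ff 03 00 21, i.e. the rewrite
+   string is right-justified and the unused prefix is poisoned with 0xfe
+   (caught by the 0xfefe ASSERTs in the header-copy routines below). */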
+
+#define vnet_rewrite_set_data(rw,data,data_bytes) \
+ vnet_rewrite_set_data_internal (&((rw).rewrite_header), \
+ sizeof ((rw).rewrite_data), \
+ (data), \
+ (data_bytes))
+
+always_inline void *
+vnet_rewrite_get_data_internal (vnet_rewrite_header_t * rw, int max_size)
+{
+ ASSERT (rw->data_bytes <= max_size);
+ return rw->data + max_size - rw->data_bytes;
+}
+
+#define vnet_rewrite_get_data(rw) \
+ vnet_rewrite_get_data_internal (&((rw).rewrite_header), sizeof ((rw).rewrite_data))
+
+always_inline void
+vnet_rewrite_copy_one (vnet_rewrite_data_t * p0, vnet_rewrite_data_t * rw0, int i)
+{
+ p0[-i] = rw0[-i];
+}
+
+void vnet_rewrite_copy_slow_path (vnet_rewrite_data_t * p0,
+ vnet_rewrite_data_t * rw0,
+ word n_left,
+ uword most_likely_size);
+
+typedef CLIB_PACKED (struct {
+ u64 a;
+ u32 b;
+ u16 c;
+}) eh_copy_t;
+
+always_inline void
+_vnet_rewrite_one_header (vnet_rewrite_header_t * h0,
+ void * packet0,
+ int max_size,
+ int most_likely_size)
+{
+ vnet_rewrite_data_t * p0 = packet0;
+ vnet_rewrite_data_t * rw0 = (vnet_rewrite_data_t *) (h0->data + max_size);
+ word n_left0;
+
+ /* 0xfefe => poisoned adjacency => crash */
+ ASSERT (h0->data_bytes != 0xfefe);
+
+ if (PREDICT_TRUE (h0->data_bytes == sizeof (eh_copy_t)))
+ {
+ eh_copy_t * s, * d;
+ s = (eh_copy_t *)(h0->data + max_size - sizeof (eh_copy_t));
+ d = (eh_copy_t *)(((u8 *)packet0) - sizeof (eh_copy_t));
+ __builtin_memcpy (d, s, sizeof (eh_copy_t));
+ return;
+ }
+
+
+#define _(i) \
+ do { \
+ if (most_likely_size > ((i)-1)*sizeof (vnet_rewrite_data_t)) \
+ vnet_rewrite_copy_one (p0, rw0, (i)); \
+ } while (0)
+
+ _ (4);
+ _ (3);
+ _ (2);
+ _ (1);
+
+#undef _
+
+ n_left0 = (int)
+ (((int) h0->data_bytes - most_likely_size) + (sizeof(rw0[0])-1))
+ / (int) sizeof (rw0[0]);
+ if (PREDICT_FALSE (n_left0 > 0))
+ vnet_rewrite_copy_slow_path (p0, rw0, n_left0, most_likely_size);
+}
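+
+/* Example, assuming a 64-bit host (8-byte vnet_rewrite_data_t): with
+   most_likely_size == 14 the unrolled copies above move 2 words (16
+   bytes) speculatively; an 18-byte rewrite (e.g. ethernet + VLAN tag)
+   then leaves n_left0 == 1 word for vnet_rewrite_copy_slow_path. */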
+
+always_inline void
+_vnet_rewrite_two_headers (vnet_rewrite_header_t * h0,
+ vnet_rewrite_header_t * h1,
+ void * packet0,
+ void * packet1,
+ int max_size,
+ int most_likely_size)
+{
+ vnet_rewrite_data_t * p0 = packet0;
+ vnet_rewrite_data_t * p1 = packet1;
+ vnet_rewrite_data_t * rw0 = (vnet_rewrite_data_t *) (h0->data + max_size);
+ vnet_rewrite_data_t * rw1 = (vnet_rewrite_data_t *) (h1->data + max_size);
+ word n_left0, n_left1;
+ int slow_path;
+
+ /* 0xfefe => poisoned adjacency => crash */
+ ASSERT (h0->data_bytes != 0xfefe);
+ ASSERT (h1->data_bytes != 0xfefe);
+
+  /* slow_path is zero iff h0->data_bytes == h1->data_bytes ==
+     sizeof (eh_copy_t) == 14, i.e. both rewrites are plain ethernet headers. */
+ slow_path = h0->data_bytes ^ h1->data_bytes;
+ slow_path += h0->data_bytes ^ sizeof (eh_copy_t);
+
+ if (PREDICT_TRUE (slow_path == 0))
+ {
+ eh_copy_t * s0, * d0, * s1, * d1;
+ s0 = (eh_copy_t *)(h0->data + max_size - sizeof (eh_copy_t));
+ d0 = (eh_copy_t *)(((u8 *)packet0) - sizeof (eh_copy_t));
+ __builtin_memcpy (d0, s0, sizeof (eh_copy_t));
+ s1 = (eh_copy_t *)(h1->data + max_size - sizeof (eh_copy_t));
+ d1 = (eh_copy_t *)(((u8 *)packet1) - sizeof (eh_copy_t));
+ __builtin_memcpy (d1, s1, sizeof (eh_copy_t));
+ return;
+ }
+
+#define _(i) \
+ do { \
+ if (most_likely_size > ((i)-1)*sizeof (vnet_rewrite_data_t)) \
+ { \
+ vnet_rewrite_copy_one (p0, rw0, (i)); \
+ vnet_rewrite_copy_one (p1, rw1, (i)); \
+ } \
+ } while (0)
+
+ _ (4);
+ _ (3);
+ _ (2);
+ _ (1);
+
+#undef _
+
+ n_left0 = (int)
+ (((int) h0->data_bytes - most_likely_size) + (sizeof(rw0[0])-1))
+ / (int) sizeof (rw0[0]);
+ n_left1 = (int)
+ (((int) h1->data_bytes - most_likely_size) + (sizeof(rw1[0])-1))
+ / (int) sizeof (rw1[0]);
+
+ if (PREDICT_FALSE (n_left0 > 0 || n_left1 > 0))
+ {
+ vnet_rewrite_copy_slow_path (p0, rw0, n_left0, most_likely_size);
+ vnet_rewrite_copy_slow_path (p1, rw1, n_left1, most_likely_size);
+ }
+}
+
+#define vnet_rewrite_one_header(rw0,p0,most_likely_size) \
+ _vnet_rewrite_one_header (&((rw0).rewrite_header), (p0), \
+ sizeof ((rw0).rewrite_data), \
+ (most_likely_size))
+
+#define vnet_rewrite_two_headers(rw0,rw1,p0,p1,most_likely_size) \
+ _vnet_rewrite_two_headers (&((rw0).rewrite_header), &((rw1).rewrite_header), \
+ (p0), (p1), \
+ sizeof ((rw0).rewrite_data), \
+ (most_likely_size))
+
+#define VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST ((void *) 0)
+void vnet_rewrite_for_sw_interface (struct vnet_main_t * vnm,
+ vnet_l3_packet_type_t packet_type,
+ u32 sw_if_index,
+ u32 node_index,
+ void * dst_address,
+ vnet_rewrite_header_t * rw,
+ u32 max_rewrite_bytes);
+
+void vnet_rewrite_for_tunnel (struct vnet_main_t * vnm,
+ u32 tx_sw_if_index,
+ u32 rewrite_node_index,
+ u32 post_rewrite_node_index,
+ vnet_rewrite_header_t * rw,
+ u8 *rewrite_data,
+ u32 rewrite_length);
+
+/* Parser for unformat header & rewrite string. */
+unformat_function_t unformat_vnet_rewrite;
+
+format_function_t format_vnet_rewrite;
+format_function_t format_vnet_rewrite_header;
+
+serialize_function_t serialize_vnet_rewrite, unserialize_vnet_rewrite;
+
+#endif /* included_vnet_rewrite_h */
diff --git a/vnet/vnet/snap/node.c b/vnet/vnet/snap/node.c
new file mode 100644
index 00000000000..83d373a22de
--- /dev/null
+++ b/vnet/vnet/snap/node.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * snap_node.c: snap packet processing
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/llc/llc.h>
+#include <vnet/snap/snap.h>
+
+typedef enum {
+ SNAP_INPUT_NEXT_DROP,
+ SNAP_INPUT_NEXT_PUNT,
+ SNAP_INPUT_NEXT_ETHERNET_TYPE,
+ SNAP_INPUT_N_NEXT,
+} snap_input_next_t;
+
+typedef struct {
+ u8 packet_data[32];
+} snap_input_trace_t;
+
+static u8 * format_snap_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ snap_input_trace_t * t = va_arg (*va, snap_input_trace_t *);
+
+ s = format (s, "%U", format_snap_header, t->packet_data);
+
+ return s;
+}
+
+static uword
+snap_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ snap_main_t * sm = &snap_main;
+ u32 n_left_from, next_index, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (snap_input_trace_t));
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ snap_header_t * h0, * h1;
+ snap_protocol_info_t * pi0, * pi1;
+ u8 next0, next1, is_ethernet0, is_ethernet1, len0, len1, enqueue_code;
+ u32 oui0, oui1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, LOAD);
+ vlib_prefetch_buffer_header (b3, LOAD);
+
+ CLIB_PREFETCH (b2->data, sizeof (h0[0]), LOAD);
+ CLIB_PREFETCH (b3->data, sizeof (h1[0]), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ h0 = (void *) (b0->data + b0->current_data);
+ h1 = (void *) (b1->data + b1->current_data);
+
+ oui0 = snap_header_get_oui (h0);
+ oui1 = snap_header_get_oui (h1);
+
+ is_ethernet0 = oui0 == IEEE_OUI_ethernet;
+ is_ethernet1 = oui1 == IEEE_OUI_ethernet;
+
+ len0 = sizeof (h0[0]) - (is_ethernet0 ? sizeof (h0->protocol) : 0);
+ len1 = sizeof (h1[0]) - (is_ethernet1 ? sizeof (h1->protocol) : 0);
+
+ b0->current_data += len0;
+ b1->current_data += len1;
+
+ b0->current_length -= len0;
+ b1->current_length -= len1;
+
+ pi0 = snap_get_protocol_info (sm, h0);
+ pi1 = snap_get_protocol_info (sm, h1);
+
+ next0 = pi0 ? pi0->next_index : SNAP_INPUT_NEXT_DROP;
+ next1 = pi1 ? pi1->next_index : SNAP_INPUT_NEXT_DROP;
+
+ next0 = is_ethernet0 ? SNAP_INPUT_NEXT_ETHERNET_TYPE : next0;
+ next1 = is_ethernet1 ? SNAP_INPUT_NEXT_ETHERNET_TYPE : next1;
+
+ /* In case of error. */
+ b0->error = node->errors[SNAP_ERROR_UNKNOWN_PROTOCOL];
+ b1->error = node->errors[SNAP_ERROR_UNKNOWN_PROTOCOL];
+
+ enqueue_code = (next0 != next_index) + 2*(next1 != next_index);
+
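+	  /* enqueue_code: bit 0 is set if b0 wants a different next node
+	     than the current frame, bit 1 if b1 does; 0 means the
+	     speculative double enqueue above was correct. */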
+ if (PREDICT_FALSE (enqueue_code != 0))
+ {
+ switch (enqueue_code)
+ {
+ case 1:
+ /* A B A */
+ to_next[-2] = bi1;
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ break;
+
+ case 2:
+ /* A A B */
+ to_next -= 1;
+ n_left_to_next += 1;
+ vlib_set_next_frame_buffer (vm, node, next1, bi1);
+ break;
+
+ case 3:
+ /* A B B or A B C */
+ to_next -= 2;
+ n_left_to_next += 2;
+ vlib_set_next_frame_buffer (vm, node, next0, bi0);
+ vlib_set_next_frame_buffer (vm, node, next1, bi1);
+ if (next0 == next1)
+ {
+ vlib_put_next_frame (vm, node, next_index,
+ n_left_to_next);
+ next_index = next1;
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+ }
+ }
+ }
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ snap_header_t * h0;
+ snap_protocol_info_t * pi0;
+ u8 next0, is_ethernet0, len0;
+ u32 oui0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ h0 = (void *) (b0->data + b0->current_data);
+
+ oui0 = snap_header_get_oui (h0);
+
+ is_ethernet0 = oui0 == IEEE_OUI_ethernet;
+
+ len0 = sizeof (h0[0]) - (is_ethernet0 ? sizeof (h0->protocol) : 0);
+
+ b0->current_data += len0;
+
+ b0->current_length -= len0;
+
+ pi0 = snap_get_protocol_info (sm, h0);
+
+ next0 = pi0 ? pi0->next_index : SNAP_INPUT_NEXT_DROP;
+
+ next0 = is_ethernet0 ? SNAP_INPUT_NEXT_ETHERNET_TYPE : next0;
+
+ /* In case of error. */
+ b0->error = node->errors[SNAP_ERROR_UNKNOWN_PROTOCOL];
+
+ /* Sent packet to wrong next? */
+ if (PREDICT_FALSE (next0 != next_index))
+ {
+ /* Return old frame; remove incorrectly enqueued packet. */
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);
+
+ /* Send to correct next. */
+ next_index = next0;
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ to_next[0] = bi0;
+ to_next += 1;
+ n_left_to_next -= 1;
+ }
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static char * snap_error_strings[] = {
+#define _(f,s) s,
+ foreach_snap_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (snap_input_node) = {
+ .function = snap_input,
+ .name = "snap-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = SNAP_N_ERROR,
+ .error_strings = snap_error_strings,
+
+ .n_next_nodes = SNAP_INPUT_N_NEXT,
+ .next_nodes = {
+ [SNAP_INPUT_NEXT_DROP] = "error-drop",
+ [SNAP_INPUT_NEXT_PUNT] = "error-punt",
+ [SNAP_INPUT_NEXT_ETHERNET_TYPE] = "ethernet-input-type",
+ },
+
+ .format_buffer = format_snap_header_with_length,
+ .format_trace = format_snap_input_trace,
+ .unformat_buffer = unformat_snap_header,
+};
+
+static clib_error_t * snap_input_init (vlib_main_t * vm)
+{
+ {
+ clib_error_t * error = vlib_call_init_function (vm, snap_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ snap_setup_node (vm, snap_input_node.index);
+
+ llc_register_input_protocol (vm, LLC_PROTOCOL_snap, snap_input_node.index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (snap_input_init);
+
+void
+snap_register_input_protocol (vlib_main_t * vm,
+ char * name,
+ u32 ieee_oui,
+ u16 protocol,
+ u32 node_index)
+{
+ snap_main_t * sm = &snap_main;
+ snap_protocol_info_t * pi;
+ snap_header_t h;
+ snap_oui_and_protocol_t key;
+
+ {
+ clib_error_t * error = vlib_call_init_function (vm, snap_input_init);
+ if (error)
+ clib_error_report (error);
+ }
+
+ h.protocol = clib_host_to_net_u16 (protocol);
+ h.oui[0] = (ieee_oui >> 16) & 0xff;
+ h.oui[1] = (ieee_oui >> 8) & 0xff;
+ h.oui[2] = (ieee_oui >> 0) & 0xff;
+ pi = snap_get_protocol_info (sm, &h);
+ if (pi)
+ return;
+
+ vec_add2 (sm->protocols, pi, 1);
+
+ pi->name = format (0, "%s", name);
+ pi->node_index = node_index;
+ pi->next_index = vlib_node_add_next (vm,
+ snap_input_node.index,
+ node_index);
+
+ key.oui = ieee_oui;
+ key.protocol = clib_host_to_net_u16 (protocol);
+
+ mhash_set (&sm->protocol_hash, &key, pi - sm->protocols, /* old_value */ 0);
+ hash_set_mem (sm->protocol_info_by_name, name, pi - sm->protocols);
+}
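+
+/* Hypothetical usage sketch (my_node is illustrative): a graph node that
+   handles a Cisco SNAP protocol, e.g. CDP (OUI 0x00000c, protocol
+   0x2000), would register itself with
+
+     snap_register_input_protocol (vm, "cdp", IEEE_OUI_cisco, 0x2000,
+                                   my_node.index);
+
+   after which snap-input dispatches matching packets directly to it. */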
diff --git a/vnet/vnet/snap/pg.c b/vnet/vnet/snap/pg.c
new file mode 100644
index 00000000000..74c363fcc69
--- /dev/null
+++ b/vnet/vnet/snap/pg.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * snap_pg.c: packet generator snap interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/snap/snap.h>
+
+typedef struct {
+ pg_edit_t oui;
+ pg_edit_t protocol;
+} pg_snap_header_t;
+
+static inline void
+pg_snap_header_init (pg_snap_header_t * e)
+{
+ pg_edit_init (&e->oui, snap_header_t, oui);
+ pg_edit_init (&e->protocol, snap_header_t, protocol);
+}
+
+uword
+unformat_pg_snap_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_snap_header_t * h;
+ u32 group_index, error;
+
+ h = pg_create_edit_group (s, sizeof (h[0]), sizeof (snap_header_t),
+ &group_index);
+ pg_snap_header_init (h);
+
+ error = 1;
+ if (! unformat (input, "%U -> %U",
+ unformat_pg_edit,
+ unformat_snap_protocol, &h->oui, &h->protocol))
+ goto done;
+
+ {
+ snap_main_t * pm = &snap_main;
+ snap_protocol_info_t * pi = 0;
+ pg_node_t * pg_node = 0;
+
+ if (h->oui.type == PG_EDIT_FIXED
+ && h->protocol.type == PG_EDIT_FIXED)
+ {
+      u8 * o = h->oui.values[PG_EDIT_LO];
+      u8 * p = h->protocol.values[PG_EDIT_LO];
+      snap_header_t fixed;	/* header assembled from the fixed edit values */
+
+      fixed.oui[0] = o[0];
+      fixed.oui[1] = o[1];
+      fixed.oui[2] = o[2];
+      fixed.protocol = *(u16 *) p;
+      pi = snap_get_protocol_info (pm, &fixed);
+      if (pi && pi->node_index != ~0)
+        pg_node = pg_get_node (pi->node_index);
+ }
+
+    /* If the registered node knows how to parse the rest of the
+       stream, let it do so; otherwise fall back to a generic
+       payload edit. */
+    if (pg_node && pg_node->unformat_edit
+        && unformat_user (input, pg_node->unformat_edit, s))
+      ;
+    else if (! unformat_user (input, unformat_pg_payload, s))
+      goto done;
+ }
+
+ error = 0;
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/snap/snap.c b/vnet/vnet/snap/snap.c
new file mode 100644
index 00000000000..df15f8614c6
--- /dev/null
+++ b/vnet/vnet/snap/snap.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * snap.c: snap support
+ *
+ * Copyright (c) 2010 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/snap/snap.h>
+#include <vnet/ethernet/ethernet.h>
+
+/* Global main structure. */
+snap_main_t snap_main;
+
+static u8 * format_cisco_snap_protocol (u8 * s, va_list * args)
+{
+ snap_header_t * h = va_arg (*args, snap_header_t *);
+ u16 protocol = clib_net_to_host_u16 (h->protocol);
+ char * t = 0;
+ switch (protocol)
+ {
+#define _(n,f) case n: t = #f; break;
+ foreach_snap_cisco_protocol;
+#undef _
+ default: break;
+ }
+ if (t)
+ return format (s, "%s", t);
+ else
+ return format (s, "unknown 0x%x", protocol);
+}
+
+u8 * format_snap_protocol (u8 * s, va_list * args)
+{
+ snap_header_t * h = va_arg (*args, snap_header_t *);
+ u32 oui = snap_header_get_oui (h);
+ u16 protocol = clib_net_to_host_u16 (h->protocol);
+
+ switch (oui)
+ {
+ case IEEE_OUI_ethernet:
+ return format (s, "ethernet %U", format_ethernet_type, h->protocol);
+
+ case IEEE_OUI_cisco:
+ return format (s, "cisco %U", format_cisco_snap_protocol, h);
+
+ default:
+ return format (s, "oui 0x%06x 0x%04x", oui, protocol);
+ }
+}
+
+u8 * format_snap_header_with_length (u8 * s, va_list * args)
+{
+ snap_main_t * sm = &snap_main;
+ snap_header_t * h = va_arg (*args, snap_header_t *);
+ snap_protocol_info_t * pi = snap_get_protocol_info (sm, h);
+ u32 max_header_bytes = va_arg (*args, u32);
+ uword indent, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "snap header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "SNAP %U", format_snap_protocol, h);
+
+  /* Format protocol-specific payload only when more bytes are available. */
+  if (max_header_bytes != 0 && header_bytes < max_header_bytes && pi != 0)
+ {
+ vlib_node_t * node = vlib_get_node (sm->vlib_main, pi->node_index);
+ if (node->format_buffer)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ node->format_buffer, (void *) (h + 1),
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+u8 * format_snap_header (u8 * s, va_list * args)
+{
+ snap_header_t * h = va_arg (*args, snap_header_t *);
+ return format (s, "%U", format_snap_header_with_length, h, 0);
+}
+
+/* Parse an OUI/protocol pair, given either numerically
+   (e.g. "0x00000c 0x2000") or as a registered protocol name, and
+   write it into the supplied snap header. */
+uword
+unformat_snap_protocol (unformat_input_t * input, va_list * args)
+{
+ snap_header_t * result = va_arg (*args, snap_header_t *);
+ snap_main_t * sm = &snap_main;
+ snap_oui_and_protocol_t p;
+ u32 i;
+
+ /* Numeric type. */
+ if (unformat (input, "0x%x 0x%x", &p.oui, &p.protocol))
+ {
+ if (p.oui >= (1 << 24))
+ return 0;
+ if (p.protocol >= (1 << 16))
+ return 0;
+ }
+
+ /* Named type. */
+ else if (unformat_user (input, unformat_vlib_number_by_name,
+ sm->protocol_info_by_name, &i))
+ {
+ snap_protocol_info_t * pi = vec_elt_at_index (sm->protocols, i);
+ p = pi->oui_and_protocol;
+ }
+
+ else
+ return 0;
+
+ snap_header_set_protocol (result, &p);
+ return 1;
+}
+
+uword
+unformat_snap_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ snap_header_t _h, * h = &_h;
+
+ if (! unformat (input, "%U", unformat_snap_protocol, h))
+ return 0;
+
+ /* Add header to result. */
+ {
+ void * p;
+ u32 n_bytes = sizeof (h[0]);
+
+ vec_add2 (*result, p, n_bytes);
+ memcpy (p, h, n_bytes);
+ }
+
+ return 1;
+}
+
+static clib_error_t * snap_init (vlib_main_t * vm)
+{
+ snap_main_t * sm = &snap_main;
+
+ memset (sm, 0, sizeof (sm[0]));
+ sm->vlib_main = vm;
+
+ mhash_init (&sm->protocol_hash, sizeof (uword), sizeof (snap_oui_and_protocol_t));
+
+ sm->protocol_info_by_name
+ = hash_create_string (/* elts */ 0, sizeof (uword));
+
+ return vlib_call_init_function (vm, snap_input_init);
+}
+
+VLIB_INIT_FUNCTION (snap_init);
+
diff --git a/vnet/vnet/snap/snap.h b/vnet/vnet/snap/snap.h
new file mode 100644
index 00000000000..ed4a10e207e
--- /dev/null
+++ b/vnet/vnet/snap/snap.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * snap.h: SNAP definitions
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_snap_h
+#define included_snap_h
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+
+#define foreach_ieee_oui \
+ _ (0x000000, ethernet) \
+ _ (0x00000c, cisco)
+
+typedef enum {
+#define _(n,f) IEEE_OUI_##f = n,
+ foreach_ieee_oui
+#undef _
+} ieee_oui_t;
+
+#define foreach_snap_cisco_protocol \
+ _ (0x0102, drip) \
+ _ (0x0104, port_aggregation_protocol) \
+ _ (0x0105, mls_hello) \
+ _ (0x010b, per_vlan_spanning_tree) \
+ _ (0x010c, vlan_bridge) \
+ _ (0x0111, unidirectional_link_detection) \
+ _ (0x2000, cdp) \
+ _ (0x2001, cgmp) \
+ _ (0x2003, vtp) \
+ _ (0x2004, dtp) \
+ _ (0x200a, stp_uplink_fast)
+
+typedef enum {
+#define _(n,f) SNAP_cisco_##f = n,
+ foreach_snap_cisco_protocol
+#undef _
+} snap_cisco_protocol_t;
+
+typedef union {
+ CLIB_PACKED (struct {
+ /* OUI: organizationally unique identifier. */
+ u8 oui[3];
+
+ /* Per-OUI protocol. */
+ u16 protocol;
+ });
+
+ u8 as_u8[5];
+} snap_header_t;
+
+typedef struct {
+ u32 oui;
+ u32 protocol;
+} snap_oui_and_protocol_t;
+
+typedef struct {
+ /* Name vector string. */
+ u8 * name;
+
+ snap_oui_and_protocol_t oui_and_protocol;
+
+ /* Node which handles this type. */
+ u32 node_index;
+
+ /* snap-input next index for this type. */
+ u32 next_index;
+} snap_protocol_info_t;
+
+always_inline void
+snap_header_set_protocol (snap_header_t * h, snap_oui_and_protocol_t * p)
+{
+ u16 protocol = p->protocol;
+ u32 oui = p->oui;
+ h->protocol = clib_host_to_net_u16 (protocol);
+ h->oui[0] = (oui >> 16) & 0xff;
+ h->oui[1] = (oui >> 8) & 0xff;
+ h->oui[2] = (oui >> 0) & 0xff;
+}
+
+#define foreach_snap_error \
+ _ (NONE, "no error") \
+ _ (UNKNOWN_PROTOCOL, "unknown oui/snap protocol")
+
+typedef enum {
+#define _(f,s) SNAP_ERROR_##f,
+ foreach_snap_error
+#undef _
+ SNAP_N_ERROR,
+} snap_error_t;
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ /* Vector of known SNAP oui/protocol pairs. */
+ snap_protocol_info_t * protocols;
+
+ /* Hash table mapping oui/protocol to protocol index. */
+ mhash_t protocol_hash;
+
+ /* Hash table mapping protocol name to protocol index. */
+ uword * protocol_info_by_name;
+} snap_main_t;
+
+always_inline u32
+snap_header_get_oui (snap_header_t * h)
+{
+ return (h->oui[0] << 16) | (h->oui[1] << 8) | h->oui[2];
+}
+
+always_inline snap_protocol_info_t *
+snap_get_protocol_info (snap_main_t * sm, snap_header_t * h)
+{
+ snap_oui_and_protocol_t key;
+ uword * p;
+
+ key.oui = snap_header_get_oui (h);
+ key.protocol = h->protocol;
+
+ p = mhash_get (&sm->protocol_hash, &key);
+ return p ? vec_elt_at_index (sm->protocols, p[0]) : 0;
+}
+
+extern snap_main_t snap_main;
+
+/* Register given node index to take input for given snap type. */
+void
+snap_register_input_protocol (vlib_main_t * vm,
+ char * name,
+ u32 ieee_oui,
+ u16 protocol,
+ u32 node_index);
+
+void snap_set_adjacency (vnet_rewrite_header_t * rw,
+ uword max_data_bytes,
+ u32 ieee_oui,
+ u16 protocol);
+
+format_function_t format_snap_protocol;
+format_function_t format_snap_header;
+format_function_t format_snap_header_with_length;
+
+/* Parse snap protocol as 0xXXXX or protocol name. */
+unformat_function_t unformat_snap_protocol;
+
+/* Parse snap header. */
+unformat_function_t unformat_snap_header;
+unformat_function_t unformat_pg_snap_header;
+
+always_inline void
+snap_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+
+ n->format_buffer = format_snap_header_with_length;
+ n->unformat_buffer = unformat_snap_header;
+ pn->unformat_edit = unformat_pg_snap_header;
+}
+
+#endif /* included_snap_h */
diff --git a/vnet/vnet/sr/rfc_draft_05.txt b/vnet/vnet/sr/rfc_draft_05.txt
new file mode 100644
index 00000000000..bc41c181ea4
--- /dev/null
+++ b/vnet/vnet/sr/rfc_draft_05.txt
@@ -0,0 +1,1265 @@
+Network Working Group S. Previdi, Ed.
+Internet-Draft C. Filsfils
+Intended status: Standards Track Cisco Systems, Inc.
+Expires: June 12, 2015 B. Field
+ Comcast
+ I. Leung
+ Rogers Communications
+ December 9, 2014
+
+
+ IPv6 Segment Routing Header (SRH)
+ draft-previdi-6man-segment-routing-header-05
+
+Abstract
+
+ Segment Routing (SR) allows a node to steer a packet through a
+ controlled set of instructions, called segments, by prepending an SR
+ header to the packet.  A segment can represent any instruction,
+ topological or service-based.  SR allows a flow to be enforced
+ through any path (topological, or application/service based) while
+ maintaining per-flow state only at the ingress node of the SR domain.
+
+ Segment Routing can be applied to the IPv6 data plane with the
+ addition of a new type of Routing Extension Header.  This draft
+ describes the Segment Routing Extension Header type and how it is
+ used by SR-capable nodes.
+
+Requirements Language
+
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+ "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+ document are to be interpreted as described in RFC 2119 [RFC2119].
+
+Status of This Memo
+
+ This Internet-Draft is submitted in full conformance with the
+ provisions of BCP 78 and BCP 79.
+
+ Internet-Drafts are working documents of the Internet Engineering
+ Task Force (IETF). Note that other groups may also distribute
+ working documents as Internet-Drafts. The list of current Internet-
+ Drafts is at http://datatracker.ietf.org/drafts/current/.
+
+ Internet-Drafts are draft documents valid for a maximum of six months
+ and may be updated, replaced, or obsoleted by other documents at any
+ time. It is inappropriate to use Internet-Drafts as reference
+ material or to cite them other than as "work in progress."
+
+
+ This Internet-Draft will expire on June 12, 2015.
+
+Copyright Notice
+
+ Copyright (c) 2014 IETF Trust and the persons identified as the
+ document authors. All rights reserved.
+
+ This document is subject to BCP 78 and the IETF Trust's Legal
+ Provisions Relating to IETF Documents
+ (http://trustee.ietf.org/license-info) in effect on the date of
+ publication of this document. Please review these documents
+ carefully, as they describe your rights and restrictions with respect
+ to this document. Code Components extracted from this document must
+ include Simplified BSD License text as described in Section 4.e of
+ the Trust Legal Provisions and are provided without warranty as
+ described in the Simplified BSD License.
+
+Table of Contents
+
+ 1.  Structure of this document
+ 2.  Segment Routing Documents
+ 3.  Introduction
+   3.1.  Data Planes supporting Segment Routing
+   3.2.  Illustration
+ 4.  Abstract Routing Model
+   4.1.  Segment Routing Global Block (SRGB)
+   4.2.  Traffic Engineering with SR
+   4.3.  Segment Routing Database
+ 5.  IPv6 Instantiation of Segment Routing
+   5.1.  Segment Identifiers (SIDs) and SRGB
+     5.1.1.  Node-SID
+     5.1.2.  Adjacency-SID
+   5.2.  Segment Routing Extension Header (SRH)
+     5.2.1.  SRH and RFC2460 behavior
+ 6.  SRH Procedures
+   6.1.  Segment Routing Operations
+   6.2.  Segment Routing Node Functions
+     6.2.1.  Ingress SR Node
+     6.2.2.  Transit Non-SR Capable Node
+     6.2.3.  SR Intra Segment Transit Node
+     6.2.4.  SR Segment Endpoint Node
+   6.3.  FRR Flag Settings
+ 7.  SR and Tunneling
+ 8.  Example Use Case
+ 9.  IANA Considerations
+ 10. Manageability Considerations
+ 11. Security Considerations
+ 12. Contributors
+ 13. Acknowledgements
+ 14. References
+   14.1. Normative References
+   14.2. Informative References
+ Authors' Addresses
+
+1. Structure of this document
+
+ Section 3 gives an introduction on SR for IPv6 networks.
+
+ Section 4 describes the Segment Routing abstract model.
+
+ Section 5 defines the Segment Routing Header (SRH) allowing
+ instantiation of SR over IPv6 dataplane.
+
+ Section 6 details the procedures of the Segment Routing Header.
+
+2. Segment Routing Documents
+
+ Segment Routing terminology is defined in
+ [I-D.filsfils-spring-segment-routing].
+
+ Segment Routing use cases are described in
+ [I-D.filsfils-spring-segment-routing-use-cases].
+
+ Segment Routing IPv6 use cases are described in
+ [I-D.ietf-spring-ipv6-use-cases].
+
+ Segment Routing protocol extensions are defined in
+ [I-D.ietf-isis-segment-routing-extensions], and
+ [I-D.psenak-ospf-segment-routing-ospfv3-extension].
+
+ The security mechanisms of the Segment Routing Header (SRH) are
+ described in [I-D.vyncke-6man-segment-routing-security].
+
+3. Introduction
+
+ Segment Routing (SR), defined in
+ [I-D.filsfils-spring-segment-routing], allows a node to steer a
+ packet through a controlled set of instructions, called segments, by
+ prepending an SR header to the packet.  A segment can represent any
+ instruction, topological or service-based.  SR allows a flow to be
+ enforced through any path (topological or service/application based)
+ while maintaining per-flow state only at the ingress node of the SR
+ domain.  Segments can be derived from different components: IGP, BGP,
+ services, contexts, locators, etc.  The list of segments forming the
+ path is called the Segment List and is encoded in the packet header.
+
+ SR allows the use of strict and loose source-based routing paradigms
+ without requiring any additional signaling protocols in the
+ infrastructure, hence delivering excellent scalability.
+
+ The source-based routing model described in
+ [I-D.filsfils-spring-segment-routing] is inherited from the ones
+ proposed by [RFC1940] and [RFC2460].  The source-based routing model
+ offers support for explicit routing capability.
+
+3.1. Data Planes supporting Segment Routing
+
+ Segment Routing (SR) can be instantiated over MPLS
+ ([I-D.filsfils-spring-segment-routing-mpls]) and IPv6.  This document
+ defines its instantiation over the IPv6 data plane based on the use
+ cases defined in [I-D.ietf-spring-ipv6-use-cases].
+
+ Segment Routing for IPv6 (SR-IPv6) is required in networks where the
+ MPLS data plane is not used or, when combined with SR-MPLS, in
+ networks where MPLS is used in the core and IPv6 is used at the edge
+ (home networks, datacenters).
+
+ This document defines a new type of Routing Header (originally
+ defined in [RFC2460]) called the Segment Routing Header (SRH) in
+ order to convey the Segment List in the packet header as defined in
+ [I-D.filsfils-spring-segment-routing].  Mechanisms through which
+ segments are known and advertised are outside the scope of this
+ document.
+
+3.2. Illustration
+
+ In the context of Figure 1 where all the links have the same IGP
+ cost, let us assume that a packet P enters the SR domain at an
+ ingress edge router I and that the operator requests the following
+ requirements for packet P:
+
+ The local service S offered by node B must be applied to packet P.
+
+ The links AB and CE cannot be used to transport the packet P.
+
+ Any node N along the journey of the packet should be able to
+ determine where the packet P entered the SR domain and where it
+ will exit. The intermediate node should be able to determine the
+ paths from the ingress edge router to itself, and from itself to
+ the egress edge router.
+
+ Per-flow State for packet P should only be created at the ingress
+ edge router.
+
+ The operator can forbid, for security reasons, anyone outside the
+ operator domain from exploiting its intra-domain SR capabilities.
+
+ I---A---B---C---E
+ \ | / \ /
+ \ | / F
+ \|/
+ D
+
+ Figure 1: An illustration of SR properties
+
+ All these properties may be realized by instructing the ingress SR
+ edge router I to push the following abstract SR header on the packet
+ P.
+
+ +---------------------------------------------------------------+
+ | | |
+ | Abstract SR Header | |
+ | | |
+ | {SD, SB, SS, SF, SE}, Ptr, SI, SE | Transported |
+ | ^ | | Packet |
+ | | | | P |
+ | +---------------------+ | |
+ | | |
+ +---------------------------------------------------------------+
+
+ Figure 2: Packet P at node I
+
+ The abstract SR header contains a source route encoded as a list of
+ segments {SD, SB, SS, SF, SE}, a pointer (Ptr) and the identification
+ of the ingress and egress SR edge routers (segments SI and SE).
+
+ A segment identifies a topological instruction or a service
+ instruction. A segment can either be global or local. The
+ instruction associated with a global segment is recognized and
+ executed by any SR-capable node in the domain. The instruction
+ associated with a local segment is only supported by the specific
+ node that originates it.
+
+ Let us assume some IGP (e.g.: IS-IS or OSPF) extensions to define a
+ "Node Segment" as a global instruction within the IGP domain to
+ forward a packet along the shortest path to the specified node.  Let
+ us further assume that within the SR domain illustrated in Figure 1,
+ segments SI, SD, SB, SE and SF respectively identify IGP node
+ segments to I, D, B, E and F.
+
+ Let us assume that node B identifies its local service S with local
+ segment SS.
+
+ With all of this in mind, let us describe the journey of the packet
+ P.
+
+ The packet P reaches the ingress SR edge router. I pushes the SR
+ header illustrated in Figure 2 and sets the pointer to the first
+ segment of the list (SD).
+
+ SD is an instruction recognized by all the nodes in the SR domain
+ which causes the packet to be forwarded along the shortest path to D.
+
+ Once at D, the pointer is incremented and the next segment is
+ executed (SB).
+
+ SB is an instruction recognized by all the nodes in the SR domain
+ which causes the packet to be forwarded along the shortest path to B.
+
+ Once at B, the pointer is incremented and the next segment is
+ executed (SS).
+
+ SS is an instruction only recognized by node B which causes the
+ packet to receive service S.
+
+ Once the service has been applied, the next segment is executed (SF),
+ which causes the packet to be forwarded along the shortest path to F.
+
+ Once at F, the pointer is incremented and the next segment is
+ executed (SE).
+
+ SE is an instruction recognized by all the nodes in the SR domain
+ which causes the packet to be forwarded along the shortest path to E.
+
+ E then removes the SR header and the packet continues its journey
+ outside the SR domain.
+
+ All of the requirements are met.
+
+ First, the packet P has not used links AB and CE: the shortest-path
+ from I to D is I-A-D, the shortest-path from D to B is D-B, the
+ shortest-path from B to F is B-C-F and the shortest-path from F to E
+ is F-E, hence the packet path through the SR domain is I-A-D-B-C-F-E
+ and the links AB and CE have been avoided.
+
+ Second, the service S supported by B has been applied on packet P.
+
+ Third, any node along the packet path is able to identify the service
+ and topological journey of the packet within the SR domain. For
+ example, node C receives the packet illustrated in Figure 3 and hence
+ is able to infer where the packet entered the SR domain (SI), how it
+ got up to itself {SD, SB, SS, SE}, where it will exit the SR domain
+ (SE) and how it will do so {SF, SE}.
+
+ +---------------------------------------------------------------+
+ | | |
+ | SR Header | |
+ | | |
+ | {SD, SB, SS, SF, SE}, Ptr, SI, SE | Transported |
+ | ^ | | Packet |
+ | | | | P |
+ | +--------+ | |
+ | | |
+ +---------------------------------------------------------------+
+
+ Figure 3: Packet P at node C
+
+ Fourth, only node I maintains per-flow state for packet P. The
+ entire program of topological and service instructions to be executed
+ by the SR domain on packet P is encoded by the ingress edge router I
+ in the SR header in the form of a list of segments where each segment
+ identifies a specific instruction. No further per-flow state is
+ required along the packet path. The per-flow state is in the SR
+ header and travels with the packet. Intermediate nodes only hold
+ states related to the IGP global node segments and the local IGP
+ adjacency segments. These segments are not per-flow specific and
+ hence scale very well.  Typically, an intermediate node would
+ maintain on the order of hundreds to thousands of global node
+ segments and on the order of tens to a hundred local adjacency
+ segments.  The SR IGP forwarding table is typically expected to hold
+ fewer than 10,000 entries.
+
+ Fifth, the SR header is inserted at the entrance to the domain and
+ removed at the exit of the operator domain.  For security reasons,
+ the operator can forbid anyone outside its domain from using its
+ intra-domain SR capability.
+
+4. Abstract Routing Model
+
+ At the entrance of the SR domain, the ingress SR edge router pushes
+ the SR header on top of the packet. At the exit of the SR domain,
+ the egress SR edge router removes the SR header.
+
+ The abstract SR header contains an ordered list of segments, a
+ pointer identifying the next segment to process and the
+ identifications of the ingress and egress SR edge routers on the path
+ of this packet. The pointer identifies the segment that MUST be used
+ by the receiving router to process the packet. This segment is
+ called the active segment.
+
+ A property of SR is that the entire source route of the packet,
+ including the identity of the ingress and egress edge routers is
+ always available with the packet. This allows for interesting
+ accounting and service applications.
+
+ We define three SR-header operations:
+
+ "PUSH": an SR header is pushed on an IP packet, or additional
+ segments are added at the head of the segment list. The pointer
+ is moved to the first entry of the added segments.
+
+ "NEXT": the active segment is completed, the pointer is moved to
+ the next segment in the list.
+
+ "CONTINUE": the active segment is not completed, the pointer is
+ left unchanged.
+
+ In the future, other SR-header management operations may be defined.
+
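+ As a compact illustration (our sketch; the abstract model prescribes
+ no encoding for operations or the pointer), the effect of each
+ operation on the segment pointer can be written as:
+
+    typedef enum { SR_OP_PUSH, SR_OP_NEXT, SR_OP_CONTINUE } sr_op_t;
+
+    /* Return the new pointer value after applying one operation.
+       PUSH assumes the added segments start at index 0. */
+    static int
+    sr_apply_op (sr_op_t op, int ptr)
+    {
+      switch (op)
+        {
+        case SR_OP_NEXT:     return ptr + 1;  /* segment completed    */
+        case SR_OP_CONTINUE: return ptr;      /* segment not finished */
+        case SR_OP_PUSH:     return 0;        /* first added segment  */
+        }
+      return ptr;
+    }
+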
+ As the packet travels through the SR domain, the pointer is
+ incremented through the ordered list of segments and the source route
+ encoded by the SR ingress edge node is executed.
+
+ A node processes an incoming packet according to the instruction
+ associated with the active segment.
+
+ Any instruction might be associated with a segment: for example, an
+ intra-domain topological strict or loose forwarding instruction, a
+ service instruction, etc.
+
+ At minimum, a segment instruction must define two elements: the
+ identity of the next-hop to forward the packet to (this could be the
+ same node or a context within the node) and which SR-header
+ management operation to execute.
+
+ Each segment is known in the network through a Segment Identifier
+ (SID). The terms "segment" and "SID" are interchangeable.
+
+4.1. Segment Routing Global Block (SRGB)
+
+ In the SR abstract model, a segment is identified by a Segment
+ Routing Identifier (SID). The SR abstract model doesn't mandate a
+ specific format for the SID (IPv6 address or other formats).
+
+ In Segment Routing IPv6 the SID is an IPv6 address. Therefore, the
+ SRGB is materialized by the global IPv6 address space which
+ represents the set of IPv6 routable addresses in the SR domain. The
+ following rules apply:
+
+
+ o Each node of the SR domain MUST be configured with the Segment
+ Routing Global Block (SRGB).
+
+ o All global segments must be allocated from the SRGB. Any SR
+ capable node MUST be able to process any global segment advertised
+ by any other node within the SR domain.
+
+ o Any segment outside the SRGB has a local significance and is
+ called a "local segment". An SR-capable node MUST be able to
+ process the local segments it originates. An SR-capable node MUST
+ NOT support the instruction associated with a local segment
+ originated by a remote node.
+
+4.2. Traffic Engineering with SR
+
+ An SR Traffic Engineering policy is composed of two elements: a flow
+ classification and a segment-list to prepend on the packets of the
+ flow.
+
+ In SR, this per-flow state only exists at the ingress edge node where
+ the policy is defined and the SR header is pushed.
+
+ It is outside the scope of the document to define the process that
+ leads to the instantiation at a node N of an SR Traffic Engineering
+ policy.
+
+ [I-D.filsfils-spring-segment-routing-use-cases] illustrates various
+ alternatives:
+
+ N is deriving this policy automatically (e.g. FRR).
+
+ N is provisioned explicitly by the operator.
+
+ N is provisioned by a controller or server (e.g.: SDN Controller).
+
+ N is provisioned by the operator with a high-level policy which is
+ mapped into a path thanks to a local CSPF-based computation (e.g.
+ affinity/SRLG exclusion).
+
+ N could also be provisioned by other means.
+
+ [I-D.filsfils-spring-segment-routing-use-cases] explains why the
+ majority of use-cases require very short segment-lists, hence
+ minimizing the performance impact, if any, of inserting and
+ transporting the segment list.
+
+
+ An SDN controller that wishes to instantiate an SR Traffic
+ Engineering policy at node N collects the SR capability of node N so
+ as to ensure that the policy matches that capability.
+
+4.3. Segment Routing Database
+
+ The Segment Routing Database (SRDB) is a set of entries where each
+ entry is identified by a SID.  The instruction associated with each
+ entry at least defines the identity of the next-hop to which the
+ packet should be forwarded and what operation should be performed on
+ the SR header (PUSH, CONTINUE, NEXT).
+
+ +---------+-----------+---------------------------------+
+ | Segment | Next-Hop | SR Header operation |
+ +---------+-----------+---------------------------------+
+ | Sk | M | CONTINUE |
+ | Sj | N | NEXT |
+ | Sl | NAT Srvc | NEXT |
+ | Sm | FW srvc | NEXT |
+ | Sn | Q | NEXT |
+ | etc. | etc. | etc. |
+ +---------+-----------+---------------------------------+
+
+ Figure 4: SR Database
+
+ Each SR-capable node maintains its local SRDB. SRDB entries can
+ either derive from local policy or from protocol segment
+ advertisement.
+
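+ A minimal C sketch of one such entry (our field names, mirroring
+ Figure 4; the draft does not define an encoding for SRDB entries):
+
+    #include <stdint.h>
+
+    typedef struct {
+      uint8_t  sid[16];      /* the SID: an IPv6 address in SR-IPv6  */
+      uint32_t next_hop;     /* identity of the next-hop or service  */
+      uint8_t  sr_hdr_op;    /* PUSH, CONTINUE or NEXT               */
+    } srdb_entry_t;
+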
+5. IPv6 Instantiation of Segment Routing
+
+5.1. Segment Identifiers (SIDs) and SRGB
+
+ Segment Routing, as described in
+ [I-D.filsfils-spring-segment-routing], defines the Node-SID and
+ Adjacency-SID.  When SR is used over the IPv6 data plane, the
+ following applies.
+
+ The SRGB is the global IPv6 address space which represents the set of
+ IPv6 routable addresses in the SR domain.
+
+ Node-SIDs are IPv6 addresses that are part of the SRGB (i.e.:
+ routable addresses).  Adjacency-SIDs are IPv6 addresses which may not
+ be part of the global IPv6 address space.
+
+5.1.1. Node-SID
+
+ The Node-SID identifies a node.  With SR-IPv6 the Node-SID is an IPv6
+ prefix that the operator has configured on the node and that is used
+ as the node identifier.  Typically, in the case of a router, this is
+ the IPv6 address of the node loopback interface.  Therefore, SR-IPv6
+ does not require any additional SID advertisement for the Node
+ Segment.  The Node-SID is in fact the IPv6 address of the node.
+
+5.1.2. Adjacency-SID
+
+ In the SR architecture defined in
+ [I-D.filsfils-spring-segment-routing] the Adjacency-SID (or Adj-SID)
+ identifies a given interface and may be local or global (depending on
+ how it is advertised).  A node may advertise one (or more) Adj-SIDs
+ allocated to a given interface so as to force the forwarding of the
+ packet (when received with that particular Adj-SID) into the
+ interface, regardless of the routing entry for the packet
+ destination.  The semantic of the Adj-SID is:
+
+ Send out the packet to the interface this prefix is allocated to.
+
+ When SR is applied to IPv6, any SID is a global IPv6 address and
+ therefore an Adj-SID has global significance (i.e.: the IPv6 address
+ representing the SID is a global address).  In other words, a node
+ that advertises an Adj-SID in the form of a global IPv6 address,
+ representing the link/adjacency the packet has to be forwarded to,
+ gives that Adj-SID global significance.
+
+ Advertisement of Adj-SIDs may be done using multiple mechanisms,
+ among which are those described in the ISIS and OSPF protocol
+ extensions: [I-D.ietf-isis-segment-routing-extensions] and
+ [I-D.psenak-ospf-segment-routing-ospfv3-extension].  The distinction
+ between local and global significance of the Adj-SID is given in the
+ encoding of the Adj-SID advertisement.
+
+5.2. Segment Routing Extension Header (SRH)
+
+ A new type of Routing Header (originally defined in [RFC2460]) is
+ defined: the Segment Routing Header (SRH), which has a new Routing
+ Type (suggested value 4) to be assigned by IANA.
+
+ As an example, if an explicit path is to be constructed across a core
+ network running ISIS or OSPF, the segment list will contain SIDs
+ representing the nodes across the path (loose or strict) which,
+ usually, are the IPv6 loopback interface addresses of each node.  If
+ the path is across service or application entities, the segment list
+ contains the IPv6 addresses of these services or application
+ instances.
+
+ The Segment Routing Header (SRH) is defined as follows:
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Next Header | Hdr Ext Len | Routing Type | Segments Left |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | First Segment | Flags | HMAC Key ID |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Segment List[0] (128 bits ipv6 address) |
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | |
+ ...
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Segment List[n] (128 bits ipv6 address) |
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Policy List[0] (optional) |
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Policy List[1] (optional) |
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Policy List[2] (optional) |
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | |
+ | |
+ | HMAC (256 bits) |
+ | (optional) |
+ | |
+ | |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ where:
+
+ o Next Header: 8-bit selector. Identifies the type of header
+ immediately following the SRH.
+
+ o Hdr Ext Len: 8-bit unsigned integer; the length of the SRH in
+ 8-octet units, not including the first 8 octets.
+
+ o Routing Type: TBD, to be assigned by IANA (suggested value: 4).
+
+ o Segments Left.  Defined in [RFC2460], it contains the index, in
+ the Segment List, of the next segment to inspect.  Segments Left
+ is decremented at each segment and used as an index into the
+ Segment List.
+
+ o First Segment: offset in the SRH, not including the first 8 octets
+ and expressed in 16-octet units, pointing to the last element of
+ the segment list, which is in fact the first segment of the
+ segment routing path.
+
+ o Flags: 16 bits of flags.  The following flags are defined:
+
+ 1
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |C|P|R|R| Policy Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ C-flag: Clean-up flag.  Set when the SRH has to be removed from
+ the packet when the packet reaches the last segment.
+
+ P-flag: Protected flag.  Set when the packet has been rerouted
+ through an FRR mechanism by an SR endpoint node.  See Section 6.3
+ for more details.
+
+ R-flags.  Reserved, for future use.
+
+ Policy Flags.  These define the type of the IPv6 addresses encoded
+ into the Policy List (see below).  The following have been
+ defined:
+
+ Bits 4-6: determine the type of the first element after the
+ segment list.
+
+ Bits 7-9: determine the type of the second element.
+
+ Bits 10-12: determine the type of the third element.
+
+ Bits 13-15: determine the type of the fourth element.
+
+ The following values are used for the type:
+
+ 0x0: Not present. If value is set to 0x0, it means the
+ element represented by these bits is not present.
+
+ 0x1: SR Ingress.
+
+ 0x2: SR Egress.
+
+ 0x3: Original Source Address.
+
+ o The HMAC Key ID and HMAC fields, and their use, are defined in
+ [I-D.vyncke-6man-segment-routing-security].
+
+ o Segment List[n]: a 128-bit IPv6 address representing the nth
+ segment in the Segment List.  The Segment List is encoded starting
+ from the last segment of the path, i.e., the first element of the
+ Segment List (Segment List[0]) contains the last segment of the
+ path while the last element of the Segment List (Segment List[n])
+ contains the first segment of the path.  The index contained in
+ "Segments Left" identifies the current active segment.
+
+ o Policy List.  Optional addresses representing specific nodes in
+ the SR path, such as:
+
+ SR Ingress: a 128-bit generic identifier representing the
+ ingress in the SR domain (i.e.: it need not be a valid IPv6
+ address).
+
+ SR Egress: a 128-bit generic identifier representing the egress
+ in the SR domain (i.e.: it need not be a valid IPv6 address).
+
+ Original Source Address: the IPv6 address originally present in
+ the SA field of the packet.
+
+ The Policy List entries are encoded after the segment list and are
+ optional.  If none are present in the SRH, all bits of the Policy
+ List Flags MUST be set to 0x0 (a decoding sketch for these flags
+ follows below).
+
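+ A short decoding sketch (ours, not normative text): since the
+ diagram above numbers bits from the most significant end, the 3-bit
+ type of Policy List element i (1..4) sits at bit offset 3*(4-i)
+ from the least significant bit of the 16 flag bits:
+
+    #include <stdint.h>
+
+    /* Type of Policy List element i (1..4): 0x0 not present,
+       0x1 SR Ingress, 0x2 SR Egress, 0x3 Original Source Address. */
+    static inline unsigned
+    srh_policy_flag_type (uint16_t flags_host_order, int i)
+    {
+      return (flags_host_order >> (3 * (4 - i))) & 0x7;
+    }
+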
+5.2.1. SRH and RFC2460 behavior
+
+ Since the SRH is a new type of Routing Header, it has the same
+ properties:
+
+ It SHOULD only appear once in the packet.
+
+ Only the router whose address is in the DA field of the packet
+ header MUST inspect the SRH.
+
+ Therefore, Segment Routing in IPv6 networks implies that the segment
+ identifier (i.e.: the IPv6 address of the segment) is moved into the
+ DA of the packet.
+
+ The DA of the packet changes at each segment termination/completion
+ and therefore the original DA of the packet MUST be encoded as the
+ last segment of the path.
+
+ As illustrated in Section 3.2, nodes that are within the path of a
+ segment will forward packets based on the DA of the packet without
+ inspecting the SRH. This ensures full interoperability between SR-
+ capable and non-SR-capable nodes.
+
+6. SRH Procedures
+
+ In this section we describe the different procedures on the SRH.
+
+6.1. Segment Routing Operations
+
+ When Segment Routing is instantiated over the IPv6 data plane the
+ following applies:
+
+ o The segment list is encoded in the SRH.
+
+ o The active segment is in the destination address of the packet.
+
+ o The Segment Routing CONTINUE operation (as described in
+ [I-D.filsfils-spring-segment-routing]) is implemented as a
+ regular/plain IPv6 operation consisting of DA based forwarding.
+
+ o The NEXT operation is implemented through the update of the DA
+ with the value represented by the Next Segment field in the SRH.
+
+ o The PUSH operation is implemented through the insertion of the SRH
+ or the insertion of additional segments in the SRH segment list.
+
+6.2. Segment Routing Node Functions
+
+ SR packets are forwarded to segment endpoints (i.e.: nodes whose
+ address is in the DA field of the packet).  The segment endpoint,
+ when receiving an SR packet destined to itself, does the following:
+
+ o Inspect the SRH.
+
+ o Determine the next active segment.
+
+ o Update the Segments Left field (or, if requested, remove the SRH
+ from the packet).
+
+ o Update the DA.
+
+ o Send the packet to the next segment.
+
+ The procedures applied to the SRH depend on the node function.  The
+ following node functions are defined:
+
+ Ingress SR Node.
+
+ Transit Non-SR Node.
+
+ Transit SR Intra Segment Node.
+
+ SR Endpoint Node.
+
+6.2.1. Ingress SR Node
+
+ The ingress node can be a router at the edge of the SR domain or an
+ SR-capable host.  The ingress SR node may obtain the segment list by
+ either:
+
+ Local path computation.
+
+ Local configuration.
+
+ Interaction with an SDN controller delivering the path as a
+ complete SRH.
+
+ Any other mechanism (mechanisms through which the path is acquired
+ are outside the scope of this document).
+
+ When creating the SRH (either at ingress node or in the SDN
+ controller) the following is done:
+
+ Next Header and Hdr Ext Len fields are set according to [RFC2460].
+
+ The Routing Type field is set to TBD (SRH).
+
+ The Segment List is built with the FIRST segment of the path
+ encoded in the LAST element of the Segment List. Subsequent
+ segments are encoded on top of the first segment. Finally, the
+ LAST segment of the path is encoded in the FIRST element of the
+ Segment List. In other words, the Segment List is encoded in the
+ reverse order of the path.
+
+ The original DA of the packet is encoded as the last segment of
+ the path (encoded in the first element of the Segment List).
+
+ The DA of the packet is set to the value of the first segment
+ (found in the last element of the segment list).
+
+ The Segments Left field is set to n-1 where n is the number of
+ elements in the Segment List.
+
+ The packet is sent out towards the first segment (i.e.:
+ represented in the packet DA).  A short encoding sketch follows
+ this list.
+
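+ A condensed sketch of the encoding rules above (our illustration;
+ the type and names are placeholders, not defined by this draft):
+
+    #include <stdint.h>
+    #include <string.h>
+
+    typedef uint8_t ip6_addr_t[16];
+
+    /* path[0..n-1] is the path in forward order, with the original
+       DA already appended as path[n-1]; list[] receives the segments
+       in reverse order. */
+    static void
+    srh_encode (ip6_addr_t *list, const ip6_addr_t *path, int n,
+                ip6_addr_t da, uint8_t *segments_left)
+    {
+      int i;
+      for (i = 0; i < n; i++)
+        memcpy (list[n - 1 - i], path[i], 16);
+      *segments_left = n - 1;        /* index of the active segment    */
+      memcpy (da, path[0], 16);      /* DA = first segment of the path */
+    }
+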
+6.2.1.1. Security at Ingress
+
+ The procedures related to the Segment Routing security are detailed
+ in [I-D.vyncke-6man-segment-routing-security].
+
+ In the case where the SR domain boundaries are not under the control
+ of the network operator (e.g.: when the SR domain edge is in a home
+ network), it is important to authenticate and validate the content of
+ any SRH received by the network operator.  In such a case, the
+ security procedure described in
+ [I-D.vyncke-6man-segment-routing-security] is to be used.
+
+ The ingress node (e.g.: the host in the home network) requests the
+ SRH from a control system (e.g.: an SDN controller) which delivers
+ the SRH with its HMAC signature on it.
+
+ Then, the home network host can send out SR packets (with an SRH on
+ it) that will be validated at the ingress of the network operator
+ infrastructure.
+
+ The ingress node of the network operator infrastructure is configured
+ to validate the incoming SRH HMACs, so as to accept only packets
+ whose SRH is correct with respect to their SA/DA addresses.
+
+6.2.2. Transit Non-SR Capable Node
+
+ SR is interoperable with plain IPv6 forwarding.  Any non-SR-capable
+ node forwards SR packets solely based on the DA.  There is no SRH
+ inspection.  This ensures full interoperability between SR and non-SR
+ nodes.
+
+6.2.3. SR Intra Segment Transit Node
+
+ Only the node whose address is in the DA inspects and processes the
+ SRH (according to [RFC2460]).  An intra-segment transit node is not
+ in the DA; its forwarding is based on the DA and its SR-IPv6 FIB.
+
+6.2.4. SR Segment Endpoint Node
+
+ The SR segment endpoint node is the node whose address is in the DA.
+ The segment endpoint node inspects the SRH and does:
+
+ 1. IF DA = myself (segment endpoint)
+ 2. IF Segments Left > 0 THEN
+ decrement Segments Left
+ update DA with Segment List[Segments Left]
+ 3. ELSE IF Segments List[Segments Left] <> DA THEN
+ update DA with Segments List[Segments Left]
+ IF Clean-up bit is set THEN remove the SRH
+ 4. ELSE give the packet to next PID (application)
+ End of processing.
+ 5. Forward the packet out
+
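+ The same procedure as a self-contained C sketch (ours; the struct
+ layout and flag representation are assumptions made only for the
+ example):
+
+    #include <stdint.h>
+    #include <string.h>
+
+    typedef struct {
+      uint8_t segments_left;
+      int     cleanup;           /* clean-up flag from the SRH flags */
+      uint8_t segs[8][16];       /* segment list, [0] = last segment */
+    } srh_sketch_t;
+
+    /* One endpoint step: updates the DA in place and returns 1 when
+       the SRH should be stripped before forwarding. */
+    static int
+    srh_endpoint_step (srh_sketch_t *srh, uint8_t da[16])
+    {
+      if (srh->segments_left > 0)
+        {                          /* steps 1-2: more segments to go */
+          srh->segments_left--;
+          memcpy (da, srh->segs[srh->segments_left], 16);
+        }
+      else if (memcmp (srh->segs[0], da, 16) != 0)
+        {                          /* step 3: restore the original DA */
+          memcpy (da, srh->segs[0], 16);
+          return srh->cleanup;     /* strip the SRH if clean-up set */
+        }
+      /* step 4: DA already final, deliver to the next PID */
+      return 0;
+    }
+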
+6.3. FRR Flag Settings
+
+ A node supporting SR and doing Fast Reroute (as described in
+ [I-D.filsfils-spring-segment-routing-use-cases]), when rerouting
+ packets through FRR mechanisms, SHOULD inspect the rerouted packet
+ header and look for the SRH.  If the SRH is present, the rerouting
+ node SHOULD set the Protected bit on all rerouted packets.
+
+7. SR and Tunneling
+
+ Encapsulation can be realized in two different ways with SR-IPv6:
+
+ Outer encapsulation.
+
+ SRH with SA/DA original addresses.
+
+ Outer encapsulation tunneling is the traditional method where an
+ additional IPv6 header is prepended to the packet.  Since the
+ original IPv6 header is encapsulated, everything is preserved and the
+ packet is switched/routed according to the outer header (which may
+ itself contain an SRH).
+
+ The SRH allows encoding both the original SA and DA, hence an
+ operator may decide to change the SA/DA at ingress and restore them
+ at egress.  This can be achieved without outer encapsulation, by
+ changing the SA/DA and encoding the original SA in the Policy List
+ and the original DA in the Segment List.
+
+8. Example Use Case
+
+ A more detailed description of use cases is available in
+ [I-D.ietf-spring-ipv6-use-cases].  In this section, a simple SR-IPv6
+ example is illustrated.
+
+ In the topology described in Figure 6 an end-to-end SR deployment is
+ assumed.  Therefore SR is supported by all nodes from A to J.
+
+ Home Network | Backbone | Datacenter
+ | |
+ | +---+ +---+ +---+ | +---+ |
+ +---|---| C |---| D |---| E |---|---| I |---|
+ | | +---+ +---+ +---+ | +---+ |
+ | | | | | | | | +---+
+ +---+ +---+ | | | | | | |--| X |
+ | A |---| B | | +---+ +---+ +---+ | +---+ | +---+
+ +---+ +---+ | | F |---| G |---| H |---|---| J |---|
+ | +---+ +---+ +---+ | +---+ |
+ | |
+ | +-----------+
+ | SDN |
+ | Orch/Ctlr |
+ +-----------+
+
+ Figure 6: Sample SR topology
+
+ The following workflow applies to packets sent by host A and destined
+ to server X.
+
+
+ . Host A sends a request for a path to server X to the SDN
+ controller or orchestration system.
+
+ . The SDN controller/orchestrator builds an SRH with:
+ . Segment List: C, F, J, X
+ . HMAC
+ that satisfies the requirements expressed in the request
+ by host A and based on policies applicable to host A.
+
+ . Host A receives the SRH and inserts it into the packet.
+ The packet has now:
+ . SA: A
+ . DA: C
+ . SRH with
+ . SL: X, J, F, C
+ . Segments Left: 3 (i.e.: Segment List size - 1)
+ . PL: C (ingress), J (egress)
+ Note that X is the last segment and C is the
+ first segment (i.e.: the SL is encoded in the reverse
+ path order).
+ . HMAC
+
+ . When the packet arrives at C (first segment), C does:
+ . Validate the HMAC of the SRH.
+ . Decrement Segments Left by one: 2
+ . Update the DA with the next segment found in
+ Segment List[2]. DA is set to F.
+ . Forward the packet to F.
+
+ . When the packet arrives at F (second segment), F does:
+ . Decrement Segments Left by one: 1
+ . Update the DA with the next segment found in
+ Segment List[1]. DA is set to J.
+ . Forward the packet to J.
+
+ . The packet travels across nodes G and H, which do plain
+ IPv6 forwarding based on the DA.  No inspection of the SRH
+ is needed in these nodes.  However, any SR-capable node
+ is allowed to set the Protected bit in case of FRR
+ protection.
+
+ . When the packet arrives at J (third segment), J does:
+ . Decrement Segments Left by one: 0
+ . Update the DA with the next segment found in
+ Segment List[0]. DA is set to X.
+ . If the cleanup bit is set, then node J will strip out
+ the SRH from the packet.
+ . Forward the packet to X.
+
+
+ The packet arrives at the server, which may or may not support SR.
+ The return traffic, from server to host, may be sent using the same
+ procedures.
+
+9. IANA Considerations
+
+ TBD
+
+10. Manageability Considerations
+
+ TBD
+
+11. Security Considerations
+
+ Security mechanisms applied to Segment Routing over IPv6 networks are
+ detailed in [I-D.vyncke-6man-segment-routing-security].
+
+12. Contributors
+
+ The authors would like to thank Dave Barach, John Leddy, John
+ Brzozowski, Pierre Francois, Nagendra Kumar, Mark Townsley, Christian
+ Martin, Roberta Maglione, Eric Vyncke, James Connolly, David Lebrun
+ and Fred Baker for their contribution to this document.
+
+13. Acknowledgements
+
+ TBD
+
+14. References
+
+14.1. Normative References
+
+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
+ Requirement Levels", BCP 14, RFC 2119, March 1997.
+
+ [RFC2460] Deering, S. and R. Hinden, "Internet Protocol, Version 6
+ (IPv6) Specification", RFC 2460, December 1998.
+
+14.2. Informative References
+
+ [I-D.filsfils-spring-segment-routing]
+ Filsfils, C., Previdi, S., Bashandy, A., Decraene, B.,
+ Litkowski, S., Horneffer, M., Milojevic, I., Shakir, R.,
+ Ytti, S., Henderickx, W., Tantsura, J., and E. Crabbe,
+ "Segment Routing Architecture", draft-filsfils-spring-
+ segment-routing-04 (work in progress), July 2014.
+
+ [I-D.filsfils-spring-segment-routing-mpls]
+ Filsfils, C., Previdi, S., Bashandy, A., Decraene, B.,
+ Litkowski, S., Horneffer, M., Milojevic, I., Shakir, R.,
+ Ytti, S., Henderickx, W., Tantsura, J., and E. Crabbe,
+ "Segment Routing with MPLS data plane", draft-filsfils-
+ spring-segment-routing-mpls-03 (work in progress), August
+ 2014.
+
+ [I-D.filsfils-spring-segment-routing-use-cases]
+ Filsfils, C., Francois, P., Previdi, S., Decraene, B.,
+ Litkowski, S., Horneffer, M., Milojevic, I., Shakir, R.,
+ Ytti, S., Henderickx, W., Tantsura, J., Kini, S., and E.
+ Crabbe, "Segment Routing Use Cases", draft-filsfils-
+ spring-segment-routing-use-cases-01 (work in progress),
+ October 2014.
+
+ [I-D.ietf-isis-segment-routing-extensions]
+ Previdi, S., Filsfils, C., Bashandy, A., Gredler, H.,
+ Litkowski, S., Decraene, B., and J. Tantsura, "IS-IS
+ Extensions for Segment Routing", draft-ietf-isis-segment-
+ routing-extensions-03 (work in progress), October 2014.
+
+ [I-D.ietf-spring-ipv6-use-cases]
+ Brzozowski, J., Leddy, J., Leung, I., Previdi, S.,
+ Townsley, W., Martin, C., Filsfils, C., and R. Maglione,
+ "IPv6 SPRING Use Cases", draft-ietf-spring-ipv6-use-
+ cases-03 (work in progress), November 2014.
+
+ [I-D.psenak-ospf-segment-routing-ospfv3-extension]
+ Psenak, P., Previdi, S., Filsfils, C., Gredler, H.,
+ Shakir, R., Henderickx, W., and J. Tantsura, "OSPFv3
+ Extensions for Segment Routing", draft-psenak-ospf-
+ segment-routing-ospfv3-extension-02 (work in progress),
+ July 2014.
+
+ [I-D.vyncke-6man-segment-routing-security]
+ Vyncke, E. and S. Previdi, "IPv6 Segment Routing Header
+ (SRH) Security Considerations", July 2014.
+
+ [RFC1940] Estrin, D., Li, T., Rekhter, Y., Varadhan, K., and D.
+ Zappala, "Source Demand Routing: Packet Format and
+ Forwarding Specification (Version 1)", RFC 1940, May 1996.
+
+Authors' Addresses
+
+ Stefano Previdi (editor)
+ Cisco Systems, Inc.
+ Via Del Serafico, 200
+ Rome 00142
+ Italy
+
+ Email: sprevidi@cisco.com
+
+
+ Clarence Filsfils
+ Cisco Systems, Inc.
+ Brussels
+ BE
+
+ Email: cfilsfil@cisco.com
+
+
+ Brian Field
+ Comcast
+ 4100 East Dry Creek Road
+ Centennial, CO 80122
+ US
+
+ Email: Brian_Field@cable.comcast.com
+
+
+ Ida Leung
+ Rogers Communications
+ 8200 Dixie Road
+ Brampton, ON L6T 0C1
+ CA
+
+ Email: Ida.Leung@rci.rogers.com
diff --git a/vnet/vnet/sr/sr.c b/vnet/vnet/sr/sr.c
new file mode 100644
index 00000000000..5cc33dc93e6
--- /dev/null
+++ b/vnet/vnet/sr/sr.c
@@ -0,0 +1,2228 @@
+/*
+ * sr.c: ipv6 segment routing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/sr/sr.h>
+
+#include <openssl/hmac.h>
+
+ip6_sr_main_t sr_main;
+static vlib_node_registration_t sr_local_node;
+
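+/*
+ * Recompute the HMAC carried after the segment list.  The signed
+ * buffer is assembled as: the packet ip6 source address, the
+ * first_segment index, one octet whose top bit mirrors the clean-up
+ * flag, the hmac key id, then each address of the segment list.  The
+ * 256-bit digest is written starting at sr->segments[first_segment + 2].
+ */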
+static void sr_fix_hmac (ip6_sr_main_t * sm, ip6_header_t * ip,
+ ip6_sr_header_t * sr)
+{
+ u32 key_index;
+ static u8 * keybuf;
+ u8 * copy_target;
+ int first_segment;
+ ip6_address_t *addrp;
+ int i;
+ ip6_sr_hmac_key_t * hmac_key;
+ u32 sig_len;
+
+ key_index = sr->hmac_key;
+
+ /* No signature? Pass... */
+ if (key_index == 0)
+ return;
+
+ /* We don't know about this key? Fail... */
+ if (key_index >= vec_len (sm->hmac_keys))
+ return;
+
+ hmac_key = sm->hmac_keys + key_index;
+
+ vec_reset_length (keybuf);
+
+ /* pkt ip6 src address */
+ vec_add2 (keybuf, copy_target, sizeof (ip6_address_t));
+ memcpy (copy_target, ip->src_address.as_u8, sizeof (ip6_address_t));
+
+ /* first segment */
+ vec_add2 (keybuf, copy_target, 1);
+ copy_target[0] = sr->first_segment;
+
+ /* octet w/ bit 0 = "clean" flag */
+ vec_add2 (keybuf, copy_target, 1);
+ copy_target[0]
+ = (sr->flags & clib_host_to_net_u16 (IP6_SR_HEADER_FLAG_CLEANUP))
+ ? 0x80 : 0;
+
+ /* hmac key id */
+ vec_add2 (keybuf, copy_target, 1);
+ copy_target[0] = sr->hmac_key;
+
+ first_segment = sr->first_segment;
+
+ addrp = sr->segments;
+
+ /* segments */
+ for (i = 0; i <= first_segment; i++)
+ {
+ vec_add2 (keybuf, copy_target, sizeof (ip6_address_t));
+ memcpy (copy_target, addrp->as_u8, sizeof (ip6_address_t));
+ addrp++;
+ }
+
+ addrp++;
+
+  HMAC_CTX_init (sm->hmac_ctx);
+  if (!HMAC_Init (sm->hmac_ctx, hmac_key->shared_secret,
+                  vec_len (hmac_key->shared_secret), sm->md))
+    clib_warning ("HMAC_Init failed");
+  if (!HMAC_Update (sm->hmac_ctx, keybuf, vec_len (keybuf)))
+    clib_warning ("HMAC_Update failed");
+  if (!HMAC_Final (sm->hmac_ctx, (unsigned char *) addrp, &sig_len))
+    clib_warning ("HMAC_Final failed");
+  HMAC_CTX_cleanup (sm->hmac_ctx);
+}
+
+u8 * format_ip6_sr_header_flags (u8 * s, va_list * args)
+{
+ u16 flags = (u16) va_arg (*args, int);
+ u8 pl_flag;
+ int bswap_needed = va_arg (*args, int);
+ int i;
+
+ if (bswap_needed)
+ flags = clib_host_to_net_u16 (flags);
+
+ if (flags & IP6_SR_HEADER_FLAG_CLEANUP)
+ s = format (s, "cleanup ");
+
+ if (flags & IP6_SR_HEADER_FLAG_PROTECTED)
+ s = format (s, "reroute ");
+
+ s = format (s, "pl: ");
+ for (i = 1; i <= 4; i++)
+ {
+ pl_flag = ip6_sr_policy_list_flags (flags, i);
+ s = format (s, "[%d] ", i);
+
+ switch (pl_flag)
+ {
+ case IP6_SR_HEADER_FLAG_PL_ELT_NOT_PRESENT:
+ s = format (s, "NotPr ");
+ break;
+ case IP6_SR_HEADER_FLAG_PL_ELT_INGRESS_PE:
+ s = format (s, "InPE ");
+ break;
+ case IP6_SR_HEADER_FLAG_PL_ELT_EGRESS_PE:
+ s = format (s, "EgPE ");
+ break;
+
+ case IP6_SR_HEADER_FLAG_PL_ELT_ORIG_SRC_ADDR:
+ s = format (s, "OrgSrc ");
+ break;
+ }
+ }
+ return s;
+}
+
+u8 * format_ip6_sr_header (u8 * s, va_list * args)
+{
+ ip6_sr_header_t * h = va_arg (*args, ip6_sr_header_t *);
+ int print_hmac = va_arg (*args, int);
+ int i, pl_index, max_segs;
+ int flags_host_byte_order = clib_net_to_host_u16(h->flags);
+
+ s = format (s, "next proto %d, len %d, type %d",
+ h->protocol, (h->length<<3)+8, h->type);
+ s = format (s, "\n segs left %d, first_segment %d, hmac key %d",
+ h->segments_left, h->first_segment, h->hmac_key);
+ s = format (s, "\n flags %U", format_ip6_sr_header_flags,
+ flags_host_byte_order, 0 /* bswap needed */ );
+
+ /*
+ * Header length is in 8-byte units (minus one), so
+ * divide by 2 to ascertain the number of ip6 addresses in the
+ * segment list
+ */
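+  /*
+   * Example: length = 6 -> 6 * 8 = 48 octets past the fixed header,
+   * i.e. 3 ip6 addresses; if an HMAC is present and not printed, the
+   * last 2 address-sized slots (the signature) are skipped below.
+   */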
+ max_segs = (h->length>>1);
+
+ if (!print_hmac && h->hmac_key)
+ max_segs -= 2;
+
+ s = format (s, "\n Segments (in processing order):");
+
+ for (i = h->first_segment; i >= 0; i--)
+ s = format (s, "\n %U", format_ip6_address, h->segments + i);
+
+ s = format (s, "\n Policy List:");
+
+ pl_index = 1; /* to match the RFC text */
+ for (i = (h->first_segment+1); i < max_segs; i++, pl_index++)
+ {
+ char * tag;
+ char * tags[] = {" ", "InPE: ", "EgPE: ", "OrgSrc: "};
+
+ tag = tags[0];
+ if (pl_index >=1 && pl_index <= 4)
+ {
+ int this_pl_flag = ip6_sr_policy_list_flags
+ (flags_host_byte_order, pl_index);
+ tag = tags[this_pl_flag];
+ }
+
+ s = format (s, "\n %s%U", tag, format_ip6_address, h->segments + i);
+ }
+
+ return s;
+}
+
+u8 * format_ip6_sr_header_with_length (u8 * s, va_list * args)
+{
+ ip6_header_t * h = va_arg (*args, ip6_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ uword header_bytes;
+
+ header_bytes = sizeof (h[0]) + sizeof (ip6_sr_header_t);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "ip6_sr header truncated");
+
+ s = format (s, "IP6: %U\n", format_ip6_header, h, max_header_bytes);
+ s = format (s, "SR: %U\n", format_ip6_sr_header, (ip6_sr_header_t *)(h+1),
+ 0 /* print_hmac */, max_header_bytes);
+ return s;
+}
+
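+/*
+ * X-macro: each (NEXT, "node-name") pair expands into the
+ * sr_rewrite_next_t enum below and into the .next_nodes table of the
+ * node registration further down in this file.
+ */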
+#define foreach_sr_rewrite_next \
+_(ERROR, "error-drop") \
+_(IP6_LOOKUP, "ip6-lookup") \
+_(SR_LOCAL, "sr-local")
+
+typedef enum {
+#define _(s,n) SR_REWRITE_NEXT_##s,
+ foreach_sr_rewrite_next
+#undef _
+ SR_REWRITE_N_NEXT,
+} sr_rewrite_next_t;
+
+typedef struct {
+ ip6_address_t src, dst;
+ u16 length;
+ u32 next_index;
+ u32 tunnel_index;
+ u8 sr[256];
+} sr_rewrite_trace_t;
+
+static char * sr_rewrite_error_strings[] = {
+#define sr_error(n,s) s,
+#include "sr_error.def"
+#undef sr_error
+};
+
+typedef enum {
+#define sr_error(n,s) SR_REWRITE_ERROR_##n,
+#include "sr_error.def"
+#undef sr_error
+ SR_REWRITE_N_ERROR,
+} sr_rewrite_error_t;
+
+
+u8 * format_sr_rewrite_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ sr_rewrite_trace_t * t = va_arg (*args, sr_rewrite_trace_t *);
+ ip6_main_t * im = &ip6_main;
+ ip6_sr_main_t * sm = &sr_main;
+ ip6_sr_tunnel_t *tun = pool_elt_at_index (sm->tunnels, t->tunnel_index);
+ ip6_fib_t * rx_fib, * tx_fib;
+
+ rx_fib = find_ip6_fib_by_table_index_or_id (im, tun->rx_fib_index,
+ IP6_ROUTE_FLAG_FIB_INDEX);
+
+ tx_fib = find_ip6_fib_by_table_index_or_id (im, tun->tx_fib_index,
+ IP6_ROUTE_FLAG_FIB_INDEX);
+
+ s = format
+ (s, "SR-REWRITE: next %s ip6 src %U dst %U len %u\n"
+ " rx-fib-id %d tx-fib-id %d\n%U",
+ (t->next_index == SR_REWRITE_NEXT_SR_LOCAL)
+ ? "sr-local" : "ip6-lookup",
+ format_ip6_address, &t->src,
+ format_ip6_address, &t->dst, t->length,
+ rx_fib->table_id, tx_fib->table_id,
+ format_ip6_sr_header, t->sr, 0 /* print_hmac */);
+ return s;
+}
+
+static uword
+sr_rewrite (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip6_sr_main_t * sm = &sr_main;
+ u32 (*sr_local_cb) (vlib_main_t *, vlib_node_runtime_t *,
+ vlib_buffer_t *, ip6_header_t *,
+ ip6_sr_header_t *);
+ sr_local_cb = sm->sr_local_cb;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ ip6_header_t * ip0, * ip1;
+ ip_adjacency_t * adj0, * adj1;
+ ip6_sr_header_t * sr0, * sr1;
+ ip6_sr_tunnel_t * t0, *t1;
+ u64 * copy_src0, * copy_dst0;
+ u64 * copy_src1, * copy_dst1;
+ u32 next0 = SR_REWRITE_NEXT_IP6_LOOKUP;
+ u32 next1 = SR_REWRITE_NEXT_IP6_LOOKUP;
+ u16 new_l0, new_l1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /*
+ * $$$ parse through header(s) to pick the point
+	   * where we punch in the SR extension header
+ */
+
+ adj0 = ip_get_adjacency (lm, vnet_buffer(b0)->ip.adj_index[VLIB_TX]);
+ adj1 = ip_get_adjacency (lm, vnet_buffer(b1)->ip.adj_index[VLIB_TX]);
+ t0 = pool_elt_at_index (sm->tunnels,
+ adj0->rewrite_header.sw_if_index);
+ t1 = pool_elt_at_index (sm->tunnels,
+ adj1->rewrite_header.sw_if_index);
+
+ ASSERT (VLIB_BUFFER_PRE_DATA_SIZE
+ >= ((word) vec_len (t0->rewrite)) + b0->current_data);
+ ASSERT (VLIB_BUFFER_PRE_DATA_SIZE
+ >= ((word) vec_len (t1->rewrite)) + b1->current_data);
+
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->tx_fib_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = t1->tx_fib_index;
+
+ ip0 = vlib_buffer_get_current (b0);
+ ip1 = vlib_buffer_get_current (b1);
+
+ /*
+ * SR-unaware service chaining case: pkt coming back from
+ * service has the original dst address, and will already
+ * have an SR header. If so, send it to sr-local
+ */
+ if (PREDICT_FALSE(ip0->protocol == 43))
+ {
+	      vlib_buffer_advance (b0, sizeof (ip0[0]));
+ sr0 = (ip6_sr_header_t *) (ip0+1);
+ new_l0 = clib_net_to_host_u16(ip0->payload_length);
+ next0 = SR_REWRITE_NEXT_SR_LOCAL;
+ }
+ else
+ {
+ copy_dst0 = (u64 *)(((u8 *)ip0) - vec_len (t0->rewrite));
+ copy_src0 = (u64 *) ip0;
+
+	      /*
+	       * Copy the data preceding the punch-in point left (toward
+	       * lower addresses) by the rewrite length. Assume (for the
+	       * moment) that only the main packet header needs to move.
+	       */
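+	      /*
+	       * Layout (the 5 u64 copies below move the 40-octet ip6
+	       * header):
+	       *   before:          [ip6 hdr][payload]
+	       *   after:  [ip6 hdr][SR hdr = t0->rewrite][payload]
+	       */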
+ copy_dst0 [0] = copy_src0 [0];
+ copy_dst0 [1] = copy_src0 [1];
+ copy_dst0 [2] = copy_src0 [2];
+ copy_dst0 [3] = copy_src0 [3];
+ copy_dst0 [4] = copy_src0 [4];
+ vlib_buffer_advance (b0, - (word) vec_len(t0->rewrite));
+ ip0 = vlib_buffer_get_current (b0);
+ sr0 = (ip6_sr_header_t *) (ip0+1);
+ /* $$$ tune */
+ memcpy (sr0, t0->rewrite, vec_len (t0->rewrite));
+ /* Fix the next header chain */
+ sr0->protocol = ip0->protocol;
+ ip0->protocol = 43; /* routing extension header */
+ new_l0 = clib_net_to_host_u16(ip0->payload_length) +
+ vec_len (t0->rewrite);
+ ip0->payload_length = clib_host_to_net_u16(new_l0);
+ /* Rewrite the ip6 dst address */
+ ip0->dst_address.as_u64[0] = t0->first_hop.as_u64[0];
+ ip0->dst_address.as_u64[1] = t0->first_hop.as_u64[1];
+
+ sr_fix_hmac (sm, ip0, sr0);
+
+ next0 = sr_local_cb ? sr_local_cb (vm, node, b0, ip0, sr0) :
+ next0;
+
+ /*
+ * Ignore "do not rewrite" shtik in this path
+ */
+ if (PREDICT_FALSE (next0 & 0x80000000))
+ {
+ next0 ^= 0xFFFFFFFF;
+ if (PREDICT_FALSE(next0 == SR_REWRITE_NEXT_ERROR))
+ b0->error =
+ node->errors[SR_REWRITE_ERROR_APP_CALLBACK];
+ }
+ }
+
+ if (PREDICT_FALSE(ip1->protocol == 43))
+ {
+	      vlib_buffer_advance (b1, sizeof (ip1[0]));
+ sr1 = (ip6_sr_header_t *) (ip1+1);
+ new_l1 = clib_net_to_host_u16(ip1->payload_length);
+ next1 = SR_REWRITE_NEXT_SR_LOCAL;
+ }
+ else
+ {
+ copy_dst1 = (u64 *)(((u8 *)ip1) - vec_len (t1->rewrite));
+ copy_src1 = (u64 *) ip1;
+
+ copy_dst1 [0] = copy_src1 [0];
+ copy_dst1 [1] = copy_src1 [1];
+ copy_dst1 [2] = copy_src1 [2];
+ copy_dst1 [3] = copy_src1 [3];
+ copy_dst1 [4] = copy_src1 [4];
+ vlib_buffer_advance (b1, - (word) vec_len(t1->rewrite));
+ ip1 = vlib_buffer_get_current (b1);
+ sr1 = (ip6_sr_header_t *) (ip1+1);
+ memcpy (sr1, t1->rewrite, vec_len (t1->rewrite));
+ sr1->protocol = ip1->protocol;
+ ip1->protocol = 43;
+ new_l1 = clib_net_to_host_u16(ip1->payload_length) +
+ vec_len (t1->rewrite);
+ ip1->payload_length = clib_host_to_net_u16(new_l1);
+ ip1->dst_address.as_u64[0] = t1->first_hop.as_u64[0];
+ ip1->dst_address.as_u64[1] = t1->first_hop.as_u64[1];
+
+ sr_fix_hmac (sm, ip1, sr1);
+
+ next1 = sr_local_cb ? sr_local_cb (vm, node, b1, ip1, sr1) :
+ next1;
+
+ /*
+ * Ignore "do not rewrite" shtik in this path
+ */
+ if (PREDICT_FALSE (next1 & 0x80000000))
+ {
+ next1 ^= 0xFFFFFFFF;
+ if (PREDICT_FALSE(next1 == SR_REWRITE_NEXT_ERROR))
+ b1->error =
+ node->errors[SR_REWRITE_ERROR_APP_CALLBACK];
+ }
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_rewrite_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_index = t0 - sm->tunnels;
+ memcpy (tr->src.as_u8, ip0->src_address.as_u8,
+ sizeof (tr->src.as_u8));
+ memcpy (tr->dst.as_u8, ip0->dst_address.as_u8,
+ sizeof (tr->dst.as_u8));
+ tr->length = new_l0;
+ tr->next_index = next0;
+ memcpy (tr->sr, sr0, sizeof (tr->sr));
+ }
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_rewrite_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ tr->tunnel_index = t1 - sm->tunnels;
+ memcpy (tr->src.as_u8, ip1->src_address.as_u8,
+ sizeof (tr->src.as_u8));
+ memcpy (tr->dst.as_u8, ip1->dst_address.as_u8,
+ sizeof (tr->dst.as_u8));
+ tr->length = new_l1;
+ tr->next_index = next1;
+ memcpy (tr->sr, sr1, sizeof (tr->sr));
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip6_header_t * ip0;
+ ip_adjacency_t * adj0;
+ ip6_sr_header_t * sr0;
+ ip6_sr_tunnel_t * t0;
+ u64 * copy_src0, * copy_dst0;
+ u32 next0 = SR_REWRITE_NEXT_IP6_LOOKUP;
+ u16 new_l0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /*
+ * $$$ parse through header(s) to pick the point
+	   * where we punch in the SR extension header
+ */
+
+ adj0 = ip_get_adjacency (lm, vnet_buffer(b0)->ip.adj_index[VLIB_TX]);
+ t0 = pool_elt_at_index (sm->tunnels,
+ adj0->rewrite_header.sw_if_index);
+
+ ASSERT (VLIB_BUFFER_PRE_DATA_SIZE
+ >= ((word) vec_len (t0->rewrite)) + b0->current_data);
+
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->tx_fib_index;
+
+ ip0 = vlib_buffer_get_current (b0);
+
+ /*
+ * SR-unaware service chaining case: pkt coming back from
+ * service has the original dst address, and will already
+ * have an SR header. If so, send it to sr-local
+ */
+ if (PREDICT_FALSE(ip0->protocol == 43))
+ {
+	      vlib_buffer_advance (b0, sizeof (ip0[0]));
+ sr0 = (ip6_sr_header_t *) (ip0+1);
+ new_l0 = clib_net_to_host_u16(ip0->payload_length);
+ next0 = SR_REWRITE_NEXT_SR_LOCAL;
+ }
+ else
+ {
+ copy_dst0 = (u64 *)(((u8 *)ip0) - vec_len (t0->rewrite));
+ copy_src0 = (u64 *) ip0;
+
+	      /*
+	       * Copy the data preceding the punch-in point left (toward
+	       * lower addresses) by the rewrite length. Assume (for the
+	       * moment) that only the main packet header needs to move.
+	       */
+ copy_dst0 [0] = copy_src0 [0];
+ copy_dst0 [1] = copy_src0 [1];
+ copy_dst0 [2] = copy_src0 [2];
+ copy_dst0 [3] = copy_src0 [3];
+ copy_dst0 [4] = copy_src0 [4];
+ vlib_buffer_advance (b0, - (word) vec_len(t0->rewrite));
+ ip0 = vlib_buffer_get_current (b0);
+ sr0 = (ip6_sr_header_t *) (ip0+1);
+ /* $$$ tune */
+ memcpy (sr0, t0->rewrite, vec_len (t0->rewrite));
+ /* Fix the next header chain */
+ sr0->protocol = ip0->protocol;
+ ip0->protocol = 43; /* routing extension header */
+ new_l0 = clib_net_to_host_u16(ip0->payload_length) +
+ vec_len (t0->rewrite);
+ ip0->payload_length = clib_host_to_net_u16(new_l0);
+ /* Rewrite the ip6 dst address */
+ ip0->dst_address.as_u64[0] = t0->first_hop.as_u64[0];
+ ip0->dst_address.as_u64[1] = t0->first_hop.as_u64[1];
+
+ sr_fix_hmac (sm, ip0, sr0);
+
+ next0 = sr_local_cb ? sr_local_cb (vm, node, b0, ip0, sr0) :
+ next0;
+
+ /*
+ * Ignore "do not rewrite" shtik in this path
+ */
+ if (PREDICT_FALSE (next0 & 0x80000000))
+ {
+ next0 ^= 0xFFFFFFFF;
+ if (PREDICT_FALSE(next0 == SR_REWRITE_NEXT_ERROR))
+ b0->error =
+ node->errors[SR_REWRITE_ERROR_APP_CALLBACK];
+ }
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_rewrite_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ tr->tunnel_index = t0 - sm->tunnels;
+ memcpy (tr->src.as_u8, ip0->src_address.as_u8,
+ sizeof (tr->src.as_u8));
+ memcpy (tr->dst.as_u8, ip0->dst_address.as_u8,
+ sizeof (tr->dst.as_u8));
+ tr->length = new_l0;
+ tr->next_index = next0;
+ memcpy (tr->sr, sr0, sizeof (tr->sr));
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (sr_rewrite_node) = {
+ .function = sr_rewrite,
+ .name = "sr-rewrite",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .format_trace = format_sr_rewrite_trace,
+ .format_buffer = format_ip6_sr_header_with_length,
+
+ .n_errors = SR_REWRITE_N_ERROR,
+ .error_strings = sr_rewrite_error_strings,
+
+ .runtime_data_bytes = 0,
+
+ .n_next_nodes = SR_REWRITE_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [SR_REWRITE_NEXT_##s] = n,
+ foreach_sr_rewrite_next
+#undef _
+ },
+};
+
+static int ip6_delete_route_no_next_hop (ip6_address_t *dst_address_arg,
+ u32 dst_address_length,
+ u32 rx_table_id)
+{
+ ip6_add_del_route_args_t a;
+ ip6_address_t dst_address;
+ ip6_fib_t * fib;
+ ip6_main_t * im6 = &ip6_main;
+ BVT(clib_bihash_kv) kv, value;
+
+ fib = find_ip6_fib_by_table_index_or_id (im6, rx_table_id,
+ IP6_ROUTE_FLAG_TABLE_ID);
+ memset (&a, 0, sizeof (a));
+  a.flags |= IP6_ROUTE_FLAG_DEL;
+ a.dst_address_length = dst_address_length;
+
+ dst_address = *dst_address_arg;
+
+ ip6_address_mask (&dst_address,
+ &im6->fib_masks[dst_address_length]);
+
+ kv.key[0] = dst_address.as_u64[0];
+ kv.key[1] = dst_address.as_u64[1];
+ kv.key[2] = ((u64)((fib - im6->fibs))<<32) | dst_address_length;
+
+ if (BV(clib_bihash_search)(&im6->ip6_lookup_table, &kv, &value) < 0)
+ {
+      clib_warning ("%U/%d not in FIB",
+                    format_ip6_address, &dst_address,
+                    dst_address_length);
+ return -10;
+ }
+
+ a.adj_index = value.value;
+ a.dst_address = dst_address;
+
+ ip6_add_del_route (im6, &a);
+ ip6_maybe_remap_adjacencies (im6, rx_table_id, IP6_ROUTE_FLAG_TABLE_ID);
+ return 0;
+}
+
+static ip6_sr_hmac_key_t *
+find_or_add_shared_secret (ip6_sr_main_t * sm, u8 * secret, u32 * indexp)
+{
+ uword * p;
+ ip6_sr_hmac_key_t * key = 0;
+ int i;
+
+ p = hash_get_mem (sm->hmac_key_by_shared_secret, secret);
+
+ if (p)
+ {
+ key = vec_elt_at_index (sm->hmac_keys, p[0]);
+ if (indexp)
+ *indexp = p[0];
+ return (key);
+ }
+
+ /* Specific key ID? */
+ if (indexp && *indexp)
+ {
+ vec_validate (sm->hmac_keys, *indexp);
+ key = sm->hmac_keys + *indexp;
+ }
+ else
+ {
+      for (i = 0; i < vec_len (sm->hmac_keys); i++)
+	{
+	  if (sm->hmac_keys[i].shared_secret == 0)
+	    {
+	      key = sm->hmac_keys + i;
+	      goto found;
+	    }
+	}
+ vec_validate (sm->hmac_keys, i);
+ key = sm->hmac_keys + i;
+ found:
+ ;
+ }
+
+ key->shared_secret = vec_dup (secret);
+
+ hash_set_mem (sm->hmac_key_by_shared_secret, key->shared_secret,
+ key - sm->hmac_keys);
+
+ if (indexp)
+ *indexp = key - sm->hmac_keys;
+ return (key);
+}
+
+int ip6_sr_add_del_tunnel (ip6_sr_add_del_tunnel_args_t * a)
+{
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip6_sr_tunnel_key_t key;
+ ip6_sr_tunnel_t * t;
+ uword * p;
+ ip6_sr_header_t * h = 0;
+ u32 header_length;
+ ip6_address_t * addrp, *this_address;
+ ip_adjacency_t adj, * ap, * add_adj = 0;
+ u32 adj_index;
+ ip6_sr_main_t * sm = &sr_main;
+ u8 * key_copy;
+ u32 rx_fib_index, tx_fib_index;
+ ip6_add_del_route_args_t aa;
+ u32 hmac_key_index_u32;
+ u8 hmac_key_index = 0;
+
+ /* Make sure that the rx FIB exists */
+ p = hash_get (im->fib_index_by_table_id, a->rx_table_id);
+
+ if (p == 0)
+ return -3;
+
+ /* remember the FIB index */
+ rx_fib_index = p[0];
+
+ /* Make sure that the supplied FIB exists */
+ p = hash_get (im->fib_index_by_table_id, a->tx_table_id);
+
+ if (p == 0)
+ return -4;
+
+ /* remember the FIB index */
+ tx_fib_index = p[0];
+
+ memcpy (key.src.as_u8, a->src_address->as_u8, sizeof (key.src));
+ memcpy (key.dst.as_u8, a->dst_address->as_u8, sizeof (key.dst));
+
+ p = hash_get_mem (sm->tunnel_index_by_key, &key);
+
+ if (p)
+ {
+ if (a->is_del)
+ {
+ hash_pair_t *hp;
+
+ /* Delete existing tunnel */
+ t = pool_elt_at_index (sm->tunnels, p[0]);
+
+ ip6_delete_route_no_next_hop (&t->key.dst, t->dst_mask_width,
+ a->rx_table_id);
+ vec_free (t->rewrite);
+ pool_put (sm->tunnels, t);
+ hp = hash_get_pair (sm->tunnel_index_by_key, &key);
+ key_copy = (void *)(hp->key);
+ hash_unset_mem (sm->tunnel_index_by_key, &key);
+ vec_free (key_copy);
+ return 0;
+ }
+ else /* create; tunnel already exists; complain */
+ return -1;
+ }
+ else
+ {
+ /* delete; tunnel does not exist; complain */
+ if (a->is_del)
+ return -2;
+ }
+
+ /* create a new tunnel */
+ pool_get (sm->tunnels, t);
+ memset (t, 0, sizeof (*t));
+
+ memcpy (&t->key, &key, sizeof (t->key));
+ t->dst_mask_width = a->dst_mask_width;
+ t->rx_fib_index = rx_fib_index;
+ t->tx_fib_index = tx_fib_index;
+
+ /* The first specified hop goes right into the dst address */
+ if (vec_len(a->segments))
+ {
+ t->first_hop = a->segments[0];
+ /* It won't feel nice if we do it twice */
+ vec_delete (a->segments, 1, 0);
+ }
+ else /* there must be at least one segment... */
+ return -4;
+
+ /*
+ * Create the sr header rewrite string
+ * We append the dst address to the set of next hops
+ * so the ultimate recipient can tell where the
+ * packet entered the SR domain
+ */
+ header_length = sizeof (*h) +
+ sizeof (ip6_address_t) * (vec_len (a->segments) + vec_len (a->tags));
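+  /*
+   * e.g. 2 remaining segments + 1 tag + HMAC:
+   * 8 (fixed SRH) + 16 * 3 + 32 = 88 octets, so h->length below
+   * becomes 88/8 - 1 = 10.
+   */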
+
+ if (a->shared_secret)
+ {
+ /* Allocate a new key slot if we don't find the secret key */
+ hmac_key_index_u32 = 0;
+ (void) find_or_add_shared_secret (sm, a->shared_secret,
+ &hmac_key_index_u32);
+
+      /* Hey Vinz Clortho: Gozer is pissed.. you're out of keys! */
+ if (hmac_key_index_u32 >= 256)
+ return -5;
+ hmac_key_index = hmac_key_index_u32;
+ header_length += SHA256_DIGEST_LENGTH;
+ }
+
+ vec_validate (t->rewrite, header_length-1);
+
+ h = (ip6_sr_header_t *) t->rewrite;
+
+ h->protocol = 0xFF; /* we don't know yet */
+
+ h->length = (header_length/8) - 1;
+ h->type = ROUTING_HEADER_TYPE_SR;
+ h->segments_left = vec_len (a->segments);
+ h->first_segment = vec_len(a->segments) -1;
+ if (a->shared_secret)
+ h->hmac_key = hmac_key_index & 0xFF;
+
+ h->flags = a->flags_net_byte_order;
+
+ /* Paint on the segment list, in reverse */
+ addrp = h->segments + (vec_len (a->segments) - 1);
+
+ vec_foreach (this_address, a->segments)
+ {
+ memcpy (addrp->as_u8, this_address->as_u8, sizeof (ip6_address_t));
+ addrp--;
+ }
+
+ /* Paint on the tag list, not reversed */
+ addrp = h->segments + vec_len(a->segments);
+
+ vec_foreach (this_address, a->tags)
+ {
+ memcpy (addrp->as_u8, this_address->as_u8, sizeof (ip6_address_t));
+ addrp++;
+ }
+
+ key_copy = vec_new (ip6_sr_tunnel_key_t, 1);
+ memcpy (key_copy, &key, sizeof (ip6_sr_tunnel_key_t));
+ hash_set_mem (sm->tunnel_index_by_key, key_copy, t - sm->tunnels);
+
+ memset(&adj, 0, sizeof (adj));
+
+  /* Create an adjacency and add to the v6 FIB; ip6-lookup dispatches
+     matching packets straight to sr-rewrite via this next index */
+  adj.lookup_next_index = sm->ip6_lookup_sr_next_index;
+ adj.explicit_fib_index = ~0;
+
+ ap = ip_add_adjacency (lm, &adj, 1 /* one adj */,
+ &adj_index);
+
+ /*
+ * Stick the tunnel index into the rewrite header.
+ *
+ * Unfortunately, inserting an SR header according to the various
+   * RFCs requires parsing through the ip6 header, perhaps consing a
+ * buffer onto the head of the vlib_buffer_t, etc. We don't use the
+ * normal reverse bcopy rewrite code.
+ *
+ * We don't handle ugly RFC-related cases yet, but I'm sure PL will complain
+ * at some point...
+ */
+ ap->rewrite_header.sw_if_index = t - sm->tunnels;
+
+ vec_add1 (add_adj, ap[0]);
+
+  memset (&aa, 0, sizeof (aa));
+  memcpy (aa.dst_address.as_u8, a->dst_address, sizeof (aa.dst_address.as_u8));
+ aa.dst_address_length = a->dst_mask_width;
+
+ aa.flags = (a->is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD);
+ aa.flags |= IP6_ROUTE_FLAG_FIB_INDEX;
+ aa.table_index_or_table_id = rx_fib_index;
+ aa.add_adj = add_adj;
+ aa.adj_index = adj_index;
+ aa.n_add_adj = 1;
+ ip6_add_del_route (im, &aa);
+ vec_free (add_adj);
+
+ return 0;
+}
+
+static clib_error_t *
+sr_add_del_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ int is_del = 0;
+ ip6_address_t src_address;
+ int src_address_set = 0;
+ ip6_address_t dst_address;
+ u32 dst_mask_width;
+ int dst_address_set = 0;
+ u16 flags = 0;
+ u8 *shared_secret = 0;
+ u32 rx_table_id = 0;
+ u32 tx_table_id = 0;
+ ip6_address_t * segments = 0;
+ ip6_address_t * this_seg;
+ ip6_address_t * tags = 0;
+ ip6_address_t * this_tag;
+ ip6_sr_add_del_tunnel_args_t _a, *a=&_a;
+ ip6_address_t next_address, tag;
+ int pl_index;
+ int rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "del"))
+ is_del = 1;
+ else if (unformat (input, "rx-fib-id %d", &rx_table_id))
+ ;
+ else if (unformat (input, "tx-fib-id %d", &tx_table_id))
+ ;
+ else if (unformat (input, "src %U", unformat_ip6_address, &src_address))
+ src_address_set = 1;
+ else if (unformat (input, "dst %U/%d",
+ unformat_ip6_address, &dst_address,
+ &dst_mask_width))
+ dst_address_set = 1;
+ else if (unformat (input, "next %U", unformat_ip6_address,
+ &next_address))
+ {
+ vec_add2 (segments, this_seg, 1);
+ memcpy (this_seg->as_u8, next_address.as_u8, sizeof (*this_seg));
+ }
+ else if (unformat (input, "tag %U", unformat_ip6_address,
+ &tag))
+ {
+ vec_add2 (tags, this_tag, 1);
+ memcpy (this_tag->as_u8, tag.as_u8, sizeof (*this_tag));
+ }
+ else if (unformat (input, "clean"))
+ flags |= IP6_SR_HEADER_FLAG_CLEANUP;
+ else if (unformat (input, "protected"))
+ flags |= IP6_SR_HEADER_FLAG_PROTECTED;
+ else if (unformat (input, "key %s", &shared_secret))
+ /* Do not include the trailing NULL byte. Guaranteed interop issue */
+ _vec_len (shared_secret) -= 1;
+ else if (unformat (input, "InPE %d", &pl_index))
+ {
+ if (pl_index <= 0 || pl_index > 4)
+ {
+ pl_index_range_error:
+ return clib_error_return
+ (0, "Policy List Element Index %d out of range (1-4)", pl_index);
+
+ }
+ flags |= IP6_SR_HEADER_FLAG_PL_ELT_INGRESS_PE
+ << ip6_sr_policy_list_shift_from_index (pl_index);
+ }
+ else if (unformat (input, "EgPE %d", &pl_index))
+ {
+ if (pl_index <= 0 || pl_index > 4)
+ goto pl_index_range_error;
+ flags |= IP6_SR_HEADER_FLAG_PL_ELT_EGRESS_PE
+ << ip6_sr_policy_list_shift_from_index (pl_index);
+ }
+ else if (unformat (input, "OrgSrc %d", &pl_index))
+ {
+ if (pl_index <= 0 || pl_index > 4)
+ goto pl_index_range_error;
+ flags |= IP6_SR_HEADER_FLAG_PL_ELT_ORIG_SRC_ADDR
+ << ip6_sr_policy_list_shift_from_index (pl_index);
+ }
+ else
+ break;
+ }
+
+ if (!src_address_set)
+ return clib_error_return (0, "src address required");
+
+ if (!dst_address_set)
+ return clib_error_return (0, "dst address required");
+
+ if (!segments)
+ return clib_error_return (0, "at least one sr segment required");
+
+ memset (a, 0, sizeof (*a));
+ a->src_address = &src_address;
+ a->dst_address = &dst_address;
+ a->dst_mask_width = dst_mask_width;
+ a->segments = segments;
+ a->tags = tags;
+ a->flags_net_byte_order = clib_host_to_net_u16(flags);
+ a->is_del = is_del;
+ a->rx_table_id = rx_table_id;
+ a->tx_table_id = tx_table_id;
+ a->shared_secret = shared_secret;
+
+ rv = ip6_sr_add_del_tunnel (a);
+
+ vec_free (segments);
+ vec_free (tags);
+ vec_free (shared_secret);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ case -1:
+ return clib_error_return (0, "SR tunnel src %U dst %U already exists",
+ format_ip6_address, &src_address,
+ format_ip6_address, &dst_address);
+
+ case -2:
+ return clib_error_return (0, "SR tunnel src %U dst %U does not exist",
+ format_ip6_address, &src_address,
+ format_ip6_address, &dst_address);
+
+ case -3:
+ return clib_error_return (0, "FIB table %d does not exist", rx_table_id);
+
+    case -4:
+      return clib_error_return (0, "FIB table %d does not exist", tx_table_id);
+
+ default:
+ return clib_error_return (0, "BUG: ip6_sr_add_del_tunnel returns %d",
+ rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (sr_tunnel_command, static) = {
+ .path = "sr tunnel",
+ .short_help =
+ "sr tunnel [del] <src> <dst> [next <addr>] [cleanup] [reroute] [key %s]",
+ .function = sr_add_del_tunnel_command_fn,
+};
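+/*
+ * Example (hypothetical addresses and key):
+ *   sr tunnel src 2001:db8::1 dst 2001:db8:2::/64 next 2001:db8::10
+ *      next 2001:db8::11 tag 2001:db8::ff clean key my-secret
+ *      InPE 1 rx-fib-id 0 tx-fib-id 0
+ */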
+
+
+static clib_error_t *
+show_sr_tunnel_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ static ip6_sr_tunnel_t ** tunnels;
+ ip6_sr_tunnel_t * t;
+ ip6_sr_main_t * sm = &sr_main;
+ ip6_main_t * im = &ip6_main;
+ ip6_fib_t * rx_fib, * tx_fib;
+ int i;
+
+ vec_reset_length (tunnels);
+
+ pool_foreach (t, sm->tunnels,
+ ({
+ vec_add1 (tunnels, t);
+ }));
+
+ if (vec_len (tunnels) == 0)
+ vlib_cli_output (vm, "No SR tunnels configured");
+
+ for (i = 0; i < vec_len (tunnels); i++)
+ {
+ t = tunnels [i];
+
+ rx_fib = find_ip6_fib_by_table_index_or_id (im, t->rx_fib_index,
+ IP6_ROUTE_FLAG_FIB_INDEX);
+
+ tx_fib = find_ip6_fib_by_table_index_or_id (im, t->tx_fib_index,
+ IP6_ROUTE_FLAG_FIB_INDEX);
+
+ vlib_cli_output (vm, "src %U dst %U first hop %U",
+ format_ip6_address, &t->key.src,
+ format_ip6_address, &t->key.dst,
+ format_ip6_address, &t->first_hop);
+ vlib_cli_output (vm, " rx-fib-id %d tx-fib-id %d",
+ rx_fib->table_id, tx_fib->table_id);
+ vlib_cli_output (vm, " sr: %U", format_ip6_sr_header, t->rewrite,
+ 0 /* print_hmac */);
+ vlib_cli_output (vm, "-------");
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_sr_tunnel_command, static) = {
+ .path = "show sr tunnel",
+ .short_help = "show sr tunnel",
+ .function = show_sr_tunnel_fn,
+};
+
+#define foreach_sr_fix_dst_addr_next \
+_(DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) SR_FIX_DST_ADDR_NEXT_##s,
+foreach_sr_fix_dst_addr_next
+#undef _
+ SR_FIX_DST_ADDR_N_NEXT,
+} sr_fix_dst_addr_next_t;
+
+static char * sr_fix_dst_error_strings[] = {
+#define sr_fix_dst_error(n,s) s,
+#include "sr_fix_dst_error.def"
+#undef sr_fix_dst_error
+};
+
+typedef enum {
+#define sr_fix_dst_error(n,s) SR_FIX_DST_ERROR_##n,
+#include "sr_fix_dst_error.def"
+#undef sr_fix_dst_error
+ SR_FIX_DST_N_ERROR,
+} sr_fix_dst_error_t;
+
+typedef struct {
+ ip6_address_t src, dst;
+ u32 next_index;
+ u32 adj_index;
+ u8 sr[256];
+} sr_fix_addr_trace_t;
+
+u8 * format_sr_fix_addr_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ sr_fix_addr_trace_t * t = va_arg (*args, sr_fix_addr_trace_t *);
+ vnet_hw_interface_t * hi = 0;
+ ip_adjacency_t * adj;
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ vnet_main_t * vnm = vnet_get_main();
+
+ if (t->adj_index != ~0)
+ {
+ adj = ip_get_adjacency (lm, t->adj_index);
+ hi = vnet_get_sup_hw_interface (vnm, adj->rewrite_header.sw_if_index);
+ }
+
+ s = format (s, "SR-FIX_ADDR: next %s ip6 src %U dst %U\n",
+ (t->next_index == SR_FIX_DST_ADDR_NEXT_DROP)
+ ? "drop" : "output",
+ format_ip6_address, &t->src,
+ format_ip6_address, &t->dst);
+ if (t->next_index != SR_FIX_DST_ADDR_NEXT_DROP)
+ {
+ s = format (s, "%U\n", format_ip6_sr_header, t->sr, 1 /* print_hmac */);
+ s = format (s, " output via %s", hi ? (char *)(hi->name)
+ : "Invalid adj");
+ }
+ return s;
+}
+
+static uword
+sr_fix_dst_addr (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+#if 0
+ while (0 && n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ __attribute__((unused)) vlib_buffer_t * b0, * b1;
+ u32 next0 = SR_FIX_DST_ADDR_NEXT_DROP;
+ u32 next1 = SR_FIX_DST_ADDR_NEXT_DROP;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+#endif
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip6_header_t * ip0;
+ ip_adjacency_t * adj0;
+ ip6_sr_header_t * sr0;
+ u32 next0 = SR_FIX_DST_ADDR_NEXT_DROP;
+ ip6_address_t *new_dst0;
+ ethernet_header_t * eh0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+	  adj0 = ip_get_adjacency (lm, vnet_buffer(b0)->ip.adj_index[VLIB_TX]);
+	  /* next index stashed here by set_ip6_sr_rewrite_fn (see below) */
+	  next0 = adj0->mcast_group_index;
+
+ /* We should be pointing at an Ethernet header... */
+ eh0 = vlib_buffer_get_current (b0);
+ ip0 = (ip6_header_t *)(eh0+1);
+ sr0 = (ip6_sr_header_t *) (ip0+1);
+
+ /* We'd better find an SR header... */
+ if (PREDICT_FALSE(ip0->protocol != 43))
+ {
+ b0->error = node->errors[SR_FIX_DST_ERROR_NO_SR_HEADER];
+ goto do_trace0;
+ }
+ else
+ {
+	      /*
+	       * We get here from sr_rewrite or sr_local, with the copy
+	       * of the original dst address still in the segment list.
+	       */
+
+ /* Out of segments? Turf the packet */
+ if (PREDICT_FALSE (sr0->segments_left == 0))
+ {
+ b0->error = node->errors[SR_FIX_DST_ERROR_NO_MORE_SEGMENTS];
+ goto do_trace0;
+ }
+
+ /*
+ * Rewrite the packet with the original dst address
+ * We assume that the last segment (in processing order) contains
+ * the original dst address. The list is reversed, so sr0->segments
+ * contains the original dst address.
+ */
+ new_dst0 = sr0->segments;
+ ip0->dst_address.as_u64[0] = new_dst0->as_u64[0];
+ ip0->dst_address.as_u64[1] = new_dst0->as_u64[1];
+ }
+
+ do_trace0:
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_fix_addr_trace_t *t = vlib_add_trace (vm, node,
+ b0, sizeof (*t));
+ t->next_index = next0;
+ t->adj_index = ~0;
+
+ if (next0 != SR_FIX_DST_ADDR_NEXT_DROP)
+ {
+ t->adj_index = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+ memcpy (t->src.as_u8, ip0->src_address.as_u8,
+ sizeof (t->src.as_u8));
+ memcpy (t->dst.as_u8, ip0->dst_address.as_u8,
+ sizeof (t->dst.as_u8));
+ memcpy (t->sr, sr0, sizeof (t->sr));
+ }
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ return from_frame->n_vectors;
+}
+
+
+VLIB_REGISTER_NODE (sr_fix_dst_addr_node) = {
+ .function = sr_fix_dst_addr,
+ .name = "sr-fix-dst-addr",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .format_trace = format_sr_fix_addr_trace,
+ .format_buffer = format_ip6_sr_header_with_length,
+
+ .runtime_data_bytes = 0,
+
+ .n_errors = SR_FIX_DST_N_ERROR,
+ .error_strings = sr_fix_dst_error_strings,
+
+ .n_next_nodes = SR_FIX_DST_ADDR_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [SR_FIX_DST_ADDR_NEXT_##s] = n,
+ foreach_sr_fix_dst_addr_next
+#undef _
+ },
+};
+
+static clib_error_t * sr_init (vlib_main_t * vm)
+{
+ ip6_sr_main_t * sm = &sr_main;
+ clib_error_t * error = 0;
+ vlib_node_t * ip6_lookup_node, * ip6_rewrite_node;
+ vlib_node_t * ip6_rewrite_local_node;
+ u32 verify_next_index;
+
+ if ((error = vlib_call_init_function (vm, ip_main_init)))
+ return error;
+
+ if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
+ return error;
+
+ sm->vlib_main = vm;
+ sm->vnet_main = vnet_get_main();
+
+  vec_validate (sm->hmac_keys, 0);
+  /* reserve key id 0 ("no HMAC") with a poison value */
+  sm->hmac_keys[0].shared_secret = (u8 *) 0xdeadbeef;
+
+ sm->tunnel_index_by_key =
+ hash_create_mem (0, sizeof (ip6_sr_tunnel_key_t), sizeof (uword));
+
+ sm->hmac_key_by_shared_secret = hash_create_string (0, sizeof(uword));
+
+ ip6_register_protocol (43, sr_local_node.index);
+
+ ip6_lookup_node = vlib_get_node_by_name (vm, (u8 *)"ip6-lookup");
+ ASSERT(ip6_lookup_node);
+
+ ip6_rewrite_node = vlib_get_node_by_name (vm, (u8 *)"ip6-rewrite");
+ ASSERT(ip6_rewrite_node);
+
+ ip6_rewrite_local_node = vlib_get_node_by_name (vm,
+ (u8 *)"ip6-rewrite-local");
+ ASSERT(ip6_rewrite_local_node);
+
+ /* Add a disposition to ip6_lookup for the sr rewrite node */
+ sm->ip6_lookup_sr_next_index =
+ vlib_node_add_next (vm, ip6_lookup_node->index, sr_rewrite_node.index);
+
+ /* Add a disposition to ip6_rewrite for the sr dst address hack node */
+ sm->ip6_rewrite_sr_next_index =
+ vlib_node_add_next (vm, ip6_rewrite_node->index,
+ sr_fix_dst_addr_node.index);
+ /*
+ * Fix ip6-rewrite-local, sibling of the above. The sibling bitmap
+ * isn't set up at this point, so we have to do it manually
+ */
+ verify_next_index = vlib_node_add_next
+ (vm, ip6_rewrite_local_node->index,
+ sr_fix_dst_addr_node.index);
+
+ ASSERT(sm->ip6_rewrite_sr_next_index == verify_next_index);
+
+ OpenSSL_add_all_digests();
+
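+  /*
+   * NB: despite the SHA-256-sized signature field, the digest in use
+   * is HMAC-SHA1 (20 octets), zero-padded to SHA256_DIGEST_LENGTH.
+   */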
+ sm->md = (void *) EVP_get_digestbyname ("sha1");
+ sm->hmac_ctx = clib_mem_alloc (sizeof (HMAC_CTX));
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (sr_init);
+
+#define foreach_sr_local_next \
+ _ (ERROR, "error-drop") \
+ _ (IP6_LOOKUP, "ip6-lookup")
+
+typedef enum {
+#define _(s,n) SR_LOCAL_NEXT_##s,
+ foreach_sr_local_next
+#undef _
+ SR_LOCAL_N_NEXT,
+} sr_local_next_t;
+
+typedef struct {
+ u8 next_index;
+ u8 sr_valid;
+ ip6_address_t src, dst;
+ u16 length;
+ u8 sr[256];
+} sr_local_trace_t;
+
+static char * sr_local_error_strings[] = {
+#define sr_error(n,s) s,
+#include "sr_error.def"
+#undef sr_error
+};
+
+typedef enum {
+#define sr_error(n,s) SR_LOCAL_ERROR_##n,
+#include "sr_error.def"
+#undef sr_error
+ SR_LOCAL_N_ERROR,
+} sr_local_error_t;
+
+u8 * format_sr_local_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ sr_local_trace_t * t = va_arg (*args, sr_local_trace_t *);
+
+ s = format (s, "SR-LOCAL: src %U dst %U len %u next_index %d",
+ format_ip6_address, &t->src,
+ format_ip6_address, &t->dst, t->length, t->next_index);
+ if (t->sr_valid)
+ s = format (s, "\n %U", format_ip6_sr_header, t->sr, 1 /* print_hmac */);
+ else
+ s = format (s, "\n popped SR header");
+
+ return s;
+}
+
+
+/* $$$$ fixme: smp, don't copy data, cache input, output (maybe) */
+
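+/*
+ * Recompute the HMAC exactly as sr_fix_hmac () does and memcmp it
+ * against the signature carried in the packet, read from the same
+ * offset sr_fix_hmac writes to. Returns 0 on match (or key id 0),
+ * nonzero on mismatch or unknown key.
+ */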
+static int sr_validate_hmac (ip6_sr_main_t * sm, ip6_header_t * ip,
+ ip6_sr_header_t * sr)
+{
+ u32 key_index;
+ static u8 * keybuf;
+ u8 * copy_target;
+ int first_segment;
+ ip6_address_t *addrp;
+ int i;
+ ip6_sr_hmac_key_t * hmac_key;
+ static u8 * signature;
+ u32 sig_len;
+
+ key_index = sr->hmac_key;
+
+ /* No signature? Pass... */
+ if (key_index == 0)
+ return 0;
+
+ /* We don't know about this key? Fail... */
+ if (key_index >= vec_len (sm->hmac_keys))
+ return 1;
+
+ vec_validate (signature, SHA256_DIGEST_LENGTH-1);
+
+ hmac_key = sm->hmac_keys + key_index;
+
+ vec_reset_length (keybuf);
+
+ /* pkt ip6 src address */
+ vec_add2 (keybuf, copy_target, sizeof (ip6_address_t));
+ memcpy (copy_target, ip->src_address.as_u8, sizeof (ip6_address_t));
+
+  /* first segment */
+ vec_add2 (keybuf, copy_target, 1);
+ copy_target[0] = sr->first_segment;
+
+ /* octet w/ bit 0 = "clean" flag */
+ vec_add2 (keybuf, copy_target, 1);
+ copy_target[0]
+ = (sr->flags & clib_host_to_net_u16 (IP6_SR_HEADER_FLAG_CLEANUP))
+ ? 0x80 : 0;
+
+ /* hmac key id */
+ vec_add2 (keybuf, copy_target, 1);
+ copy_target[0] = sr->hmac_key;
+
+ first_segment = sr->first_segment;
+
+ addrp = sr->segments;
+
+ /* segments */
+ for (i = 0; i <= first_segment; i++)
+ {
+ vec_add2 (keybuf, copy_target, sizeof (ip6_address_t));
+ memcpy (copy_target, addrp->as_u8, sizeof (ip6_address_t));
+ addrp++;
+ }
+
+ if (sm->is_debug)
+ clib_warning ("verify key index %d keybuf: %U", key_index,
+ format_hex_bytes, keybuf, vec_len(keybuf));
+
+ /* shared secret */
+
+ /* SHA1 is shorter than SHA-256 */
+ memset (signature, 0, vec_len(signature));
+
+ HMAC_CTX_init(sm->hmac_ctx);
+ if (!HMAC_Init(sm->hmac_ctx, hmac_key->shared_secret,
+ vec_len(hmac_key->shared_secret),sm->md))
+ clib_warning ("barf1");
+ if (!HMAC_Update(sm->hmac_ctx,keybuf,vec_len(keybuf)))
+ clib_warning ("barf2");
+ if (!HMAC_Final(sm->hmac_ctx,signature,&sig_len))
+ clib_warning ("barf3");
+ HMAC_CTX_cleanup(sm->hmac_ctx);
+
+ if (sm->is_debug)
+ clib_warning ("computed signature len %d, value %U", sig_len,
+ format_hex_bytes, signature, vec_len(signature));
+
+ /* Point at the SHA signature in the packet */
+ addrp++;
+ if (sm->is_debug)
+ clib_warning ("read signature %U", format_hex_bytes, addrp,
+ SHA256_DIGEST_LENGTH);
+
+ return memcmp (signature, addrp, SHA256_DIGEST_LENGTH);
+}
+
+static uword
+sr_local (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ ip6_sr_main_t * sm = &sr_main;
+ u32 (*sr_local_cb) (vlib_main_t *, vlib_node_runtime_t *,
+ vlib_buffer_t *, ip6_header_t *,
+ ip6_sr_header_t *);
+ sr_local_cb = sm->sr_local_cb;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ ip6_header_t * ip0, *ip1;
+ ip6_sr_header_t * sr0, *sr1;
+ ip6_address_t * new_dst0, * new_dst1;
+ u32 next0 = SR_LOCAL_NEXT_IP6_LOOKUP;
+ u32 next1 = SR_LOCAL_NEXT_IP6_LOOKUP;
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+
+ b0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (b0);
+ sr0 = (ip6_sr_header_t *)(ip0+1);
+
+ if (PREDICT_FALSE(sr0->type != ROUTING_HEADER_TYPE_SR))
+ {
+ next0 = SR_LOCAL_NEXT_ERROR;
+ b0->error = node->errors[SR_LOCAL_ERROR_BAD_ROUTING_HEADER_TYPE];
+ goto do_trace0;
+ }
+
+ /* Out of segments? Turf the packet */
+ if (PREDICT_FALSE (sr0->segments_left == 0))
+ {
+ next0 = SR_LOCAL_NEXT_ERROR;
+ b0->error = node->errors[SR_LOCAL_ERROR_NO_MORE_SEGMENTS];
+ goto do_trace0;
+ }
+
+ if (PREDICT_FALSE(sm->validate_hmac))
+ {
+ if (sr_validate_hmac (sm, ip0, sr0))
+ {
+ next0 = SR_LOCAL_NEXT_ERROR;
+ b0->error = node->errors[SR_LOCAL_ERROR_HMAC_INVALID];
+ goto do_trace0;
+ }
+ }
+
+ next0 = sr_local_cb ? sr_local_cb (vm, node, b0, ip0, sr0) :
+ next0;
+
+ /*
+ * To suppress rewrite, return ~SR_LOCAL_NEXT_xxx
+ */
+ if (PREDICT_FALSE (next0 & 0x80000000))
+ {
+ next0 ^= 0xFFFFFFFF;
+ if (PREDICT_FALSE(next0 == SR_LOCAL_NEXT_ERROR))
+ b0->error =
+ node->errors[SR_LOCAL_ERROR_APP_CALLBACK];
+ }
+ else
+ {
+ u32 segment_index0;
+
+ segment_index0 = sr0->segments_left - 1;
+
+ /* Rewrite the packet */
+ new_dst0 = (ip6_address_t *)(sr0->segments + segment_index0);
+ ip0->dst_address.as_u64[0] = new_dst0->as_u64[0];
+ ip0->dst_address.as_u64[1] = new_dst0->as_u64[1];
+
+ if (PREDICT_TRUE (sr0->segments_left > 0))
+ {
+ sr0->segments_left -= 1;
+ goto do_trace0;
+ }
+ }
+
+ /* End of the path. Clean up the SR header, or not */
+ if (sr0->flags & clib_host_to_net_u16(IP6_SR_HEADER_FLAG_CLEANUP))
+ {
+ u64 *copy_dst0, *copy_src0;
+ u16 new_l0;
+ /*
+ * Copy the ip6 header right by the (real) length of the
+ * sr header. Here's another place which assumes that
+	       * the sr header is the only extension header.
+ */
+
+ ip0->protocol = sr0->protocol;
+ vlib_buffer_advance (b0, (sr0->length+1)*8);
+
+ new_l0 = clib_net_to_host_u16(ip0->payload_length) -
+ (sr0->length+1)*8;
+ ip0->payload_length = clib_host_to_net_u16(new_l0);
+
+ copy_src0 = (u64 *)ip0;
+ copy_dst0 = copy_src0 + (sr0->length + 1);
+
+ copy_dst0 [4] = copy_src0[4];
+ copy_dst0 [3] = copy_src0[3];
+ copy_dst0 [2] = copy_src0[2];
+ copy_dst0 [1] = copy_src0[1];
+ copy_dst0 [0] = copy_src0[0];
+
+ sr0 = 0;
+ }
+
+ do_trace0:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_local_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ memcpy (tr->src.as_u8, ip0->src_address.as_u8,
+ sizeof (tr->src.as_u8));
+ memcpy (tr->dst.as_u8, ip0->dst_address.as_u8,
+ sizeof (tr->dst.as_u8));
+ tr->length = vlib_buffer_length_in_chain (vm, b0);
+ tr->next_index = next0;
+ tr->sr_valid = sr0 != 0;
+ if (tr->sr_valid)
+ memcpy (tr->sr, sr0, sizeof (tr->sr));
+ }
+
+ b1 = vlib_get_buffer (vm, bi1);
+ ip1 = vlib_buffer_get_current (b1);
+ sr1 = (ip6_sr_header_t *)(ip1+1);
+
+ if (PREDICT_FALSE(sr1->type != ROUTING_HEADER_TYPE_SR))
+ {
+ next1 = SR_LOCAL_NEXT_ERROR;
+ b1->error = node->errors[SR_LOCAL_ERROR_BAD_ROUTING_HEADER_TYPE];
+ goto do_trace1;
+ }
+
+ /* Out of segments? Turf the packet */
+ if (PREDICT_FALSE (sr1->segments_left == 0))
+ {
+ next1 = SR_LOCAL_NEXT_ERROR;
+ b1->error = node->errors[SR_LOCAL_ERROR_NO_MORE_SEGMENTS];
+ goto do_trace1;
+ }
+
+ if (PREDICT_FALSE(sm->validate_hmac))
+ {
+ if (sr_validate_hmac (sm, ip1, sr1))
+ {
+ next1 = SR_LOCAL_NEXT_ERROR;
+ b1->error = node->errors[SR_LOCAL_ERROR_HMAC_INVALID];
+ goto do_trace1;
+ }
+ }
+
+ next1 = sr_local_cb ? sr_local_cb (vm, node, b1, ip1, sr1) :
+ next1;
+
+ /*
+ * To suppress rewrite, return ~SR_LOCAL_NEXT_xxx
+ */
+ if (PREDICT_FALSE (next1 & 0x80000000))
+ {
+ next1 ^= 0xFFFFFFFF;
+ if (PREDICT_FALSE(next1 == SR_LOCAL_NEXT_ERROR))
+ b1->error =
+ node->errors[SR_LOCAL_ERROR_APP_CALLBACK];
+ }
+ else
+ {
+ u32 segment_index1;
+
+ segment_index1 = sr1->segments_left - 1;
+
+ /* Rewrite the packet */
+ new_dst1 = (ip6_address_t *)(sr1->segments + segment_index1);
+ ip1->dst_address.as_u64[0] = new_dst1->as_u64[0];
+ ip1->dst_address.as_u64[1] = new_dst1->as_u64[1];
+
+ if (PREDICT_TRUE (sr1->segments_left > 0))
+ {
+ sr1->segments_left -= 1;
+ goto do_trace1;
+ }
+ }
+
+ /* End of the path. Clean up the SR header, or not */
+ if (sr1->flags & clib_host_to_net_u16(IP6_SR_HEADER_FLAG_CLEANUP))
+ {
+ u64 *copy_dst1, *copy_src1;
+ u16 new_l1;
+ /*
+ * Copy the ip6 header right by the (real) length of the
+ * sr header. Here's another place which assumes that
+	       * the sr header is the only extension header.
+ */
+
+ ip1->protocol = sr1->protocol;
+ vlib_buffer_advance (b1, (sr1->length+1)*8);
+
+ new_l1 = clib_net_to_host_u16(ip1->payload_length) -
+ (sr1->length+1)*8;
+ ip1->payload_length = clib_host_to_net_u16(new_l1);
+
+ copy_src1 = (u64 *)ip1;
+ copy_dst1 = copy_src1 + (sr1->length + 1);
+
+ copy_dst1 [4] = copy_src1[4];
+ copy_dst1 [3] = copy_src1[3];
+ copy_dst1 [2] = copy_src1[2];
+ copy_dst1 [1] = copy_src1[1];
+ copy_dst1 [0] = copy_src1[0];
+
+ sr1 = 0;
+ }
+
+ do_trace1:
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_local_trace_t *tr = vlib_add_trace (vm, node,
+ b1, sizeof (*tr));
+ memcpy (tr->src.as_u8, ip1->src_address.as_u8,
+ sizeof (tr->src.as_u8));
+ memcpy (tr->dst.as_u8, ip1->dst_address.as_u8,
+ sizeof (tr->dst.as_u8));
+ tr->length = vlib_buffer_length_in_chain (vm, b1);
+ tr->next_index = next1;
+ tr->sr_valid = sr1 != 0;
+ if (tr->sr_valid)
+ memcpy (tr->sr, sr1, sizeof (tr->sr));
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ ip6_header_t * ip0;
+ ip6_sr_header_t * sr0;
+ ip6_address_t * new_dst0;
+ u32 next0 = SR_LOCAL_NEXT_IP6_LOOKUP;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ ip0 = vlib_buffer_get_current (b0);
+ sr0 = (ip6_sr_header_t *)(ip0+1);
+
+ if (PREDICT_FALSE(sr0->type != ROUTING_HEADER_TYPE_SR))
+ {
+ next0 = SR_LOCAL_NEXT_ERROR;
+ b0->error = node->errors[SR_LOCAL_ERROR_BAD_ROUTING_HEADER_TYPE];
+ goto do_trace;
+ }
+
+ /* Out of segments? Turf the packet */
+ if (PREDICT_FALSE (sr0->segments_left == 0))
+ {
+ next0 = SR_LOCAL_NEXT_ERROR;
+ b0->error = node->errors[SR_LOCAL_ERROR_NO_MORE_SEGMENTS];
+ goto do_trace;
+ }
+
+ if (PREDICT_FALSE(sm->validate_hmac))
+ {
+ if (sr_validate_hmac (sm, ip0, sr0))
+ {
+ next0 = SR_LOCAL_NEXT_ERROR;
+ b0->error = node->errors[SR_LOCAL_ERROR_HMAC_INVALID];
+ goto do_trace;
+ }
+ }
+
+ next0 = sr_local_cb ? sr_local_cb (vm, node, b0, ip0, sr0) :
+ next0;
+
+ /*
+ * To suppress rewrite, return ~SR_LOCAL_NEXT_xxx
+ */
+ if (PREDICT_FALSE (next0 & 0x80000000))
+ {
+ next0 ^= 0xFFFFFFFF;
+ if (PREDICT_FALSE(next0 == SR_LOCAL_NEXT_ERROR))
+ b0->error =
+ node->errors[SR_LOCAL_ERROR_APP_CALLBACK];
+ }
+ else
+ {
+ u32 segment_index0;
+
+ segment_index0 = sr0->segments_left - 1;
+
+ /* Rewrite the packet */
+ new_dst0 = (ip6_address_t *)(sr0->segments + segment_index0);
+ ip0->dst_address.as_u64[0] = new_dst0->as_u64[0];
+ ip0->dst_address.as_u64[1] = new_dst0->as_u64[1];
+
+ if (PREDICT_TRUE (sr0->segments_left > 0))
+ {
+ sr0->segments_left -= 1;
+ goto do_trace;
+ }
+ }
+
+ /* End of the path. Clean up the SR header, or not */
+ if (sr0->flags & clib_host_to_net_u16(IP6_SR_HEADER_FLAG_CLEANUP))
+ {
+ u64 *copy_dst0, *copy_src0;
+ u16 new_l0;
+ /*
+ * Copy the ip6 header right by the (real) length of the
+ * sr header. Here's another place which assumes that
+	       * the sr header is the only extension header.
+ */
+
+ ip0->protocol = sr0->protocol;
+ vlib_buffer_advance (b0, (sr0->length+1)*8);
+
+ new_l0 = clib_net_to_host_u16(ip0->payload_length) -
+ (sr0->length+1)*8;
+ ip0->payload_length = clib_host_to_net_u16(new_l0);
+
+ copy_src0 = (u64 *)ip0;
+ copy_dst0 = copy_src0 + (sr0->length + 1);
+
+ copy_dst0 [4] = copy_src0[4];
+ copy_dst0 [3] = copy_src0[3];
+ copy_dst0 [2] = copy_src0[2];
+ copy_dst0 [1] = copy_src0[1];
+ copy_dst0 [0] = copy_src0[0];
+
+ sr0 = 0;
+ }
+
+ do_trace:
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ sr_local_trace_t *tr = vlib_add_trace (vm, node,
+ b0, sizeof (*tr));
+ memcpy (tr->src.as_u8, ip0->src_address.as_u8,
+ sizeof (tr->src.as_u8));
+ memcpy (tr->dst.as_u8, ip0->dst_address.as_u8,
+ sizeof (tr->dst.as_u8));
+ tr->length = vlib_buffer_length_in_chain (vm, b0);
+ tr->next_index = next0;
+ tr->sr_valid = sr0 != 0;
+ if (tr->sr_valid)
+ memcpy (tr->sr, sr0, sizeof (tr->sr));
+ }
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+ vlib_node_increment_counter (vm, sr_local_node.index,
+ SR_LOCAL_ERROR_PKTS_PROCESSED,
+ from_frame->n_vectors);
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (sr_local_node, static) = {
+ .function = sr_local,
+ .name = "sr-local",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+ .format_trace = format_sr_local_trace,
+
+ .runtime_data_bytes = 0,
+
+ .n_errors = SR_LOCAL_N_ERROR,
+ .error_strings = sr_local_error_strings,
+
+ .n_next_nodes = SR_LOCAL_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [SR_LOCAL_NEXT_##s] = n,
+ foreach_sr_local_next
+#undef _
+ },
+};
+
+ip6_sr_main_t * sr_get_main (vlib_main_t * vm)
+{
+ vlib_call_init_function (vm, sr_init);
+ ASSERT(sr_local_node.index);
+ return &sr_main;
+}
+
+
+static clib_error_t *
+set_ip6_sr_rewrite_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_address_t a;
+ ip6_main_t * im = &ip6_main;
+ ip_lookup_main_t * lm = &im->lookup_main;
+ u32 fib_index = 0;
+ u32 fib_id = 0;
+ u32 adj_index;
+ uword * p;
+ ip_adjacency_t * adj;
+ vnet_hw_interface_t * hi;
+ u32 sw_if_index;
+ ip6_sr_main_t * sm = &sr_main;
+ vnet_main_t * vnm = vnet_get_main();
+
+ if (!unformat (input, "%U", unformat_ip6_address, &a))
+ return clib_error_return (0, "ip6 address missing in '%U'",
+ format_unformat_error, input);
+
+ if (unformat (input, "rx-table-id %d", &fib_id))
+ {
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (p == 0)
+	return clib_error_return (0, "fib-id %d not found", fib_id);
+ fib_index = p[0];
+ }
+
+ adj_index = ip6_fib_lookup_with_table (im, fib_index, &a);
+
+ if (adj_index == lm->miss_adj_index)
+ return clib_error_return (0, "no match for %U",
+ format_ip6_address, &a);
+
+ adj = ip_get_adjacency (lm, adj_index);
+
+ if (adj->lookup_next_index != IP_LOOKUP_NEXT_REWRITE)
+ return clib_error_return (0, "%U unresolved (not a rewrite adj)",
+ format_ip6_address, &a);
+
+ adj->rewrite_header.next_index = sm->ip6_rewrite_sr_next_index;
+
+ sw_if_index = adj->rewrite_header.sw_if_index;
+ hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ adj->rewrite_header.node_index = sr_fix_dst_addr_node.index;
+
+ /* $$$$$ hack... steal the mcast group index */
+ adj->mcast_group_index =
+ vlib_node_add_next (vm, sr_fix_dst_addr_node.index, hi->output_node_index);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (set_ip6_sr_rewrite, static) = {
+ .path = "set ip6 sr rewrite",
+  .short_help = "set ip6 sr rewrite <ip6-address> [rx-table-id <id>]",
+ .function = set_ip6_sr_rewrite_fn,
+};
+
+void vnet_register_sr_app_callback (void *cb)
+{
+ ip6_sr_main_t * sm = &sr_main;
+
+ sm->sr_local_cb = cb;
+}
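+/*
+ * The callback must match the signature used by sr_rewrite / sr_local:
+ *
+ *   u32 cb (vlib_main_t * vm, vlib_node_runtime_t * node,
+ *           vlib_buffer_t * b, ip6_header_t * ip, ip6_sr_header_t * sr);
+ *
+ * Return the desired next index; return its one's complement
+ * (~next_index) to suppress the default dst-address rewrite.
+ */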
+
+static clib_error_t *
+test_sr_hmac_validate_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_sr_main_t * sm = &sr_main;
+
+ if (unformat (input, "validate on"))
+ sm->validate_hmac = 1;
+  else if (unformat (input, "validate off"))
+ sm->validate_hmac = 0;
+ else
+ return clib_error_return (0, "expected validate on|off in '%U'",
+ format_unformat_error, input);
+
+ vlib_cli_output (vm, "hmac signature validation %s",
+ sm->validate_hmac ?
+ "on" : "off");
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_sr_hmac_validate, static) = {
+ .path = "test sr hmac",
+ .short_help = "test sr hmac validate [on|off]",
+ .function = test_sr_hmac_validate_fn,
+};
+
+i32 sr_hmac_add_del_key (ip6_sr_main_t * sm, u32 key_id, u8 * shared_secret,
+ u8 is_del)
+{
+ u32 index;
+ ip6_sr_hmac_key_t * key;
+
+ if (is_del == 0)
+ {
+ /* Specific key in use? Fail. */
+ if (key_id && vec_len (sm->hmac_keys) > key_id
+ && sm->hmac_keys[key_id].shared_secret)
+ return -2;
+
+ index = key_id;
+ key = find_or_add_shared_secret (sm, shared_secret, &index);
+ ASSERT(index == key_id);
+ return 0;
+ }
+
+ /* delete */
+
+ if (key_id) /* delete by key ID */
+ {
+ if (vec_len (sm->hmac_keys) <= key_id)
+ return -3;
+
+ key = sm->hmac_keys + key_id;
+
+ hash_unset_mem (sm->hmac_key_by_shared_secret, key->shared_secret);
+ vec_free (key->shared_secret);
+ return 0;
+ }
+
+ index = 0;
+ key = find_or_add_shared_secret (sm, shared_secret, &index);
+ hash_unset_mem (sm->hmac_key_by_shared_secret, key->shared_secret);
+ vec_free (key->shared_secret);
+ return 0;
+}
+
+
+static clib_error_t *
+sr_hmac_add_del_key_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_sr_main_t * sm = &sr_main;
+ u8 is_del = 0;
+ u32 key_id = 0;
+ u8 key_id_set = 0;
+ u8 * shared_secret = 0;
+ i32 rv;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "del"))
+ is_del = 1;
+ else if (unformat (input, "id %d", &key_id))
+ key_id_set = 1;
+ else if (unformat (input, "key %s", &shared_secret))
+ {
+ /* Do not include the trailing NULL byte. Guaranteed interop issue */
+ _vec_len (shared_secret) -= 1;
+ }
+ else
+ break;
+ }
+
+ if (is_del == 0 && shared_secret == 0)
+ return clib_error_return (0, "shared secret must be set to add a key");
+
+ if (shared_secret == 0 && key_id_set == 0)
+ return clib_error_return (0, "shared secret and key id both unset");
+
+ rv = sr_hmac_add_del_key (sm, key_id, shared_secret, is_del);
+
+ vec_free (shared_secret);
+
+ switch (rv)
+ {
+ case 0:
+ break;
+
+ default:
+ return clib_error_return (0, "sr_hmac_add_del_key returned %d",
+ rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (sr_hmac, static) = {
+ .path = "sr hmac",
+ .short_help = "sr hmac [del] id <nn> key <str>",
+ .function = sr_hmac_add_del_key_fn,
+};
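+/*
+ * Example (hypothetical key):
+ *   sr hmac id 13 key my-shared-secret
+ *   sr hmac del id 13
+ */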
+
+
+static clib_error_t *
+show_sr_hmac_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_sr_main_t * sm = &sr_main;
+ int i;
+
+ for (i = 1; i < vec_len (sm->hmac_keys); i++)
+ {
+ if (sm->hmac_keys[i].shared_secret)
+ vlib_cli_output (vm, "[%d]: %v", i, sm->hmac_keys[i].shared_secret);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_sr_hmac, static) = {
+ .path = "show sr hmac",
+ .short_help = "show sr hmac",
+ .function = show_sr_hmac_fn,
+};
+
+static clib_error_t *
+test_sr_debug_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ ip6_sr_main_t * sm = &sr_main;
+
+ if (unformat (input, "on"))
+ sm->is_debug = 1;
+ else if (unformat (input, "off"))
+ sm->is_debug = 0;
+ else
+ return clib_error_return (0, "expected on|off in '%U'",
+ format_unformat_error, input);
+
+ vlib_cli_output (vm, "debug trace now %s", sm->is_debug ? "on" : "off");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (test_sr_debug, static) = {
+ .path = "test sr debug",
+ .short_help = "test sr debug on|off",
+ .function = test_sr_debug_fn,
+};
diff --git a/vnet/vnet/sr/sr.h b/vnet/vnet/sr/sr.h
new file mode 100644
index 00000000000..3c6d981d17e
--- /dev/null
+++ b/vnet/vnet/sr/sr.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_sr_h
+#define included_vnet_sr_h
+
+#include <vnet/vnet.h>
+#include <vnet/sr/sr_packet.h>
+#include <vnet/ip/ip6_packet.h>
+
+#include <openssl/opensslconf.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <openssl/crypto.h>
+#include <openssl/sha.h>
+#include <openssl/opensslv.h>
+#include <openssl/hmac.h>
+
+typedef struct {
+ ip6_address_t src;
+ ip6_address_t dst;
+} ip6_sr_tunnel_key_t;
+
+typedef struct {
+ /* src, dst address */
+ ip6_sr_tunnel_key_t key;
+
+ /* mask width for FIB entry */
+ u32 dst_mask_width;
+
+ /* first hop, to save 1 elt in the segment list */
+ ip6_address_t first_hop;
+
+ /* Fib indices */
+ u32 rx_fib_index;
+ u32 tx_fib_index;
+
+ /* The actual ip6 sr header */
+ u8 * rewrite;
+} ip6_sr_tunnel_t;
+
+typedef struct {
+ u8 * shared_secret;
+} ip6_sr_hmac_key_t;
+
+typedef struct {
+ /* Key (header imposition case) */
+ ip6_address_t *src_address;
+ ip6_address_t *dst_address;
+ u32 dst_mask_width;
+ u32 rx_table_id;
+ u32 tx_table_id;
+
+ /* segment list, when inserting an ip6 SR header*/
+ ip6_address_t *segments;
+
+ /*
+ * "Tag" list, aka segments inserted at the end of the list,
+ * past last_seg
+ */
+ ip6_address_t *tags;
+
+ /* Shared secret => generate SHA-256 HMAC security fields */
+ u8 * shared_secret;
+
+ /* Flags, e.g. cleanup, policy-list flags */
+ u16 flags_net_byte_order;
+
+ /* Delete the tunnel? */
+ u8 is_del;
+} ip6_sr_add_del_tunnel_args_t;
+
+typedef struct {
+ /* pool of tunnel instances, sr entry only */
+ ip6_sr_tunnel_t *tunnels;
+
+ /* find an sr "tunnel" by its outer-IP src/dst */
+ uword * tunnel_index_by_key;
+
+ /* ip6-lookup next index for imposition FIB entries */
+ u32 ip6_lookup_sr_next_index;
+
+ /* hmac key id by shared secret */
+ uword * hmac_key_by_shared_secret;
+
+ /* ip6-rewrite next index for reinstalling the original dst address */
+ u32 ip6_rewrite_sr_next_index;
+
+ /* application API callback */
+ void *sr_local_cb;
+
+ /* validate hmac keys */
+ u8 validate_hmac;
+
+ /* pool of hmac keys */
+ ip6_sr_hmac_key_t * hmac_keys;
+
+ /* OpenSSL variables */
+ EVP_MD * md;
+ HMAC_CTX * hmac_ctx;
+
+ /* enable debug spew */
+ u8 is_debug;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} ip6_sr_main_t;
+
+ip6_sr_main_t sr_main;
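+
+/*
+ * Illustrative lookup against the main struct above (a sketch, not code
+ * used elsewhere; assumes the hash is keyed by the shared-secret vector):
+ *
+ * ip6_sr_main_t * sm = &sr_main;
+ * uword * p = hash_get_mem (sm->hmac_key_by_shared_secret, secret);
+ * ip6_sr_hmac_key_t * k = p ? sm->hmac_keys + p[0] : 0;
+ */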
+
+format_function_t format_ip6_sr_header;
+format_function_t format_ip6_sr_header_with_length;
+
+vlib_node_registration_t ip6_sr_input_node;
+
+int ip6_sr_add_del_tunnel (ip6_sr_add_del_tunnel_args_t * a);
+void vnet_register_sr_app_callback (void *cb);
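+
+/*
+ * Minimal add-tunnel sketch using the args struct above (illustrative;
+ * src, dst and seg0 are assumed ip6_address_t variables):
+ *
+ * ip6_sr_add_del_tunnel_args_t a;
+ * int rv;
+ * memset (&a, 0, sizeof (a));
+ * a.src_address = &src;
+ * a.dst_address = &dst;
+ * a.dst_mask_width = 128;
+ * vec_add1 (a.segments, seg0);
+ * a.is_del = 0;
+ * rv = ip6_sr_add_del_tunnel (&a);
+ */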
+
+#endif /* included_vnet_sr_h */
diff --git a/vnet/vnet/sr/sr_error.def b/vnet/vnet/sr/sr_error.def
new file mode 100644
index 00000000000..62d021fd47b
--- /dev/null
+++ b/vnet/vnet/sr/sr_error.def
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+sr_error (NONE, "no error")
+sr_error (BAD_ROUTING_HEADER_TYPE, "bad routing header type (not 4)")
+sr_error (NO_MORE_SEGMENTS, "out of SR segment drops")
+sr_error (PKTS_PROCESSED, "SR packets processed")
+sr_error (APP_CALLBACK, "SR application callback errors")
+sr_error (HMAC_INVALID, "SR packets with invalid HMAC signatures")
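+
+/*
+ * Illustrative expansion, following the usual .def convention in this
+ * tree (the actual consumer lives in the SR node code):
+ *
+ * typedef enum {
+ * #define sr_error(n,s) SR_ERROR_##n,
+ * #include <vnet/sr/sr_error.def>
+ * #undef sr_error
+ * SR_N_ERROR,
+ * } sr_error_t;
+ */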
diff --git a/vnet/vnet/sr/sr_fix_dst_error.def b/vnet/vnet/sr/sr_fix_dst_error.def
new file mode 100644
index 00000000000..48fe7af6c98
--- /dev/null
+++ b/vnet/vnet/sr/sr_fix_dst_error.def
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+sr_fix_dst_error (NONE, "no error")
+sr_fix_dst_error (NO_SR_HEADER, "no SR header present")
+sr_fix_dst_error (NO_MORE_SEGMENTS, "no more SR segments")
diff --git a/vnet/vnet/sr/sr_packet.h b/vnet/vnet/sr/sr_packet.h
new file mode 100644
index 00000000000..5604a8dadb7
--- /dev/null
+++ b/vnet/vnet/sr/sr_packet.h
@@ -0,0 +1,227 @@
+#ifndef included_vnet_sr_packet_h
+#define included_vnet_sr_packet_h
+
+#include <vnet/ip/ip.h>
+
+/*
+ * ipv6 segment-routing header format
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * The Segment Routing Header (SRH) is defined as follows:
+ *
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Next Header | Hdr Ext Len | Routing Type | Segments Left |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | First Segment | Flags | HMAC Key ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | Segment List[0] (128 bits ipv6 address) |
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | |
+ * ...
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | Segment List[n] (128 bits ipv6 address) |
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | Policy List[0] (optional) |
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | Policy List[1] (optional) |
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | Policy List[2] (optional) |
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | |
+ * | |
+ * | HMAC (256 bits) |
+ * | (optional) |
+ * | |
+ * | |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * where:
+ *
+ * o Next Header: 8-bit selector. Identifies the type of header
+ * immediately following the SRH.
+ *
+ * o Hdr Ext Len: 8-bit unsigned integer; the length of the SRH
+ * header in 8-octet units, not including the first 8 octets.
+ *
+ * o Routing Type: TBD, to be assigned by IANA (suggested value: 4).
+ *
+ * o Segments Left. Defined in [RFC2460], it contains the index, in
+ * the Segment List, of the next segment to inspect. Segments Left
+ * is decremented at each segment and it is used as an index in the
+ * segment list.
+ *
+ * o First Segment: offset in the SRH, not including the first 8 octets
+ * and expressed in 16-octet units, pointing to the last element of
+ * the segment list, which is in fact the first segment of the
+ * segment routing path.
+ *
+ * o Flags: 16 bits of flags. The following flags are defined:
+ *
+ * 1
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |C|P|R|R| Policy Flags |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * C-flag: Clean-up flag. Set when the SRH has to be removed from
+ * the packet when the packet reaches the last segment.
+ *
+ * P-flag: Protected flag. Set when the packet has been rerouted
+ * through an FRR mechanism by an SR endpoint node. See Section 6.3
+ * for more details.
+ *
+ * R-flags. Reserved and for future use.
+ *
+ * Policy Flags. Define the type of the IPv6 addresses encoded
+ * into the Policy List (see below). The following have been
+ * defined:
+ *
+ * Bits 4-6: determine the type of the first element after the
+ * segment list.
+ *
+ * Bits 7-9: determine the type of the second element.
+ *
+ * Bits 10-12: determine the type of the third element.
+ *
+ * Bits 13-15: determine the type of the fourth element.
+ *
+ * The following values are used for the type:
+ *
+ * 0x0: Not present. If value is set to 0x0, it means the
+ * element represented by these bits is not present.
+ *
+ * 0x1: SR Ingress.
+ *
+ * 0x2: SR Egress.
+ *
+ * 0x3: Original Source Address.
+ *
+ * o HMAC Key ID and HMAC field, and their use are defined in
+ * [I-D.vyncke-6man-segment-routing-security].
+ *
+ * o Segment List[n]: 128 bit IPv6 addresses representing the nth
+ * segment in the Segment List. The Segment List is encoded starting
+ * from the last segment of the path. I.e., the first element of the
+ * segment list (Segment List [0]) contains the last segment of the
+ * path while the last segment of the Segment List (Segment List[n])
+ * contains the first segment of the path. The index contained in
+ * "Segments Left" identifies the current active segment.
+ *
+ * o Policy List. Optional addresses representing specific nodes in
+ * the SR path such as:
+ *
+ * SR Ingress: a 128 bit generic identifier representing the
+ * ingress in the SR domain (i.e., it need not be a valid IPv6
+ * address).
+ *
+ * SR Egress: a 128 bit generic identifier representing the egress
+ * in the SR domain (i.e., it need not be a valid IPv6
+ * address).
+ *
+ * Original Source Address: IPv6 address originally present in the
+ * SA field of the packet.
+ *
+ * The segments in the Policy List are encoded after the segment list
+ * and they are optional. If none are in the SRH, all bits of the
+ * Policy List Flags MUST be set to 0x0.
+ */
+
+#define ROUTING_HEADER_TYPE_SR 4
+
+typedef struct {
+ /* Protocol for next header. */
+ u8 protocol;
+ /*
+ * Length of routing header in 8 octet units,
+ * not including the first 8 octets
+ */
+ u8 length;
+
+ /* Type of routing header; type 4 = segment routing */
+ u8 type;
+
+ /* Next segment in the segment list */
+ u8 segments_left;
+
+ /*
+ * Offset in the SRH - in 16-octet units, not including the first
+ * 8 octets - of the last segment-list element, i.e. the first
+ * segment of the path; the policy list, if present, follows it.
+ */
+ u8 first_segment;
+
+ /* Flag bits */
+#define IP6_SR_HEADER_FLAG_CLEANUP (0x8000)
+#define IP6_SR_HEADER_FLAG_PROTECTED (0x4000)
+#define IP6_SR_HEADER_FLAG_RESERVED (0x3000)
+
+#define IP6_SR_HEADER_FLAG_PL_ELT_NOT_PRESENT (0x0)
+#define IP6_SR_HEADER_FLAG_PL_ELT_INGRESS_PE (0x1)
+#define IP6_SR_HEADER_FLAG_PL_ELT_EGRESS_PE (0x2)
+#define IP6_SR_HEADER_FLAG_PL_ELT_ORIG_SRC_ADDR (0x3)
+ /* values 0x4 - 0x7 are reserved */
+ u16 flags;
+ u8 hmac_key;
+
+ /* The segment + policy list elts */
+ ip6_address_t segments[0];
+} __attribute__((packed)) ip6_sr_header_t;
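+
+/*
+ * Size arithmetic implied by the fields above: the total SRH length is
+ * 8 * (length + 1) bytes, and the segment list holds first_segment + 1
+ * addresses, segments[0] being the last segment of the path.
+ */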
+
+/* Shift, from the LSB of the host-byte-order flags, of the 3-bit
+ policy-list type field selected by one-origined pl_index. */
+static inline int
+ip6_sr_policy_list_shift_from_index (int pl_index)
+{
+ return (-3 * pl_index) + 12;
+}
+
+/* pl_index is one-origined, to match the text above */
+static inline int
+ip6_sr_policy_list_flags (u16 flags_host_byte_order, int pl_index)
+{
+ int shift;
+
+ if (pl_index <= 0 || pl_index > 4)
+ return 0;
+
+ shift = ip6_sr_policy_list_shift_from_index (pl_index);
+ flags_host_byte_order >>= shift;
+
+ return (flags_host_byte_order & 7);
+}
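+
+/*
+ * Illustrative check, per the flag layout above: with the first
+ * policy-list element typed as SR Ingress (bits 4-6, i.e. shift 9),
+ *
+ * u16 flags = IP6_SR_HEADER_FLAG_PL_ELT_INGRESS_PE << 9;
+ * ASSERT (ip6_sr_policy_list_flags (flags, 1)
+ * == IP6_SR_HEADER_FLAG_PL_ELT_INGRESS_PE);
+ */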
+
+#endif /* included_vnet_sr_packet_h */
diff --git a/vnet/vnet/srp/format.c b/vnet/vnet/srp/format.c
new file mode 100644
index 00000000000..a0250cc976f
--- /dev/null
+++ b/vnet/vnet/srp/format.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * srp_format.c: srp formatting/parsing.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/srp/srp.h>
+#include <vnet/ethernet/ethernet.h>
+
+static u8 * format_srp_mode (u8 * s, va_list * args)
+{
+ u32 mode = va_arg (*args, u32);
+ char * t = 0;
+ switch (mode)
+ {
+#define _(f) case SRP_MODE_##f: t = #f; break;
+ foreach_srp_mode
+#undef _
+ default: t = 0; break;
+ }
+ if (t)
+ s = format (s, "%s", t);
+ else
+ s = format (s, "unknown 0x%x", mode);
+
+ return s;
+}
+
+u8 * format_srp_header_with_length (u8 * s, va_list * args)
+{
+ srp_and_ethernet_header_t * h = va_arg (*args, srp_and_ethernet_header_t *);
+ u32 max_header_bytes = va_arg (*args, u32);
+ ethernet_main_t * em = &ethernet_main;
+ uword indent, header_bytes;
+
+ header_bytes = sizeof (h[0]);
+ if (max_header_bytes != 0 && header_bytes > max_header_bytes)
+ return format (s, "srp header truncated");
+
+ indent = format_get_indent (s);
+
+ s = format (s, "mode %U, ring %s, priority %d, ttl %d",
+ format_srp_mode, h->srp.mode,
+ h->srp.is_inner_ring ? "inner" : "outer",
+ h->srp.priority, h->srp.ttl);
+
+ s = format (s, "\n%U%U: %U -> %U",
+ format_white_space, indent,
+ format_ethernet_type, clib_net_to_host_u16 (h->ethernet.type),
+ format_ethernet_address, h->ethernet.src_address,
+ format_ethernet_address, h->ethernet.dst_address);
+
+ if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+ {
+ ethernet_type_info_t * ti;
+ vlib_node_t * node;
+
+ ti = ethernet_get_type_info (em, h->ethernet.type);
+ node = ti ? vlib_get_node (em->vlib_main, ti->node_index) : 0;
+ if (node && node->format_buffer)
+ s = format (s, "\n%U%U",
+ format_white_space, indent,
+ node->format_buffer, (void *) h + header_bytes,
+ max_header_bytes - header_bytes);
+ }
+
+ return s;
+}
+
+u8 * format_srp_header (u8 * s, va_list * args)
+{
+ srp_header_t * m = va_arg (*args, srp_header_t *);
+ return format (s, "%U", format_srp_header_with_length, m, 0);
+}
+
+uword
+unformat_srp_header (unformat_input_t * input, va_list * args)
+{
+ u8 ** result = va_arg (*args, u8 **);
+ srp_and_ethernet_header_t * h;
+
+ {
+ void * p;
+ vec_add2 (*result, p, sizeof (h[0]));
+ h = p;
+ }
+
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_ethernet_type_net_byte_order, &h->ethernet.type,
+ unformat_ethernet_address, &h->ethernet.src_address,
+ unformat_ethernet_address, &h->ethernet.dst_address))
+ return 0;
+
+ h->srp.mode = SRP_MODE_data;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ u32 x;
+
+ if (unformat (input, "control"))
+ h->srp.mode = SRP_MODE_control_pass_to_host;
+
+ else if (unformat (input, "pri %d", &x))
+ h->srp.priority = x;
+
+ else if (unformat (input, "ttl %d", &x))
+ h->srp.ttl = x;
+
+ else
+ return 0;
+ }
+
+ return 1;
+}
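+
+/*
+ * Input shape accepted above (from the unformat calls):
+ * "<ethernet-type>: <src-mac> -> <dst-mac> [control] [pri <n>] [ttl <n>]"
+ */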
diff --git a/vnet/vnet/srp/interface.c b/vnet/vnet/srp/interface.c
new file mode 100644
index 00000000000..4c14b747751
--- /dev/null
+++ b/vnet/vnet/srp/interface.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * srp_interface.c: srp interfaces
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vnet/srp/srp.h>
+
+static uword srp_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ srp_main_t * sm = &srp_main;
+ srp_and_ethernet_header_t * h = rewrite;
+ u16 type;
+ uword n_bytes = sizeof (h[0]);
+
+ if (n_bytes > max_rewrite_bytes)
+ return 0;
+
+ switch (l3_type) {
+#define _(a,b) case VNET_L3_PACKET_TYPE_##a: type = ETHERNET_TYPE_##b; break
+ _ (IP4, IP4);
+ _ (IP6, IP6);
+ _ (MPLS_UNICAST, MPLS_UNICAST);
+ _ (MPLS_MULTICAST, MPLS_MULTICAST);
+ _ (ARP, ARP);
+#undef _
+ default:
+ return 0;
+ }
+
+ memcpy (h->ethernet.src_address, hw->hw_address, sizeof (h->ethernet.src_address));
+ if (dst_address)
+ memcpy (h->ethernet.dst_address, dst_address, sizeof (h->ethernet.dst_address));
+ else
+ memset (h->ethernet.dst_address, ~0, sizeof (h->ethernet.dst_address)); /* broadcast */
+
+ h->ethernet.type = clib_host_to_net_u16 (type);
+
+ h->srp.as_u16 = 0;
+ h->srp.mode = SRP_MODE_data;
+ h->srp.ttl = sm->default_data_ttl;
+ srp_header_compute_parity (&h->srp);
+
+ return n_bytes;
+}
+
+static void srp_register_interface_helper (u32 * hw_if_indices_by_side, u32 redistribute);
+
+void serialize_srp_main (serialize_main_t * m, va_list * va)
+{
+ srp_main_t * sm = &srp_main;
+ srp_interface_t * si;
+
+ serialize_integer (m, pool_elts (sm->interface_pool), sizeof (u32));
+ pool_foreach (si, sm->interface_pool, ({
+ serialize_integer (m, si->rings[SRP_RING_OUTER].hw_if_index, sizeof (u32));
+ serialize_integer (m, si->rings[SRP_RING_INNER].hw_if_index, sizeof (u32));
+ }));
+}
+
+void unserialize_srp_main (serialize_main_t * m, va_list * va)
+{
+ u32 i, n_ifs, hw_if_indices[SRP_N_RING];
+
+ unserialize_integer (m, &n_ifs, sizeof (u32));
+ for (i = 0; i < n_ifs; i++)
+ {
+ unserialize_integer (m, &hw_if_indices[SRP_RING_OUTER], sizeof (u32));
+ unserialize_integer (m, &hw_if_indices[SRP_RING_INNER], sizeof (u32));
+ srp_register_interface_helper (hw_if_indices, /* redistribute */ 0);
+ }
+}
+
+static void serialize_srp_register_interface_msg (serialize_main_t * m, va_list * va)
+{
+ u32 * hw_if_indices = va_arg (*va, u32 *);
+ serialize_integer (m, hw_if_indices[SRP_SIDE_A], sizeof (hw_if_indices[SRP_SIDE_A]));
+ serialize_integer (m, hw_if_indices[SRP_SIDE_B], sizeof (hw_if_indices[SRP_SIDE_B]));
+}
+
+static void unserialize_srp_register_interface_msg (serialize_main_t * m, va_list * va)
+{
+ CLIB_UNUSED (mc_main_t * mcm) = va_arg (*va, mc_main_t *);
+ u32 hw_if_indices[SRP_N_SIDE];
+ srp_main_t * sm = &srp_main;
+ uword * p;
+
+ unserialize_integer (m, &hw_if_indices[SRP_SIDE_A], sizeof (hw_if_indices[SRP_SIDE_A]));
+ unserialize_integer (m, &hw_if_indices[SRP_SIDE_B], sizeof (hw_if_indices[SRP_SIDE_B]));
+
+ p = hash_get (sm->srp_register_interface_waiting_process_pool_index_by_hw_if_index,
+ hw_if_indices[0]);
+ if (p)
+ {
+ vlib_one_time_waiting_process_t * wp = pool_elt_at_index (sm->srp_register_interface_waiting_process_pool, p[0]);
+ vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
+ pool_put (sm->srp_register_interface_waiting_process_pool, wp);
+ hash_unset (sm->srp_register_interface_waiting_process_pool_index_by_hw_if_index,
+ hw_if_indices[0]);
+ }
+ else
+ srp_register_interface_helper (hw_if_indices, /* redistribute */ 0);
+}
+
+MC_SERIALIZE_MSG (srp_register_interface_msg, static) = {
+ .name = "vnet_srp_register_interface",
+ .serialize = serialize_srp_register_interface_msg,
+ .unserialize = unserialize_srp_register_interface_msg,
+};
+
+static void srp_register_interface_helper (u32 * hw_if_indices_by_side, u32 redistribute)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ srp_main_t * sm = &srp_main;
+ vlib_main_t * vm = sm->vlib_main;
+ srp_interface_t * si;
+ vnet_hw_interface_t * hws[SRP_N_RING];
+ uword s, * p;
+
+ if (vm->mc_main && redistribute)
+ {
+ vlib_one_time_waiting_process_t * wp;
+ mc_serialize (vm->mc_main, &srp_register_interface_msg, hw_if_indices_by_side);
+ pool_get (sm->srp_register_interface_waiting_process_pool, wp);
+ hash_set (sm->srp_register_interface_waiting_process_pool_index_by_hw_if_index,
+ hw_if_indices_by_side[0],
+ wp - sm->srp_register_interface_waiting_process_pool);
+ vlib_current_process_wait_for_one_time_event (vm, wp);
+ }
+
+ /* Check if interface has already been registered. */
+ p = hash_get (sm->interface_index_by_hw_if_index, hw_if_indices_by_side[0]);
+ if (p)
+ {
+ si = pool_elt_at_index (sm->interface_pool, p[0]);
+ }
+ else
+ {
+ pool_get (sm->interface_pool, si);
+ memset (si, 0, sizeof (si[0]));
+ }
+ for (s = 0; s < SRP_N_SIDE; s++)
+ {
+ hws[s] = vnet_get_hw_interface (vnm, hw_if_indices_by_side[s]);
+ si->rings[s].ring = s;
+ si->rings[s].hw_if_index = hw_if_indices_by_side[s];
+ si->rings[s].sw_if_index = hws[s]->sw_if_index;
+ hash_set (sm->interface_index_by_hw_if_index, hw_if_indices_by_side[s], si - sm->interface_pool);
+ }
+
+ /* Inherit MAC address from outer ring. */
+ memcpy (si->my_address, hws[SRP_RING_OUTER]->hw_address,
+ vec_len (hws[SRP_RING_OUTER]->hw_address));
+
+ /* Default time to wait to restore signal. */
+ si->config.wait_to_restore_idle_delay = 60;
+ si->config.ips_tx_interval = 1;
+}
+
+void srp_register_interface (u32 * hw_if_indices_by_side)
+{
+ srp_register_interface_helper (hw_if_indices_by_side, /* redistribute */ 1);
+}
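+
+/*
+ * Illustrative registration (hw_if_index values are assumed; side A is
+ * the outer ring, side B the inner):
+ *
+ * u32 hw_if_indices[SRP_N_SIDE];
+ * hw_if_indices[SRP_SIDE_A] = outer_hw_if_index;
+ * hw_if_indices[SRP_SIDE_B] = inner_hw_if_index;
+ * srp_register_interface (hw_if_indices);
+ */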
+
+void srp_interface_set_hw_wrap_function (u32 hw_if_index, srp_hw_wrap_function_t * f)
+{
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+ si->hw_wrap_function = f;
+}
+
+void srp_interface_set_hw_enable_function (u32 hw_if_index, srp_hw_enable_function_t * f)
+{
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+ si->hw_enable_function = f;
+}
+
+void srp_interface_enable_ips (u32 hw_if_index)
+{
+ srp_main_t * sm = &srp_main;
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+
+ si->ips_process_enable = 1;
+
+ vlib_node_set_state (sm->vlib_main, srp_ips_process_node.index, VLIB_NODE_STATE_POLLING);
+}
+
+static uword
+srp_is_valid_class_for_interface (vnet_main_t * vnm, u32 hw_if_index, u32 hw_class_index)
+{
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+
+ if (! si)
+ return 0;
+
+ /* Both sides must be admin down. */
+ if (vnet_sw_interface_is_admin_up (vnm, si->rings[SRP_RING_OUTER].sw_if_index))
+ return 0;
+ if (vnet_sw_interface_is_admin_up (vnm, si->rings[SRP_RING_INNER].sw_if_index))
+ return 0;
+
+ return 1;
+}
+
+static void
+srp_interface_hw_class_change (vnet_main_t * vnm, u32 hw_if_index,
+ u32 old_hw_class_index, u32 new_hw_class_index)
+{
+ srp_main_t * sm = &srp_main;
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+ vnet_hw_interface_t * hi;
+ vnet_device_class_t * dc;
+ u32 r, to_srp;
+
+ if (! si)
+ {
+ clib_warning ("srp interface not registered (si == 0)");
+ return;
+ }
+
+ to_srp = new_hw_class_index == srp_hw_interface_class.index;
+
+ /* Changing class on either outer or inner rings implies changing the class
+ of the other. */
+ for (r = 0; r < SRP_N_RING; r++)
+ {
+ srp_interface_ring_t * ir = &si->rings[r];
+
+ hi = vnet_get_hw_interface (vnm, ir->hw_if_index);
+ dc = vnet_get_device_class (vnm, hi->dev_class_index);
+
+ /* hw_if_index itself will be handled by caller. */
+ if (ir->hw_if_index != hw_if_index)
+ {
+ vnet_hw_interface_init_for_class (vnm, ir->hw_if_index,
+ new_hw_class_index,
+ to_srp ? si - sm->interface_pool : ~0);
+
+ if (dc->hw_class_change)
+ dc->hw_class_change (vnm, ir->hw_if_index, new_hw_class_index);
+ }
+ else
+ hi->hw_instance = to_srp ? si - sm->interface_pool : ~0;
+ }
+
+ if (si->hw_enable_function)
+ si->hw_enable_function (si, /* enable */ to_srp);
+}
+
+VNET_HW_INTERFACE_CLASS (srp_hw_interface_class) = {
+ .name = "SRP",
+ .format_address = format_ethernet_address,
+ .format_header = format_srp_header_with_length,
+ .format_device = format_srp_device,
+ .unformat_hw_address = unformat_ethernet_address,
+ .unformat_header = unformat_srp_header,
+ .set_rewrite = srp_set_rewrite,
+ .is_valid_class_for_interface = srp_is_valid_class_for_interface,
+ .hw_class_change = srp_interface_hw_class_change,
+};
+
+static void serialize_srp_interface_config_msg (serialize_main_t * m, va_list * va)
+{
+ srp_interface_t * si = va_arg (*va, srp_interface_t *);
+ srp_main_t * sm = &srp_main;
+
+ ASSERT (! pool_is_free (sm->interface_pool, si));
+ serialize_integer (m, si - sm->interface_pool, sizeof (u32));
+ serialize (m, serialize_f64, si->config.wait_to_restore_idle_delay);
+ serialize (m, serialize_f64, si->config.ips_tx_interval);
+}
+
+static void unserialize_srp_interface_config_msg (serialize_main_t * m, va_list * va)
+{
+ CLIB_UNUSED (mc_main_t * mcm) = va_arg (*va, mc_main_t *);
+ srp_main_t * sm = &srp_main;
+ srp_interface_t * si;
+ u32 si_index;
+
+ unserialize_integer (m, &si_index, sizeof (u32));
+ si = pool_elt_at_index (sm->interface_pool, si_index);
+ unserialize (m, unserialize_f64, &si->config.wait_to_restore_idle_delay);
+ unserialize (m, unserialize_f64, &si->config.ips_tx_interval);
+}
+
+MC_SERIALIZE_MSG (srp_interface_config_msg, static) = {
+ .name = "vnet_srp_interface_config",
+ .serialize = serialize_srp_interface_config_msg,
+ .unserialize = unserialize_srp_interface_config_msg,
+};
+
+void srp_interface_get_interface_config (u32 hw_if_index, srp_interface_config_t * c)
+{
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+ ASSERT (si != 0);
+ c[0] = si->config;
+}
+
+void srp_interface_set_interface_config (u32 hw_if_index, srp_interface_config_t * c)
+{
+ srp_main_t * sm = &srp_main;
+ vlib_main_t * vm = sm->vlib_main;
+ srp_interface_t * si = srp_get_interface_from_vnet_hw_interface (hw_if_index);
+ ASSERT (si != 0);
+ if (memcmp (&si->config, &c[0], sizeof (c[0])))
+ {
+ si->config = c[0];
+ if (vm->mc_main)
+ mc_serialize (vm->mc_main, &srp_interface_config_msg, si);
+ }
+}
+
+#if DEBUG > 0
+
+#define VNET_SIMULATED_SRP_TX_NEXT_SRP_INPUT VNET_INTERFACE_TX_N_NEXT
+
+/* Echo packets back to srp input. */
+static uword
+simulated_srp_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, n_left_to_next, n_copy, * from, * to_next;
+ u32 next_index = VNET_SIMULATED_SRP_TX_NEXT_SRP_INPUT;
+ u32 i;
+ vlib_buffer_t * b;
+
+ n_left_from = frame->n_vectors;
+ from = vlib_frame_args (frame);
+
+ while (n_left_from > 0)
+ {
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ n_copy = clib_min (n_left_from, n_left_to_next);
+
+ memcpy (to_next, from, n_copy * sizeof (from[0]));
+ n_left_to_next -= n_copy;
+ n_left_from -= n_copy;
+ for (i = 0; i < n_copy; i++)
+ {
+ b = vlib_get_buffer (vm, from[i]);
+ /* TX interface will be fake eth; copy to RX for benefit of srp-input. */
+ b->sw_if_index[VLIB_RX] = b->sw_if_index[VLIB_TX];
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return n_left_from;
+}
+
+static u8 * format_simulated_srp_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "fake-srp%d", dev_instance);
+}
+
+VNET_DEVICE_CLASS (srp_simulated_device_class,static) = {
+ .name = "Simulated srp",
+ .format_device_name = format_simulated_srp_name,
+ .tx_function = simulated_srp_interface_tx,
+};
+
+static clib_error_t *
+create_simulated_srp_interfaces (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ u8 address[6];
+ u32 hw_if_index;
+ vnet_hw_interface_t * hi;
+ static u32 instance;
+
+ if (! unformat_user (input, unformat_ethernet_address, &address))
+ {
+ memset (address, 0, sizeof (address));
+ address[0] = 0xde;
+ address[1] = 0xad;
+ address[5] = instance;
+ }
+
+ hw_if_index = vnet_register_interface (vnm,
+ srp_simulated_device_class.index,
+ instance++,
+ srp_hw_interface_class.index, 0);
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+
+ srp_setup_node (vm, hi->output_node_index);
+
+ hi->min_packet_bytes = 40 + 16;
+
+ /* Standard default ethernet MTU. */
+ hi->max_l3_packet_bytes[VLIB_RX] = hi->max_l3_packet_bytes[VLIB_TX] = 1500;
+
+ vec_free (hi->hw_address);
+ vec_add (hi->hw_address, address, sizeof (address));
+
+ {
+ uword slot;
+
+ slot = vlib_node_add_named_next_with_slot
+ (vm, hi->tx_node_index,
+ "srp-input",
+ VNET_SIMULATED_SRP_TX_NEXT_SRP_INPUT);
+ ASSERT (slot == VNET_SIMULATED_SRP_TX_NEXT_SRP_INPUT);
+ }
+
+ return /* no error */ 0;
+}
+
+static VLIB_CLI_COMMAND (create_simulated_srp_interface_command) = {
+ .path = "srp create-interfaces",
+ .short_help = "Create simulated srp interface",
+ .function = create_simulated_srp_interfaces,
+};
+#endif
diff --git a/vnet/vnet/srp/node.c b/vnet/vnet/srp/node.c
new file mode 100644
index 00000000000..a44f108326e
--- /dev/null
+++ b/vnet/vnet/srp/node.c
@@ -0,0 +1,929 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * node.c: srp packet processing
+ *
+ * Copyright (c) 2011 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/ip_packet.h> /* for ip_csum_fold */
+#include <vnet/srp/srp.h>
+
+typedef struct {
+ u8 packet_data[32];
+} srp_input_trace_t;
+
+static u8 * format_srp_input_trace (u8 * s, va_list * va)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+ srp_input_trace_t * t = va_arg (*va, srp_input_trace_t *);
+
+ s = format (s, "%U", format_srp_header, t->packet_data);
+
+ return s;
+}
+
+typedef enum {
+ SRP_INPUT_NEXT_ERROR,
+ SRP_INPUT_NEXT_ETHERNET_INPUT,
+ SRP_INPUT_NEXT_CONTROL,
+ SRP_INPUT_N_NEXT,
+} srp_input_next_t;
+
+typedef struct {
+ u8 next_index;
+ u8 buffer_advance;
+ u16 error;
+} srp_input_disposition_t;
+
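+/* Per-mode dispatch: srp-input indexes this table with the 3-bit SRP
+ mode field to pick the next node, error, and buffer advance. */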
+static srp_input_disposition_t srp_input_disposition_by_mode[8] = {
+ [SRP_MODE_reserved0] = {
+ .next_index = SRP_INPUT_NEXT_ERROR,
+ .error = SRP_ERROR_UNKNOWN_MODE,
+ },
+ [SRP_MODE_reserved1] = {
+ .next_index = SRP_INPUT_NEXT_ERROR,
+ .error = SRP_ERROR_UNKNOWN_MODE,
+ },
+ [SRP_MODE_reserved2] = {
+ .next_index = SRP_INPUT_NEXT_ERROR,
+ .error = SRP_ERROR_UNKNOWN_MODE,
+ },
+ [SRP_MODE_reserved3] = {
+ .next_index = SRP_INPUT_NEXT_ERROR,
+ .error = SRP_ERROR_UNKNOWN_MODE,
+ },
+ [SRP_MODE_keep_alive] = {
+ .next_index = SRP_INPUT_NEXT_ERROR,
+ .error = SRP_ERROR_KEEP_ALIVE_DROPPED,
+ },
+ [SRP_MODE_data] = {
+ .next_index = SRP_INPUT_NEXT_ETHERNET_INPUT,
+ .buffer_advance = sizeof (srp_header_t),
+ },
+ [SRP_MODE_control_pass_to_host] = {
+ .next_index = SRP_INPUT_NEXT_CONTROL,
+ },
+ [SRP_MODE_control_locally_buffered_for_host] = {
+ .next_index = SRP_INPUT_NEXT_CONTROL,
+ },
+};
+
+static uword
+srp_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ srp_main_t * sm = &srp_main;
+ u32 n_left_from, next_index, * from, * to_next;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (srp_input_trace_t));
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1, sw_if_index0, sw_if_index1;
+ vlib_buffer_t * b0, * b1;
+ u8 next0, next1, error0, error1;
+ srp_header_t * s0, * s1;
+ srp_input_disposition_t * d0, * d1;
+ vnet_hw_interface_t * hi0, * hi1;
+ srp_interface_t * si0, * si1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * b2, * b3;
+
+ b2 = vlib_get_buffer (vm, from[2]);
+ b3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (b2, LOAD);
+ vlib_prefetch_buffer_header (b3, LOAD);
+
+ CLIB_PREFETCH (b2->data, sizeof (srp_header_t), LOAD);
+ CLIB_PREFETCH (b3->data, sizeof (srp_header_t), LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ s0 = (void *) (b0->data + b0->current_data);
+ s1 = (void *) (b1->data + b1->current_data);
+
+ /* Data packets are always assigned to side A (outer ring) interface. */
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+ sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];
+
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+ hi1 = vnet_get_sup_hw_interface (vnm, sw_if_index1);
+
+ si0 = pool_elt_at_index (sm->interface_pool, hi0->hw_instance);
+ si1 = pool_elt_at_index (sm->interface_pool, hi1->hw_instance);
+
+ sw_if_index0 = (s0->mode == SRP_MODE_data
+ ? si0->rings[SRP_RING_OUTER].sw_if_index
+ : sw_if_index0);
+ sw_if_index1 = (s1->mode == SRP_MODE_data
+ ? si1->rings[SRP_RING_OUTER].sw_if_index
+ : sw_if_index1);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = sw_if_index0;
+ vnet_buffer (b1)->sw_if_index[VLIB_RX] = sw_if_index1;
+
+ d0 = srp_input_disposition_by_mode + s0->mode;
+ d1 = srp_input_disposition_by_mode + s1->mode;
+
+ next0 = d0->next_index;
+ next1 = d1->next_index;
+
+ error0 = d0->error;
+ error1 = d1->error;
+
+ vlib_buffer_advance (b0, d0->buffer_advance);
+ vlib_buffer_advance (b1, d1->buffer_advance);
+
+ b0->error = node->errors[error0];
+ b1->error = node->errors[error1];
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, sw_if_index0;
+ vlib_buffer_t * b0;
+ u8 next0, error0;
+ srp_header_t * s0;
+ srp_input_disposition_t * d0;
+ srp_interface_t * si0;
+ vnet_hw_interface_t * hi0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ s0 = (void *) (b0->data + b0->current_data);
+
+ /* Data packets are always assigned to side A (outer ring) interface. */
+ sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
+
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ si0 = pool_elt_at_index (sm->interface_pool, hi0->hw_instance);
+
+ sw_if_index0 = (s0->mode == SRP_MODE_data
+ ? si0->rings[SRP_RING_OUTER].sw_if_index
+ : sw_if_index0);
+
+ vnet_buffer (b0)->sw_if_index[VLIB_RX] = sw_if_index0;
+
+ d0 = srp_input_disposition_by_mode + s0->mode;
+
+ next0 = d0->next_index;
+
+ error0 = d0->error;
+
+ vlib_buffer_advance (b0, d0->buffer_advance);
+
+ b0->error = node->errors[error0];
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+static char * srp_error_strings[] = {
+#define _(f,s) s,
+ foreach_srp_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (srp_input_node,static) = {
+ .function = srp_input,
+ .name = "srp-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = SRP_N_ERROR,
+ .error_strings = srp_error_strings,
+
+ .n_next_nodes = SRP_INPUT_N_NEXT,
+ .next_nodes = {
+ [SRP_INPUT_NEXT_ERROR] = "error-drop",
+ [SRP_INPUT_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [SRP_INPUT_NEXT_CONTROL] = "srp-control",
+ },
+
+ .format_buffer = format_srp_header_with_length,
+ .format_trace = format_srp_input_trace,
+ .unformat_buffer = unformat_srp_header,
+};
+
+static uword
+srp_topology_packet (vlib_main_t * vm, u32 sw_if_index, u8 ** contents)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vnet_hw_interface_t * hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ srp_topology_header_t * t;
+ srp_topology_mac_binding_t * mb;
+ u32 nb, nmb;
+
+ t = (void *) *contents;
+
+ nb = clib_net_to_host_u16 (t->n_bytes_of_data_that_follows);
+ nmb = (nb - sizeof (t->originator_address)) / sizeof (mb[0]);
+ if (vec_len (*contents) < sizeof (t[0]) + nmb * sizeof (mb[0]))
+ return SRP_ERROR_TOPOLOGY_BAD_LENGTH;
+
+ /* Fill in our source MAC address. */
+ memcpy (t->ethernet.src_address, hi->hw_address, vec_len (hi->hw_address));
+
+ /* Make space for our MAC binding. */
+ vec_resize (*contents, sizeof (srp_topology_mac_binding_t));
+ t = (void *) *contents;
+ t->n_bytes_of_data_that_follows = clib_host_to_net_u16 (nb + sizeof (mb[0]));
+
+ mb = t->bindings + nmb;
+
+ mb->flags =
+ ((t->srp.is_inner_ring ? SRP_TOPOLOGY_MAC_BINDING_FLAG_IS_INNER_RING : 0)
+ | (/* is wrapped FIXME */ 0));
+ memcpy (mb->address, hi->hw_address, vec_len (hi->hw_address));
+
+ t->control.checksum
+ = ~ip_csum_fold (ip_incremental_checksum (0, &t->control,
+ vec_len (*contents) - STRUCT_OFFSET_OF (srp_generic_control_header_t, control)));
+
+ {
+ vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index);
+ vlib_buffer_t * b;
+ u32 * to_next = vlib_frame_vector_args (f);
+ u32 bi;
+
+ bi = vlib_buffer_add_data (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX,
+ /* buffer to append to */ 0,
+ *contents, vec_len (*contents));
+ b = vlib_get_buffer (vm, bi);
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index;
+ to_next[0] = bi;
+ f->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi->output_node_index, f);
+ }
+
+ return SRP_ERROR_CONTROL_PACKETS_PROCESSED;
+}
+
+typedef uword (srp_control_handler_function_t) (vlib_main_t * vm,
+ u32 sw_if_index,
+ u8 ** contents);
+
+static uword
+srp_control_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ vlib_node_runtime_t * error_node;
+ static u8 * contents;
+
+ error_node = vlib_node_get_runtime (vm, srp_input_node.index);
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ if (node->flags & VLIB_NODE_FLAG_TRACE)
+ vlib_trace_frame_buffers_only (vm, node,
+ from,
+ n_left_from,
+ sizeof (from[0]),
+ sizeof (srp_input_trace_t));
+
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0, l2_len0, l3_len0;
+ vlib_buffer_t * b0;
+ u8 next0, error0;
+ srp_generic_control_header_t * s0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_to_next -= 1;
+ n_left_from -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ s0 = (void *) (b0->data + b0->current_data);
+ l2_len0 = vlib_buffer_length_in_chain (vm, b0);
+ l3_len0 = l2_len0 - STRUCT_OFFSET_OF (srp_generic_control_header_t, control);
+
+ error0 = SRP_ERROR_CONTROL_PACKETS_PROCESSED;
+
+ error0 = s0->control.version != 0 ? SRP_ERROR_CONTROL_VERSION_NON_ZERO : error0;
+
+ {
+ u16 save0 = s0->control.checksum;
+ u16 computed0;
+ s0->control.checksum = 0;
+ computed0 = ~ip_csum_fold (ip_incremental_checksum (0, &s0->control, l3_len0));
+ error0 = save0 != computed0 ? SRP_ERROR_CONTROL_BAD_CHECKSUM : error0;
+ }
+
+ if (error0 == SRP_ERROR_CONTROL_PACKETS_PROCESSED)
+ {
+ static srp_control_handler_function_t * t[SRP_N_CONTROL_PACKET_TYPE] = {
+ [SRP_CONTROL_PACKET_TYPE_topology] = srp_topology_packet,
+ };
+ srp_control_handler_function_t * f;
+
+ f = 0;
+ if (s0->control.type < ARRAY_LEN (t))
+ f = t[s0->control.type];
+
+ if (f)
+ {
+ vec_validate (contents, l2_len0 - 1);
+ vlib_buffer_contents (vm, bi0, contents);
+ error0 = f (vm, vnet_buffer (b0)->sw_if_index[VLIB_RX], &contents);
+ }
+ else
+ error0 = SRP_ERROR_UNKNOWN_CONTROL;
+ }
+
+ b0->error = error_node->errors[error0];
+ next0 = 0;
+
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return from_frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (srp_control_input_node,static) = {
+ .function = srp_control_input,
+ .name = "srp-control",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_next_nodes = 1,
+ .next_nodes = {
+ [0] = "error-drop",
+ },
+
+ .format_buffer = format_srp_header_with_length,
+ .format_trace = format_srp_input_trace,
+ .unformat_buffer = unformat_srp_header,
+};
+
+static u8 * format_srp_ips_request_type (u8 * s, va_list * args)
+{
+ u32 x = va_arg (*args, u32);
+ char * t = 0;
+ switch (x)
+ {
+#define _(f,n) case SRP_IPS_REQUEST_##f: t = #f; break;
+ foreach_srp_ips_request_type
+#undef _
+ default:
+ return format (s, "unknown 0x%x", x);
+ }
+ return format (s, "%U", format_c_identifier, t);
+}
+
+static u8 * format_srp_ips_status (u8 * s, va_list * args)
+{
+ u32 x = va_arg (*args, u32);
+ char * t = 0;
+ switch (x)
+ {
+#define _(f,n) case SRP_IPS_STATUS_##f: t = #f; break;
+ foreach_srp_ips_status
+#undef _
+ default:
+ return format (s, "unknown 0x%x", x);
+ }
+ return format (s, "%U", format_c_identifier, t);
+}
+
+static u8 * format_srp_ips_state (u8 * s, va_list * args)
+{
+ u32 x = va_arg (*args, u32);
+ char * t = 0;
+ switch (x)
+ {
+#define _(f) case SRP_IPS_STATE_##f: t = #f; break;
+ foreach_srp_ips_state
+#undef _
+ default:
+ return format (s, "unknown 0x%x", x);
+ }
+ return format (s, "%U", format_c_identifier, t);
+}
+
+static u8 * format_srp_ring (u8 * s, va_list * args)
+{
+ u32 ring = va_arg (*args, u32);
+ return format (s, "%s", ring == SRP_RING_INNER ? "inner" : "outer");
+}
+
+static u8 * format_srp_ips_header (u8 * s, va_list * args)
+{
+ srp_ips_header_t * h = va_arg (*args, srp_ips_header_t *);
+
+ s = format (s, "%U, %U, %U, %s-path",
+ format_srp_ips_request_type, h->request_type,
+ format_ethernet_address, h->originator_address,
+ format_srp_ips_status, h->status,
+ h->is_long_path ? "long" : "short");
+
+ return s;
+}
+
+static u8 * format_srp_interface (u8 * s, va_list * args)
+{
+ srp_interface_t * si = va_arg (*args, srp_interface_t *);
+ srp_interface_ring_t * ir;
+
+ s = format (s, "address %U, IPS state %U",
+ format_ethernet_address, si->my_address,
+ format_srp_ips_state, si->current_ips_state);
+ for (ir = si->rings; ir < si->rings + SRP_N_RING; ir++)
+ if (ir->rx_neighbor_address_valid)
+ s = format (s, ", %U neighbor %U",
+ format_srp_ring, ir->ring,
+ format_ethernet_address, ir->rx_neighbor_address);
+
+ return s;
+}
+
+u8 * format_srp_device (u8 * s, va_list * args)
+{
+ u32 hw_if_index = va_arg (*args, u32);
+ CLIB_UNUSED (int verbose) = va_arg (*args, int);
+ vnet_main_t * vnm = vnet_get_main();
+ srp_main_t * sm = &srp_main;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, hw_if_index);
+ srp_interface_t * si = pool_elt_at_index (sm->interface_pool, hi->hw_instance);
+ return format (s, "%U", format_srp_interface, si);
+}
+
+always_inline srp_interface_t *
+srp_get_interface (u32 sw_if_index, srp_ring_type_t * ring)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ srp_main_t * sm = &srp_main;
+ vnet_hw_interface_t * hi = vnet_get_sup_hw_interface (vnm, sw_if_index);
+ srp_interface_t * si;
+
+ ASSERT (hi->hw_class_index == srp_hw_interface_class.index);
+ si = pool_elt_at_index (sm->interface_pool, hi->hw_instance);
+
+ ASSERT (si->rings[SRP_RING_INNER].hw_if_index == hi->hw_if_index
+ || si->rings[SRP_RING_OUTER].hw_if_index == hi->hw_if_index);
+ if (ring)
+ *ring =
+ (hi->hw_if_index == si->rings[SRP_RING_INNER].hw_if_index
+ ? SRP_RING_INNER
+ : SRP_RING_OUTER);
+
+ return si;
+}
+
+static void init_ips_packet (srp_interface_t * si,
+ srp_ring_type_t tx_ring,
+ srp_ips_header_t * i)
+{
+ memset (i, 0, sizeof (i[0]));
+
+ i->srp.ttl = 1;
+ i->srp.is_inner_ring = tx_ring;
+ i->srp.priority = 7;
+ i->srp.mode = SRP_MODE_control_locally_buffered_for_host;
+ srp_header_compute_parity (&i->srp);
+
+ memcpy (&i->ethernet.src_address, &si->my_address, sizeof (si->my_address));
+ i->ethernet.type = clib_host_to_net_u16 (ETHERNET_TYPE_SRP_CONTROL);
+
+ /* Checksum will be filled in later. */
+ i->control.version = 0;
+ i->control.type = SRP_CONTROL_PACKET_TYPE_ips;
+ i->control.ttl = 255;
+
+ memcpy (&i->originator_address, &si->my_address, sizeof (si->my_address));
+}
+
+static void tx_ips_packet (srp_interface_t * si,
+ srp_ring_type_t tx_ring,
+ srp_ips_header_t * i)
+{
+ srp_main_t * sm = &srp_main;
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_main_t * vm = sm->vlib_main;
+ vnet_hw_interface_t * hi = vnet_get_hw_interface (vnm, si->rings[tx_ring].hw_if_index);
+ vlib_frame_t * f;
+ vlib_buffer_t * b;
+ u32 * to_next, bi;
+
+ if (! vnet_sw_interface_is_admin_up (vnm, hi->sw_if_index))
+ return;
+ if (hi->hw_class_index != srp_hw_interface_class.index)
+ return;
+
+ i->control.checksum
+ = ~ip_csum_fold (ip_incremental_checksum (0, &i->control,
+ sizeof (i[0]) - STRUCT_OFFSET_OF (srp_ips_header_t, control)));
+
+ bi = vlib_buffer_add_data (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX,
+ /* buffer to append to */ 0,
+ i, sizeof (i[0]));
+
+ /* FIXME trace. */
+ if (0)
+ clib_warning ("%U %U",
+ format_vnet_sw_if_index_name, vnm, hi->sw_if_index,
+ format_srp_ips_header, i);
+
+ b = vlib_get_buffer (vm, bi);
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = hi->sw_if_index;
+
+ f = vlib_get_frame_to_node (vm, hi->output_node_index);
+ to_next = vlib_frame_vector_args (f);
+ to_next[0] = bi;
+ f->n_vectors = 1;
+ vlib_put_frame_to_node (vm, hi->output_node_index, f);
+}
+
+static void serialize_srp_interface_state_msg (serialize_main_t * m, va_list * va)
+{
+ srp_interface_t * si = va_arg (*va, srp_interface_t *);
+ srp_main_t * sm = &srp_main;
+ int r;
+
+ ASSERT (! pool_is_free (sm->interface_pool, si));
+ serialize_integer (m, si - sm->interface_pool, sizeof (u32));
+ serialize_likely_small_unsigned_integer (m, si->current_ips_state);
+ for (r = 0; r < SRP_N_RING; r++)
+ {
+ srp_interface_ring_t * ir = &si->rings[r];
+ void * p;
+ serialize_likely_small_unsigned_integer (m, ir->rx_neighbor_address_valid);
+ if (ir->rx_neighbor_address_valid)
+ {
+ p = serialize_get (m, sizeof (ir->rx_neighbor_address));
+ memcpy (p, ir->rx_neighbor_address, sizeof (ir->rx_neighbor_address));
+ }
+ serialize_likely_small_unsigned_integer (m, ir->waiting_to_restore);
+ if (ir->waiting_to_restore)
+ serialize (m, serialize_f64, ir->wait_to_restore_start_time);
+ }
+}
+
+static void unserialize_srp_interface_state_msg (serialize_main_t * m, va_list * va)
+{
+ CLIB_UNUSED (mc_main_t * mcm) = va_arg (*va, mc_main_t *);
+ srp_main_t * sm = &srp_main;
+ srp_interface_t * si;
+ u32 si_index, r;
+
+ unserialize_integer (m, &si_index, sizeof (u32));
+ si = pool_elt_at_index (sm->interface_pool, si_index);
+ si->current_ips_state = unserialize_likely_small_unsigned_integer (m);
+ for (r = 0; r < SRP_N_RING; r++)
+ {
+ srp_interface_ring_t * ir = &si->rings[r];
+ void * p;
+ ir->rx_neighbor_address_valid = unserialize_likely_small_unsigned_integer (m);
+ if (ir->rx_neighbor_address_valid)
+ {
+ p = unserialize_get (m, sizeof (ir->rx_neighbor_address));
+ memcpy (ir->rx_neighbor_address, p, sizeof (ir->rx_neighbor_address));
+ }
+ ir->waiting_to_restore = unserialize_likely_small_unsigned_integer (m);
+ if (ir->waiting_to_restore)
+ unserialize (m, unserialize_f64, &ir->wait_to_restore_start_time);
+ }
+}
+
+MC_SERIALIZE_MSG (srp_interface_state_msg, static) = {
+ .name = "vnet_srp_interface_state",
+ .serialize = serialize_srp_interface_state_msg,
+ .unserialize = unserialize_srp_interface_state_msg,
+};
+
+static int requests_switch (srp_ips_request_type_t r)
+{
+ static u8 t[16] = {
+ [SRP_IPS_REQUEST_forced_switch] = 1,
+ [SRP_IPS_REQUEST_manual_switch] = 1,
+ [SRP_IPS_REQUEST_signal_fail] = 1,
+ [SRP_IPS_REQUEST_signal_degrade] = 1,
+ };
+ return r < ARRAY_LEN (t) ? t[r] : 0;
+}
+
+/* Called when an IPS control packet is received on given interface. */
+void srp_ips_rx_packet (u32 sw_if_index, srp_ips_header_t * h)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vlib_main_t * vm = srp_main.vlib_main;
+ srp_ring_type_t rx_ring;
+ srp_interface_t * si = srp_get_interface (sw_if_index, &rx_ring);
+ srp_interface_ring_t * ir = &si->rings[rx_ring];
+ int si_needs_broadcast = 0;
+
+ /* FIXME trace. */
+ if (0)
+ clib_warning ("%U %U %U",
+ format_time_interval, "h:m:s:u", vlib_time_now (vm),
+ format_vnet_sw_if_index_name, vnm, sw_if_index,
+ format_srp_ips_header, h);
+
+ /* Ignore self-generated IPS packets. */
+ if (! memcmp (h->originator_address, si->my_address, sizeof (h->originator_address)))
+ goto done;
+
+ /* Learn neighbor address from short path messages. */
+ if (! h->is_long_path)
+ {
+ if (ir->rx_neighbor_address_valid
+ && memcmp (ir->rx_neighbor_address, h->originator_address, sizeof (ir->rx_neighbor_address)))
+ {
+ ASSERT (0);
+ }
+ ir->rx_neighbor_address_valid = 1;
+ memcpy (ir->rx_neighbor_address, h->originator_address, sizeof (ir->rx_neighbor_address));
+ }
+
+ switch (si->current_ips_state)
+ {
+ case SRP_IPS_STATE_idle:
+ /* Received {REQ,NEIGHBOR,W,S} in idle state: wrap. */
+ if (requests_switch (h->request_type)
+ && ! h->is_long_path
+ && h->status == SRP_IPS_STATUS_wrapped)
+ {
+ srp_ips_header_t to_tx[2];
+
+ si_needs_broadcast = 1;
+ si->current_ips_state = SRP_IPS_STATE_wrapped;
+ si->hw_wrap_function (si->rings[SRP_SIDE_A].hw_if_index, /* enable_wrap */ 1);
+ si->hw_wrap_function (si->rings[SRP_SIDE_B].hw_if_index, /* enable_wrap */ 1);
+
+ init_ips_packet (si, rx_ring ^ 0, &to_tx[0]);
+ to_tx[0].request_type = SRP_IPS_REQUEST_idle;
+ to_tx[0].status = SRP_IPS_STATUS_wrapped;
+ to_tx[0].is_long_path = 0;
+ tx_ips_packet (si, rx_ring ^ 0, &to_tx[0]);
+
+ init_ips_packet (si, rx_ring ^ 1, &to_tx[1]);
+ to_tx[1].request_type = h->request_type;
+ to_tx[1].status = SRP_IPS_STATUS_wrapped;
+ to_tx[1].is_long_path = 1;
+ tx_ips_packet (si, rx_ring ^ 1, &to_tx[1]);
+ }
+ break;
+
+ case SRP_IPS_STATE_wrapped:
+ if (! h->is_long_path
+ && h->request_type == SRP_IPS_REQUEST_idle
+ && h->status == SRP_IPS_STATUS_idle)
+ {
+ si_needs_broadcast = 1;
+ si->current_ips_state = SRP_IPS_STATE_idle;
+ si->hw_wrap_function (si->rings[SRP_SIDE_A].hw_if_index, /* enable_wrap */ 0);
+ si->hw_wrap_function (si->rings[SRP_SIDE_B].hw_if_index, /* enable_wrap */ 0);
+ }
+ break;
+
+ case SRP_IPS_STATE_pass_thru:
+ /* FIXME */
+ break;
+
+ default:
+ abort ();
+ break;
+ }
+
+ done:
+ if (vm->mc_main && si_needs_broadcast)
+ mc_serialize (vm->mc_main, &srp_interface_state_msg, si);
+}
+
+/* Perform local IPS request on given interface. */
+void srp_ips_local_request (u32 sw_if_index, srp_ips_request_type_t request)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ srp_main_t * sm = &srp_main;
+ vlib_main_t * vm = sm->vlib_main;
+ srp_ring_type_t rx_ring;
+ srp_interface_t * si = srp_get_interface (sw_if_index, &rx_ring);
+ srp_interface_ring_t * ir = &si->rings[rx_ring];
+ int si_needs_broadcast = 0;
+
+ if (request == SRP_IPS_REQUEST_wait_to_restore)
+ {
+ if (si->current_ips_state != SRP_IPS_STATE_wrapped)
+ return;
+ if (! ir->waiting_to_restore)
+ {
+ ir->wait_to_restore_start_time = vlib_time_now (sm->vlib_main);
+ ir->waiting_to_restore = 1;
+ si_needs_broadcast = 1;
+ }
+ }
+ else
+ {
+ /* FIXME handle local signal fail. */
+ si_needs_broadcast = ir->waiting_to_restore;
+ ir->wait_to_restore_start_time = 0;
+ ir->waiting_to_restore = 0;
+ }
+
+ /* FIXME trace. */
+ if (0)
+ clib_warning ("%U %U",
+ format_vnet_sw_if_index_name, vnm, sw_if_index,
+ format_srp_ips_request_type, request);
+
+ if (vm->mc_main && si_needs_broadcast)
+ mc_serialize (vm->mc_main, &srp_interface_state_msg, si);
+}
+
+static void maybe_send_ips_message (srp_interface_t * si)
+{
+ srp_main_t * sm = &srp_main;
+ srp_ips_header_t to_tx[2];
+ srp_ring_type_t rx_ring = SRP_RING_OUTER;
+ srp_interface_ring_t * r0 = &si->rings[rx_ring ^ 0];
+ srp_interface_ring_t * r1 = &si->rings[rx_ring ^ 1];
+ f64 now = vlib_time_now (sm->vlib_main);
+
+ if (! si->ips_process_enable)
+ return;
+
+ if (si->current_ips_state == SRP_IPS_STATE_wrapped
+ && r0->waiting_to_restore
+ && r1->waiting_to_restore
+ && now >= r0->wait_to_restore_start_time + si->config.wait_to_restore_idle_delay
+ && now >= r1->wait_to_restore_start_time + si->config.wait_to_restore_idle_delay)
+ {
+ si->current_ips_state = SRP_IPS_STATE_idle;
+ r0->waiting_to_restore = r1->waiting_to_restore = 0;
+ r0->wait_to_restore_start_time = r1->wait_to_restore_start_time = 0;
+ }
+
+ if (si->current_ips_state != SRP_IPS_STATE_idle)
+ return;
+
+ init_ips_packet (si, rx_ring ^ 0, &to_tx[0]);
+ init_ips_packet (si, rx_ring ^ 1, &to_tx[1]);
+
+ if (si->current_ips_state == SRP_IPS_STATE_idle)
+ {
+ to_tx[0].request_type = to_tx[1].request_type = SRP_IPS_REQUEST_idle;
+ to_tx[0].status = to_tx[1].status = SRP_IPS_STATUS_idle;
+ to_tx[0].is_long_path = to_tx[1].is_long_path = 0;
+ }
+
+ else if (si->current_ips_state == SRP_IPS_STATE_wrapped)
+ {
+ to_tx[0].request_type =
+ (si->rings[rx_ring ^ 0].waiting_to_restore
+ ? SRP_IPS_REQUEST_wait_to_restore
+ : SRP_IPS_REQUEST_signal_fail);
+ to_tx[1].request_type =
+ (si->rings[rx_ring ^ 1].waiting_to_restore
+ ? SRP_IPS_REQUEST_wait_to_restore
+ : SRP_IPS_REQUEST_signal_fail);
+ to_tx[0].status = to_tx[1].status = SRP_IPS_STATUS_wrapped;
+ to_tx[0].is_long_path = 0;
+ to_tx[1].is_long_path = 1;
+ }
+
+ tx_ips_packet (si, rx_ring ^ 0, &to_tx[0]);
+ tx_ips_packet (si, rx_ring ^ 1, &to_tx[1]);
+}
+
+static uword
+srp_ips_process (vlib_main_t * vm,
+ vlib_node_runtime_t * rt,
+ vlib_frame_t * f)
+{
+ srp_main_t * sm = &srp_main;
+ srp_interface_t * si;
+
+ while (1)
+ {
+ pool_foreach (si, sm->interface_pool, ({
+ maybe_send_ips_message (si);
+ }));
+ vlib_process_suspend (vm, 1.0);
+ }
+
+ return 0;
+}
+
+VLIB_REGISTER_NODE (srp_ips_process_node) = {
+ .function = srp_ips_process,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "srp-ips-process",
+ .state = VLIB_NODE_STATE_DISABLED,
+};
+
+static clib_error_t * srp_init (vlib_main_t * vm)
+{
+ srp_main_t * sm = &srp_main;
+
+ sm->default_data_ttl = 255;
+ sm->vlib_main = vm;
+ srp_setup_node (vm, srp_input_node.index);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (srp_init);
diff --git a/vnet/vnet/srp/packet.h b/vnet/vnet/srp/packet.h
new file mode 100644
index 00000000000..96dab648b32
--- /dev/null
+++ b/vnet/vnet/srp/packet.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * srp/packet.h: srp packet format.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_srp_packet_h
+#define included_srp_packet_h
+
+#include <vppinfra/byte_order.h>
+#include <vppinfra/bitops.h>
+#include <vnet/ethernet/packet.h>
+
+/* SRP version 2. */
+
+#define foreach_srp_mode \
+ _ (reserved0) \
+ _ (reserved1) \
+ _ (reserved2) \
+ _ (reserved3) \
+ _ (control_pass_to_host) \
+ _ (control_locally_buffered_for_host) \
+ _ (keep_alive) \
+ _ (data)
+
+typedef enum {
+#define _(f) SRP_MODE_##f,
+ foreach_srp_mode
+#undef _
+ SRP_N_MODE,
+} srp_mode_t;
+
+typedef union {
+ /* For computing parity bit. */
+ u16 as_u16;
+
+ struct {
+ u8 ttl;
+
+#if CLIB_ARCH_IS_BIG_ENDIAN
+ u8 is_inner_ring : 1;
+ u8 mode : 3;
+ u8 priority : 3;
+ u8 parity : 1;
+#endif
+#if CLIB_ARCH_IS_LITTLE_ENDIAN
+ u8 parity : 1;
+ u8 priority : 3;
+ u8 mode : 3;
+ u8 is_inner_ring : 1;
+#endif
+ };
+} srp_header_t;
+
+always_inline void
+srp_header_compute_parity (srp_header_t * h)
+{
+ h->parity = 0;
+ h->parity = count_set_bits (h->as_u16) ^ 1; /* odd parity */
+}
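+
+/* Editor's illustrative sketch (not part of the original patch): building
+   a data-mode SRP header by hand and stamping its parity bit, using only
+   the fields defined above:
+
+     srp_header_t h;
+     h.as_u16 = 0;
+     h.ttl = 255;
+     h.mode = SRP_MODE_data;
+     h.priority = 0;
+     h.is_inner_ring = 0;
+     srp_header_compute_parity (&h);
+
+   The parity bit is chosen so the full 16-bit header has an odd number of
+   set bits, matching the computation above. */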
+
+typedef struct {
+ srp_header_t srp;
+ ethernet_header_t ethernet;
+} srp_and_ethernet_header_t;
+
+#define foreach_srp_control_packet_type \
+ _ (reserved) \
+ _ (topology) \
+ _ (ips)
+
+typedef enum {
+#define _(f) SRP_CONTROL_PACKET_TYPE_##f,
+ foreach_srp_control_packet_type
+#undef _
+ SRP_N_CONTROL_PACKET_TYPE,
+} srp_control_packet_type_t;
+
+typedef CLIB_PACKED (struct {
+ /* Set to 0. */
+ u8 version;
+
+ srp_control_packet_type_t type : 8;
+
+ /* IP4-like checksum of packet starting with start of control header. */
+ u16 checksum;
+
+ u16 ttl;
+}) srp_control_header_t;
+
+typedef struct {
+ srp_header_t srp;
+ ethernet_header_t ethernet;
+ srp_control_header_t control;
+} srp_generic_control_header_t;
+
+typedef struct {
+ u8 flags;
+#define SRP_TOPOLOGY_MAC_BINDING_FLAG_IS_INNER_RING (1 << 6)
+#define SRP_TOPOLOGY_MAC_BINDING_FLAG_IS_WRAPPED (1 << 5)
+
+ /* MAC address. */
+ u8 address[6];
+} srp_topology_mac_binding_t;
+
+typedef CLIB_PACKED (struct {
+ srp_header_t srp;
+ ethernet_header_t ethernet;
+ srp_control_header_t control;
+
+ /* Length in bytes of data that follows. */
+ u16 n_bytes_of_data_that_follows;
+
+ /* MAC address of originator of this topology request. */
+ u8 originator_address[6];
+
+ /* Bindings follow. */
+ srp_topology_mac_binding_t bindings[0];
+}) srp_topology_header_t;
+
+#define foreach_srp_ips_request_type \
+ _ (idle, 0x0) \
+ _ (wait_to_restore, 0x5) \
+ _ (manual_switch, 0x6) \
+ _ (signal_degrade, 0x8) \
+ _ (signal_fail, 0xb) \
+ _ (forced_switch, 0xd)
+
+typedef enum {
+#define _(f,n) SRP_IPS_REQUEST_##f = n,
+ foreach_srp_ips_request_type
+#undef _
+} srp_ips_request_type_t;
+
+#define foreach_srp_ips_status \
+ _ (idle, 0x0) \
+ _ (wrapped, 0x2)
+
+typedef enum {
+#define _(f,n) SRP_IPS_STATUS_##f = n,
+ foreach_srp_ips_status
+#undef _
+} srp_ips_status_t;
+
+typedef struct {
+ srp_header_t srp;
+ ethernet_header_t ethernet;
+ srp_control_header_t control;
+ u8 originator_address[6];
+
+ union {
+ u8 ips_octet;
+
+ struct {
+#if CLIB_ARCH_IS_BIG_ENDIAN
+ u8 request_type : 4;
+ u8 is_long_path : 1;
+ u8 status : 3;
+#endif
+#if CLIB_ARCH_IS_LITTLE_ENDIAN
+ u8 status : 3;
+ u8 is_long_path : 1;
+ u8 request_type : 4;
+#endif
+ };
+ };
+
+ u8 reserved;
+} srp_ips_header_t;
+
+#endif /* included_srp_packet_h */
diff --git a/vnet/vnet/srp/pg.c b/vnet/vnet/srp/pg.c
new file mode 100644
index 00000000000..54f1a3bba18
--- /dev/null
+++ b/vnet/vnet/srp/pg.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * srp/pg.c: packet generator srp interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/srp/srp.h>
+#include <vnet/ethernet/ethernet.h>
+
+typedef struct {
+ pg_edit_t ttl;
+ pg_edit_t is_inner_ring;
+ pg_edit_t mode;
+ pg_edit_t priority;
+ pg_edit_t parity;
+ pg_edit_t type;
+ pg_edit_t src_address;
+ pg_edit_t dst_address;
+} pg_srp_header_t;
+
+static inline void
+pg_srp_header_init (pg_srp_header_t * e)
+{
+ pg_edit_init (&e->ttl, srp_and_ethernet_header_t, srp.ttl);
+ pg_edit_init_bitfield (&e->is_inner_ring, srp_and_ethernet_header_t,
+ srp.as_u16,
+ 7, 1);
+ pg_edit_init_bitfield (&e->mode, srp_and_ethernet_header_t,
+ srp.as_u16,
+ 4, 3);
+ pg_edit_init_bitfield (&e->priority, srp_and_ethernet_header_t,
+ srp.as_u16,
+ 1, 3);
+ pg_edit_init_bitfield (&e->parity, srp_and_ethernet_header_t,
+ srp.as_u16,
+ 0, 1);
+ pg_edit_init (&e->type, srp_and_ethernet_header_t, ethernet.type);
+ pg_edit_init (&e->src_address, srp_and_ethernet_header_t, ethernet.src_address);
+ pg_edit_init (&e->dst_address, srp_and_ethernet_header_t, ethernet.dst_address);
+}
+
+uword
+unformat_pg_srp_header (unformat_input_t * input, va_list * args)
+{
+ pg_stream_t * s = va_arg (*args, pg_stream_t *);
+ pg_srp_header_t * e;
+ u32 error, group_index;
+
+ e = pg_create_edit_group (s, sizeof (e[0]), sizeof (srp_header_t),
+ &group_index);
+ pg_srp_header_init (e);
+
+ error = 1;
+ if (! unformat (input, "%U: %U -> %U",
+ unformat_pg_edit,
+ unformat_ethernet_type_net_byte_order, &e->type,
+ unformat_pg_edit,
+ unformat_ethernet_address, &e->src_address,
+ unformat_pg_edit,
+ unformat_ethernet_address, &e->dst_address))
+ goto done;
+
+ {
+ srp_header_t h;
+
+ h.as_u16 = 0;
+ h.mode = SRP_MODE_data;
+ h.ttl = 255;
+ h.parity = count_set_bits (h.as_u16) ^ 1;
+
+ pg_edit_set_fixed (&e->mode, h.mode);
+ pg_edit_set_fixed (&e->ttl, h.ttl);
+ pg_edit_set_fixed (&e->is_inner_ring, h.is_inner_ring);
+ pg_edit_set_fixed (&e->priority, h.priority);
+ pg_edit_set_fixed (&e->parity, h.parity);
+ }
+
+ error = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "mode %U",
+ unformat_pg_edit,
+ unformat_pg_number, &e->mode))
+ ;
+ else if (unformat (input, "ttl %U",
+ unformat_pg_edit,
+ unformat_pg_number, &e->ttl))
+ ;
+ else if (unformat (input, "priority %U",
+ unformat_pg_edit,
+ unformat_pg_number, &e->priority))
+ ;
+ else
+ break;
+ }
+
+ {
+ ethernet_main_t * em = &ethernet_main;
+ ethernet_type_info_t * ti = 0;
+ pg_node_t * pg_node = 0;
+
+ if (e->type.type == PG_EDIT_FIXED)
+ {
+ u16 t = *(u16 *) e->type.values[PG_EDIT_LO];
+ ti = ethernet_get_type_info (em, clib_net_to_host_u16 (t));
+ if (ti && ti->node_index != ~0)
+ pg_node = pg_get_node (ti->node_index);
+ }
+
+ if (pg_node && pg_node->unformat_edit
+ && unformat_user (input, pg_node->unformat_edit, s))
+ ;
+ else if (! unformat_user (input, unformat_pg_payload, s))
+ goto done;
+ }
+
+ done:
+ if (error)
+ pg_free_edit_group (s);
+ return error == 0;
+}
+
diff --git a/vnet/vnet/srp/srp.h b/vnet/vnet/srp/srp.h
new file mode 100644
index 00000000000..1b241710409
--- /dev/null
+++ b/vnet/vnet/srp/srp.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * srp.h: types/functions for srp.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_srp_h
+#define included_srp_h
+
+#include <vnet/vnet.h>
+#include <vnet/srp/packet.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/pg/pg.h>
+
+extern vnet_hw_interface_class_t srp_hw_interface_class;
+
+/* See RFC 2892. */
+#define foreach_srp_ips_state \
+ _ (idle) \
+ _ (pass_thru) \
+ _ (wrapped)
+
+typedef enum {
+#define _(f) SRP_IPS_STATE_##f,
+ foreach_srp_ips_state
+#undef _
+ SRP_N_IPS_STATE,
+} srp_ips_state_t;
+
+typedef enum {
+ SRP_RING_OUTER,
+ SRP_RING_INNER,
+ SRP_N_RING = 2,
+ SRP_SIDE_A = SRP_RING_OUTER, /* outer rx, inner tx */
+ SRP_SIDE_B = SRP_RING_INNER, /* inner rx, outer tx */
+ SRP_N_SIDE = 2,
+} srp_ring_type_t;
+
+typedef struct {
+ srp_ring_type_t ring;
+
+ /* Hardware interface for this ring/side. */
+ u32 hw_if_index;
+
+ /* Software interface corresponding to hardware interface. */
+ u32 sw_if_index;
+
+ /* Mac address of neighbor on RX fiber. */
+ u8 rx_neighbor_address[6];
+
+ u8 rx_neighbor_address_valid;
+
+ /* True if we are waiting to restore signal. */
+ u8 waiting_to_restore;
+
+ /* Time stamp when signal became valid. */
+ f64 wait_to_restore_start_time;
+} srp_interface_ring_t;
+
+struct srp_interface_t;
+typedef void (srp_hw_wrap_function_t) (u32 hw_if_index, u32 wrap_enable);
+typedef void (srp_hw_enable_function_t) (struct srp_interface_t * si, u32 wrap_enable);
+
+typedef struct {
+ /* Delay between wait to restore event and entering idle state in seconds. */
+ f64 wait_to_restore_idle_delay;
+
+ /* Number of seconds between sending ips messages to neighbors. */
+ f64 ips_tx_interval;
+} srp_interface_config_t;
+
+typedef struct srp_interface_t {
+ /* Current IPS state. */
+ srp_ips_state_t current_ips_state;
+
+ /* Address for this interface. */
+ u8 my_address[6];
+
+ /* Enable IPS process handling for this interface. */
+ u8 ips_process_enable;
+
+ srp_interface_ring_t rings[SRP_N_RING];
+
+ /* Configurable parameters. */
+ srp_interface_config_t config;
+
+ srp_hw_wrap_function_t * hw_wrap_function;
+
+ srp_hw_enable_function_t * hw_enable_function;
+} srp_interface_t;
+
+typedef struct {
+ vlib_main_t * vlib_main;
+
+ /* Pool of SRP interfaces. */
+ srp_interface_t * interface_pool;
+
+ uword * interface_index_by_hw_if_index;
+
+ /* TTL to use for outgoing data packets. */
+ u32 default_data_ttl;
+
+ vlib_one_time_waiting_process_t * srp_register_interface_waiting_process_pool;
+
+ uword * srp_register_interface_waiting_process_pool_index_by_hw_if_index;
+} srp_main_t;
+
+/* Registers the side A/B hardware interfaces as being SRP capable. */
+void srp_register_interface (u32 * hw_if_indices);
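+
+/* Editor's illustrative sketch (hypothetical indices hw_if_index_a and
+   hw_if_index_b, assumed already created by the device driver):
+
+     u32 hw_if_indices[SRP_N_SIDE];
+     hw_if_indices[SRP_SIDE_A] = hw_if_index_a;
+     hw_if_indices[SRP_SIDE_B] = hw_if_index_b;
+     srp_register_interface (hw_if_indices);
+*/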
+
+/* Enable sending of IPS messages for the SRP interface implied by the given vlib hardware interface. */
+void srp_interface_enable_ips (u32 hw_if_index);
+
+/* Set function to wrap hardware side of SRP interface. */
+void srp_interface_set_hw_wrap_function (u32 hw_if_index, srp_hw_wrap_function_t * f);
+
+void srp_interface_set_hw_enable_function (u32 hw_if_index, srp_hw_enable_function_t * f);
+
+vlib_node_registration_t srp_ips_process_node;
+
+/* Called when an IPS control packet is received on given interface. */
+void srp_ips_rx_packet (u32 sw_if_index, srp_ips_header_t * ips_packet);
+
+/* Perform local IPS request on given interface. */
+void srp_ips_local_request (u32 sw_if_index, srp_ips_request_type_t request);
+
+always_inline void
+srp_ips_link_change (u32 sw_if_index, u32 link_is_up)
+{
+ srp_ips_local_request (sw_if_index,
+ link_is_up
+ ? SRP_IPS_REQUEST_wait_to_restore
+ : SRP_IPS_REQUEST_signal_fail);
+}
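+
+/* Editor's illustrative sketch: a device driver's link up/down handler,
+   with hypothetical my_sw_if_index and link_up values, reduces to:
+
+     srp_ips_link_change (my_sw_if_index, link_up);
+
+   i.e. link up requests wait-to-restore, link down requests signal-fail. */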
+
+void srp_interface_get_interface_config (u32 hw_if_index, srp_interface_config_t * c);
+void srp_interface_set_interface_config (u32 hw_if_index, srp_interface_config_t * c);
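+
+/* Editor's illustrative sketch: doubling the wait-to-restore delay on a
+   hypothetical hw_if_index:
+
+     srp_interface_config_t c;
+     srp_interface_get_interface_config (hw_if_index, &c);
+     c.wait_to_restore_idle_delay *= 2;
+     srp_interface_set_interface_config (hw_if_index, &c);
+*/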
+
+srp_main_t srp_main;
+
+always_inline srp_interface_t *
+srp_get_interface_from_vnet_hw_interface (u32 hw_if_index)
+{
+ srp_main_t * sm = &srp_main;
+ uword * p = hash_get (sm->interface_index_by_hw_if_index, hw_if_index);
+ return p ? pool_elt_at_index (sm->interface_pool, p[0]) : 0;
+}
+
+u8 * format_srp_header (u8 * s, va_list * args);
+u8 * format_srp_header_with_length (u8 * s, va_list * args);
+u8 * format_srp_device (u8 * s, va_list * args);
+
+/* Parse srp header. */
+uword
+unformat_srp_header (unformat_input_t * input, va_list * args);
+
+uword unformat_pg_srp_header (unformat_input_t * input, va_list * args);
+
+always_inline void
+srp_setup_node (vlib_main_t * vm, u32 node_index)
+{
+ vlib_node_t * n = vlib_get_node (vm, node_index);
+ pg_node_t * pn = pg_get_node (node_index);
+ n->format_buffer = format_srp_header_with_length;
+ n->unformat_buffer = unformat_srp_header;
+ pn->unformat_edit = unformat_pg_srp_header;
+}
+
+#define foreach_srp_error \
+ _ (NONE, "no error") \
+ _ (UNKNOWN_MODE, "unknown mode in SRP header") \
+ _ (KEEP_ALIVE_DROPPED, "v1 keep alive mode in SRP header") \
+ _ (CONTROL_PACKETS_PROCESSED, "control packets processed") \
+ _ (IPS_PACKETS_PROCESSED, "IPS packets processed") \
+ _ (UNKNOWN_CONTROL, "unknown control packet") \
+ _ (CONTROL_VERSION_NON_ZERO, "control packet with non-zero version") \
+ _ (CONTROL_BAD_CHECKSUM, "control packet with bad checksum") \
+ _ (TOPOLOGY_BAD_LENGTH, "topology packet with bad length")
+
+typedef enum {
+#define _(n,s) SRP_ERROR_##n,
+ foreach_srp_error
+#undef _
+ SRP_N_ERROR,
+} srp_error_t;
+
+serialize_function_t serialize_srp_main, unserialize_srp_main;
+
+#endif /* included_srp_h */
diff --git a/vnet/vnet/unix/gdb_funcs.c b/vnet/vnet/unix/gdb_funcs.c
new file mode 100644
index 00000000000..9a2e4599a2b
--- /dev/null
+++ b/vnet/vnet/unix/gdb_funcs.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/format.h>
+#include <vlib/vlib.h>
+
+#include <vlib/threads.h>
+
+/* Functions to call from gdb */
+
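+/* Editor's note (illustrative only): with gdb attached to a running vpp
+   process, these helpers can be called directly from the debugger, e.g.
+
+     (gdb) call vl (my_vector)
+     (gdb) call pe (my_pool)
+     (gdb) call pifi (my_pool, 10)
+
+   where my_vector / my_pool are hypothetical names of a vec / pool in
+   scope at the breakpoint. */
+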
+u32 vl(void *p)
+{
+ return vec_len (p);
+}
+
+uword pe (void *v)
+{
+ return (pool_elts(v));
+}
+
+int pifi (void *p, u32 index)
+{
+ return pool_is_free_index (p, index);
+}
+
+void debug_hex_bytes (u8 *s, u32 n)
+{
+ fformat (stderr, "%U\n", format_hex_bytes, s, n);
+}
+
+void vlib_dump_frame_ownership (void)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_node_main_t * nm = &vm->node_main;
+ vlib_node_runtime_t * this_node_runtime;
+ vlib_next_frame_t * nf;
+ u32 first_nf_index;
+ u32 index;
+
+ vec_foreach(this_node_runtime, nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
+ {
+ first_nf_index = this_node_runtime->next_frame_index;
+
+ for (index = first_nf_index; index < first_nf_index +
+ this_node_runtime->n_next_nodes; index++)
+ {
+ vlib_node_runtime_t * owned_runtime;
+ nf = vec_elt_at_index (vm->node_main.next_frames, index);
+ if (nf->flags & VLIB_FRAME_OWNER)
+ {
+ owned_runtime = vec_elt_at_index (nm->nodes_by_type[0],
+ nf->node_runtime_index);
+ fformat(stderr,
+ "%s next index %d owns enqueue rights to %s\n",
+ nm->nodes[this_node_runtime->node_index]->name,
+ index - first_nf_index,
+ nm->nodes[owned_runtime->node_index]->name);
+ fformat (stderr, " nf index %d nf->frame_index %d\n",
+ nf - vm->node_main.next_frames,
+ nf->frame_index);
+ }
+ }
+ }
+}
+
+void vlib_runtime_index_to_node_name (u32 index)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_node_main_t * nm = &vm->node_main;
+
+  if (index >= vec_len (nm->nodes))
+    {
+      fformat(stderr, "%d out of range, max %d\n", index, vec_len(nm->nodes));
+ return;
+ }
+
+ fformat(stderr, "node runtime index %d name %s\n", index, nm->nodes[index]->name);
+}
+
+
+static clib_error_t *
+show_gdb_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vlib_cli_output (vm, "vl(p) returns vec_len(p)");
+ vlib_cli_output (vm, "pe(p) returns pool_elts(p)");
+ vlib_cli_output (vm, "pifi(p, i) returns pool_is_free_index(p, i)");
+ vlib_cli_output (vm, "debug_hex_bytes (ptr, n_bytes) dumps n_bytes in hex");
+ vlib_cli_output (vm, "vlib_dump_frame_ownership() does what it says");
+ vlib_cli_output (vm, "vlib_runtime_index_to_node_name (index) prints NN");
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_gdb_funcs_command, static) = {
+ .path = "show gdb",
+ .short_help = "Describe functions which can be called from gdb",
+ .function = show_gdb_command_fn,
+};
+
+/* Cafeteria plan, maybe you don't want these functions */
+clib_error_t *
+gdb_func_init (vlib_main_t * vm) { return 0; }
+
+VLIB_INIT_FUNCTION (gdb_func_init);
diff --git a/vnet/vnet/unix/pcap.c b/vnet/vnet/unix/pcap.c
new file mode 100644
index 00000000000..16b8443085b
--- /dev/null
+++ b/vnet/vnet/unix/pcap.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pcap.c: libpcap packet capture format
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/unix/pcap.h>
+#include <sys/fcntl.h>
+
+/* Usage
+
+#include <vnet/unix/pcap.h>
+
+static pcap_main_t pcap = {
+ .file_name = "/tmp/ip4",
+ .n_packets_to_capture = 2,
+ .packet_type = PCAP_PACKET_TYPE_ip,
+};
+
+To add a buffer:
+
+ pcap_add_buffer (&pcap, vm, pi0, 128);
+
+The file is written once n_packets_to_capture packets have been added, or on
+an explicit call to pcap_write (&pcap).
+
+*/
+
+clib_error_t *
+pcap_write (pcap_main_t * pm)
+{
+ clib_error_t * error = 0;
+
+ if (! (pm->flags & PCAP_MAIN_INIT_DONE))
+ {
+ pcap_file_header_t fh;
+ int n;
+
+ if (! pm->file_name)
+ pm->file_name = "/tmp/vnet.pcap";
+
+ pm->file_descriptor = open (pm->file_name, O_CREAT | O_TRUNC | O_WRONLY, 0664);
+ if (pm->file_descriptor < 0)
+ {
+ error = clib_error_return_unix (0, "failed to open `%s'", pm->file_name);
+ goto done;
+ }
+
+ pm->flags |= PCAP_MAIN_INIT_DONE;
+ pm->n_packets_captured = 0;
+ pm->n_pcap_data_written = 0;
+
+ /* Write file header. */
+ memset (&fh, 0, sizeof (fh));
+ fh.magic = 0xa1b2c3d4;
+ fh.major_version = 2;
+ fh.minor_version = 4;
+ fh.time_zone = 0;
+ fh.max_packet_size_in_bytes = 1 << 16;
+ fh.packet_type = pm->packet_type;
+ n = write (pm->file_descriptor, &fh, sizeof (fh));
+ if (n != sizeof (fh))
+ {
+ if (n < 0)
+ error = clib_error_return_unix (0, "write file header `%s'", pm->file_name);
+ else
+ error = clib_error_return (0, "short write of file header `%s'", pm->file_name);
+ goto done;
+ }
+ }
+
+ do {
+ int n = vec_len (pm->pcap_data) - pm->n_pcap_data_written;
+
+    if (n > 0)
+      {
+        n = write (pm->file_descriptor,
+                   vec_elt_at_index (pm->pcap_data, pm->n_pcap_data_written),
+                   n);
+        if (n < 0 && unix_error_is_fatal (errno))
+          {
+            error = clib_error_return_unix (0, "write `%s'", pm->file_name);
+            goto done;
+          }
+        if (n > 0)
+          pm->n_pcap_data_written += n;
+      }
+ if (pm->n_pcap_data_written >= vec_len (pm->pcap_data))
+ {
+ vec_reset_length (pm->pcap_data);
+ break;
+ }
+ } while (pm->n_packets_captured >= pm->n_packets_to_capture);
+
+ if (pm->n_packets_captured >= pm->n_packets_to_capture)
+ {
+ close (pm->file_descriptor);
+ pm->flags &= ~PCAP_MAIN_INIT_DONE;
+ pm->file_descriptor = -1;
+ }
+
+ done:
+ if (error)
+ {
+ if (pm->file_descriptor >= 0)
+ close (pm->file_descriptor);
+ }
+ return error;
+}
+
+clib_error_t * pcap_read (pcap_main_t * pm)
+{
+ clib_error_t * error = 0;
+ int fd, need_swap, n;
+ pcap_file_header_t fh;
+ pcap_packet_header_t ph;
+
+ fd = open (pm->file_name, O_RDONLY);
+ if (fd < 0)
+ {
+ error = clib_error_return_unix (0, "open `%s'", pm->file_name);
+ goto done;
+ }
+
+ if (read (fd, &fh, sizeof (fh)) != sizeof (fh))
+ {
+ error = clib_error_return_unix (0, "read file header `%s'", pm->file_name);
+ goto done;
+ }
+
+ need_swap = 0;
+ if (fh.magic == 0xd4c3b2a1)
+ {
+ need_swap = 1;
+#define _(t,f) fh.f = clib_byte_swap_##t (fh.f);
+ foreach_pcap_file_header;
+#undef _
+ }
+
+ if (fh.magic != 0xa1b2c3d4)
+ {
+ error = clib_error_return (0, "bad magic `%s'", pm->file_name);
+ goto done;
+ }
+
+ pm->min_packet_bytes = 0;
+ pm->max_packet_bytes = 0;
+ while ((n = read (fd, &ph, sizeof (ph))) != 0)
+ {
+ u8 * data;
+
+ if (need_swap)
+ {
+#define _(t,f) ph.f = clib_byte_swap_##t (ph.f);
+ foreach_pcap_packet_header;
+#undef _
+ }
+
+ data = vec_new (u8, ph.n_bytes_in_packet);
+ if (read (fd, data, ph.n_packet_bytes_stored_in_file) != ph.n_packet_bytes_stored_in_file)
+ {
+ error = clib_error_return (0, "short read `%s'", pm->file_name);
+ goto done;
+ }
+
+ if (vec_len (pm->packets_read) == 0)
+ pm->min_packet_bytes = pm->max_packet_bytes = ph.n_bytes_in_packet;
+ else
+ {
+ pm->min_packet_bytes = clib_min (pm->min_packet_bytes, ph.n_bytes_in_packet);
+ pm->max_packet_bytes = clib_max (pm->max_packet_bytes, ph.n_bytes_in_packet);
+ }
+
+ vec_add1 (pm->packets_read, data);
+ }
+
+ done:
+ if (fd >= 0)
+ close (fd);
+ return error;
+
+}
diff --git a/vnet/vnet/unix/pcap.h b/vnet/vnet/unix/pcap.h
new file mode 100644
index 00000000000..6e8e69191f5
--- /dev/null
+++ b/vnet/vnet/unix/pcap.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pcap.h: libpcap packet capture format
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_pcap_h
+#define included_vnet_pcap_h
+
+#include <vlib/vlib.h>
+
+#define foreach_vnet_pcap_packet_type \
+ _ (null, 0) \
+ _ (ethernet, 1) \
+ _ (ppp, 9) \
+ _ (ip, 12) \
+ _ (hdlc, 104)
+
+typedef enum {
+#define _(f,n) PCAP_PACKET_TYPE_##f = (n),
+ foreach_vnet_pcap_packet_type
+#undef _
+} pcap_packet_type_t;
+
+#define foreach_pcap_file_header \
+ /* 0xa1b2c3d4 host byte order. \
+ 0xd4c3b2a1 => need to byte swap everything. */ \
+ _ (u32, magic) \
+ \
+ /* Currently major 2 minor 4. */ \
+ _ (u16, major_version) \
+ _ (u16, minor_version) \
+ \
+ /* 0 for GMT. */ \
+ _ (u32, time_zone) \
+ \
+ /* Accuracy of timestamps. Typically set to 0. */ \
+ _ (u32, sigfigs) \
+ \
+ /* Size of largest packet in file. */ \
+ _ (u32, max_packet_size_in_bytes) \
+ \
+ /* One of vnet_pcap_packet_type_t. */ \
+ _ (u32, packet_type)
+
+/* File header. */
+typedef struct {
+#define _(t, f) t f;
+ foreach_pcap_file_header
+#undef _
+} pcap_file_header_t;
+
+#define foreach_pcap_packet_header \
+ /* Time stamp in seconds and microseconds. */ \
+ _ (u32, time_in_sec) \
+ _ (u32, time_in_usec) \
+ \
+ /* Number of bytes stored in file and size of actual packet. */ \
+ _ (u32, n_packet_bytes_stored_in_file) \
+ _ (u32, n_bytes_in_packet)
+
+/* Packet header. */
+typedef struct {
+#define _(t, f) t f;
+ foreach_pcap_packet_header
+#undef _
+
+ /* Packet data follows. */
+ u8 data[0];
+} pcap_packet_header_t;
+
+typedef struct {
+ /* File name of pcap output. */
+ char * file_name;
+
+ /* Number of packets to capture. */
+ u32 n_packets_to_capture;
+
+ pcap_packet_type_t packet_type;
+
+ /* Number of packets currently captured. */
+ u32 n_packets_captured;
+
+ u32 flags;
+#define PCAP_MAIN_INIT_DONE (1 << 0)
+
+ /* File descriptor for reading/writing. */
+ int file_descriptor;
+
+ u32 n_pcap_data_written;
+
+ /* Vector of pcap data. */
+ u8 * pcap_data;
+
+ /* Packets read from file. */
+ u8 ** packets_read;
+
+ u32 min_packet_bytes, max_packet_bytes;
+} pcap_main_t;
+
+/* Write out data to output file. */
+clib_error_t * pcap_write (pcap_main_t * pm);
+
+clib_error_t * pcap_read (pcap_main_t * pm);
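+
+/* Editor's illustrative sketch: reading a capture back in, assuming a file
+   such as the one written by the usage example in pcap.c:
+
+     pcap_main_t pm = { .file_name = "/tmp/ip4" };
+     clib_error_t * e = pcap_read (&pm);
+     if (! e)
+       fformat (stdout, "%d packets, %d to %d bytes\n",
+                vec_len (pm.packets_read),
+                pm.min_packet_bytes, pm.max_packet_bytes);
+*/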
+
+static inline void *
+pcap_add_packet (pcap_main_t * pm,
+ f64 time_now,
+ u32 n_bytes_in_trace,
+ u32 n_bytes_in_packet)
+{
+ pcap_packet_header_t * h;
+ u8 * d;
+
+ vec_add2 (pm->pcap_data, d, sizeof (h[0]) + n_bytes_in_trace);
+ h = (void *) (d);
+ h->time_in_sec = time_now;
+ h->time_in_usec = 1e6*(time_now - h->time_in_sec);
+ h->n_packet_bytes_stored_in_file = n_bytes_in_trace;
+ h->n_bytes_in_packet = n_bytes_in_packet;
+ pm->n_packets_captured++;
+ return h->data;
+}
+
+static inline void
+pcap_add_buffer (pcap_main_t * pm,
+ vlib_main_t * vm, u32 buffer_index,
+ u32 n_bytes_in_trace)
+{
+ vlib_buffer_t * b = vlib_get_buffer (vm, buffer_index);
+ u32 n = vlib_buffer_length_in_chain (vm, b);
+ i32 n_left = clib_min (n_bytes_in_trace, n);
+ f64 time_now = vlib_time_now (vm);
+ void * d;
+
+ d = pcap_add_packet (pm, time_now, n_bytes_in_trace, n_left);
+ while (1)
+ {
+ memcpy (d, b->data + b->current_data, b->current_length);
+ n_left -= b->current_length;
+ if (n_left <= 0)
+ break;
+ d += b->current_length;
+ ASSERT (b->flags & VLIB_BUFFER_NEXT_PRESENT);
+ b = vlib_get_buffer (vm, b->next_buffer);
+ }
+
+ /* Flush output vector. */
+ if (vec_len (pm->pcap_data) >= 64*1024
+ || pm->n_packets_captured >= pm->n_packets_to_capture)
+ pcap_write (pm);
+}
+
+#endif /* included_vnet_pcap_h */
diff --git a/vnet/vnet/unix/pcap2pg.c b/vnet/vnet/unix/pcap2pg.c
new file mode 100644
index 00000000000..10b47c976dc
--- /dev/null
+++ b/vnet/vnet/unix/pcap2pg.c
@@ -0,0 +1,155 @@
+/*
+ * pcap2pg.c: convert pcap input to pg input
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/unix/pcap.h>
+#include <vnet/ethernet/packet.h>
+#include <stdio.h>
+
+pcap_main_t pcap_main;
+
+static char * pg_fmt =
+ "packet-generator new {\n"
+ " name s%d\n"
+ " limit 1\n"
+ " size %d-%d\n"
+ " node ethernet-input\n";
+
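+/* Editor's note (illustrative only): given pg_fmt above and the fformat
+   calls in pcap2pg() below, a generated stanza for a 98-byte IPv4 packet
+   would look roughly like:
+
+   packet-generator new {
+     name s0
+     limit 1
+     size 98-98
+     node ethernet-input
+     data {
+       0x0800: 0102.0304.0506 -> 0a0b.0c0d.0e0f
+       hex 0x...(remaining packet bytes)...
+     }
+   }
+*/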
+
+void stream_boilerplate (FILE *ofp, int i, u8 * pkt)
+{
+ fformat(ofp, pg_fmt, i, vec_len(pkt), vec_len(pkt));
+}
+
+int pcap2pg (pcap_main_t * pm, FILE *ofp)
+{
+ int i, j;
+ u8 *pkt;
+
+ for (i = 0; i < vec_len (pm->packets_read); i++)
+ {
+ int offset;
+ ethernet_header_t * h;
+ u64 ethertype;
+
+ pkt = pm->packets_read[i];
+ h = (ethernet_header_t *)pkt;
+
+ stream_boilerplate (ofp, i, pkt);
+
+ fformat (ofp, " data {\n");
+
+ ethertype = clib_net_to_host_u16 (h->type);
+
+ /*
+ * In vnet terms, packet generator interfaces are not ethernets.
+ * They don't have vlan tables.
+ * This dance transforms captured 802.1q VLAN packets into
+ * regular Ethernet packets.
+ */
+ if (ethertype == 0x8100 /* 802.1q vlan */)
+ {
+ u16 * vlan_ethertype = (u16 *)(h+1);
+ ethertype = clib_net_to_host_u16(vlan_ethertype[0]);
+ offset = 18;
+ }
+ else
+ offset = 14;
+
+ fformat (ofp,
+ " 0x%04x: %02x%02x.%02x%02x.%02x%02x"
+ " -> %02x%02x.%02x%02x.%02x%02x\n",
+ ethertype,
+ h->src_address[0],
+ h->src_address[1],
+ h->src_address[2],
+ h->src_address[3],
+ h->src_address[4],
+ h->src_address[5],
+ h->dst_address[0],
+ h->dst_address[1],
+ h->dst_address[2],
+ h->dst_address[3],
+ h->dst_address[4],
+ h->dst_address[5]);
+
+ fformat (ofp, " hex 0x");
+
+ for (j = offset; j < vec_len (pkt); j++)
+ fformat (ofp, "%02x", pkt[j]);
+
+ fformat (ofp, " }\n");
+ fformat (ofp, "}\n\n");
+ }
+ return 0;
+}
+
+int main (int argc, char **argv)
+{
+ unformat_input_t input;
+ pcap_main_t * pm = &pcap_main;
+ u8 * input_file = 0, * output_file = 0;
+ FILE * ofp;
+ clib_error_t * error;
+
+ unformat_init_command_line (&input, argv);
+
+ while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat(&input, "-i %s", &input_file)
+ || unformat (&input, "input %s", &input_file))
+ ;
+ else if (unformat (&input, "-o %s", &output_file)
+ || unformat (&input, "output %s", &output_file))
+ ;
+ else
+ {
+ usage:
+ fformat(stderr,
+ "usage: pcap2pg -i <input-file> [-o <output-file>]\n");
+ exit (1);
+ }
+ }
+
+ if (input_file == 0)
+ goto usage;
+
+ pm->file_name = (char *)input_file;
+ error = pcap_read (pm);
+
+ if (error)
+ {
+ clib_error_report (error);
+ exit (1);
+ }
+
+ if (output_file)
+ {
+      ofp = fopen ((char *)output_file, "w");
+      if (ofp == NULL)
+        {
+          clib_unix_warning ("Couldn't create '%s'", output_file);
+          exit (1);
+        }
+ }
+ else
+ {
+ ofp = stdout;
+ }
+
+ pcap2pg (pm, ofp);
+
+ fclose (ofp);
+ exit (0);
+}
diff --git a/vnet/vnet/unix/tapcli.c b/vnet/vnet/unix/tapcli.c
new file mode 100644
index 00000000000..44af321f796
--- /dev/null
+++ b/vnet/vnet/unix/tapcli.c
@@ -0,0 +1,1200 @@
+/*
+ *------------------------------------------------------------------
+ * tapcli.c - dynamic tap interface hookup
+ *
+ * Copyright (c) 2009 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <fcntl.h> /* for open */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h> /* for iovec */
+#include <netinet/in.h>
+
+#include <linux/if_arp.h>
+#include <linux/if_tun.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vnet/ethernet/ethernet.h>
+
+#if DPDK == 1
+#include <vnet/devices/dpdk/dpdk.h>
+#endif
+
+#include <vnet/unix/tapcli.h>
+
+static vnet_device_class_t tapcli_dev_class;
+static vnet_hw_interface_class_t tapcli_interface_class;
+
+static void tapcli_nopunt_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame);
+typedef struct {
+ u32 unix_fd;
+ u32 unix_file_index;
+ u32 provision_fd;
+ u32 sw_if_index; /* for counters */
+ u32 hw_if_index;
+ u32 is_promisc;
+ struct ifreq ifr;
+ u32 per_interface_next_index;
+ u8 active; /* for delete */
+} tapcli_interface_t;
+
+typedef struct {
+ /* Vector of iovecs for readv/writev calls. */
+ struct iovec * iovecs;
+
+ /* Vector of VLIB rx buffers to use. We allocate them in blocks
+ of VLIB_FRAME_SIZE (256). */
+ u32 * rx_buffers;
+
+ /* tap device destination MAC address. Required, or Linux drops pkts */
+ u8 ether_dst_mac[6];
+
+ /* Interface MTU in bytes and # of default sized buffers. */
+ u32 mtu_bytes, mtu_buffers;
+
+ /* Vector of tap interfaces */
+ tapcli_interface_t * tapcli_interfaces;
+
+ /* Vector of deleted tap interfaces */
+ u32 * tapcli_inactive_interfaces;
+
+ /* Bitmap of tap interfaces with pending reads */
+ uword * pending_read_bitmap;
+
+ /* Hash table to find tapcli interface given hw_if_index */
+ uword * tapcli_interface_index_by_sw_if_index;
+
+ /* Hash table to find tapcli interface given unix fd */
+ uword * tapcli_interface_index_by_unix_fd;
+
+ /* renumbering table */
+ u32 * show_dev_instance_by_real_dev_instance;
+
+ /* 1 => disable CLI */
+ int is_disabled;
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+ unix_main_t * unix_main;
+} tapcli_main_t;
+
+static tapcli_main_t tapcli_main;
+
+/*
+ * tapcli_tx
+ * Output node, writes the buffers comprising the incoming frame
+ * to the tun/tap device, aka hands them to the Linux kernel stack.
+ *
+ */
+static uword
+tapcli_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ uword n_packets = frame->n_vectors;
+ tapcli_main_t * tm = &tapcli_main;
+ tapcli_interface_t * ti;
+ int i;
+
+ for (i = 0; i < n_packets; i++)
+ {
+ struct iovec * iov;
+ vlib_buffer_t * b;
+ uword l;
+ vnet_hw_interface_t * hw;
+ uword * p;
+ u32 tx_sw_if_index;
+
+ b = vlib_get_buffer (vm, buffers[i]);
+
+ tx_sw_if_index = vnet_buffer(b)->sw_if_index[VLIB_TX];
+ if (tx_sw_if_index == (u32)~0)
+ tx_sw_if_index = vnet_buffer(b)->sw_if_index[VLIB_RX];
+
+ ASSERT(tx_sw_if_index != (u32)~0);
+
+ /* Use the sup intfc to finesse vlan subifs */
+ hw = vnet_get_sup_hw_interface (tm->vnet_main, tx_sw_if_index);
+ tx_sw_if_index = hw->sw_if_index;
+
+ p = hash_get (tm->tapcli_interface_index_by_sw_if_index,
+ tx_sw_if_index);
+ if (p == 0)
+ {
+ clib_warning ("sw_if_index %d unknown", tx_sw_if_index);
+ /* $$$ leak, but this should never happen... */
+ continue;
+ }
+ else
+ ti = vec_elt_at_index (tm->tapcli_interfaces, p[0]);
+
+ /* Re-set iovecs if present. */
+ if (tm->iovecs)
+ _vec_len (tm->iovecs) = 0;
+
+ /* VLIB buffer chain -> Unix iovec(s). */
+ vec_add2 (tm->iovecs, iov, 1);
+ iov->iov_base = b->data + b->current_data;
+ iov->iov_len = l = b->current_length;
+
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ do {
+ b = vlib_get_buffer (vm, b->next_buffer);
+
+ vec_add2 (tm->iovecs, iov, 1);
+
+ iov->iov_base = b->data + b->current_data;
+ iov->iov_len = b->current_length;
+ l += b->current_length;
+ } while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
+ }
+
+ if (writev (ti->unix_fd, tm->iovecs, vec_len (tm->iovecs)) < l)
+ clib_unix_warning ("writev");
+ }
+
+ /* interface output path flattens buffer chains */
+ vlib_buffer_free_no_next (vm, buffers, n_packets);
+
+ return n_packets;
+}
+
+VLIB_REGISTER_NODE (tapcli_tx_node,static) = {
+ .function = tapcli_tx,
+ .name = "tapcli-tx",
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .vector_size = 4,
+};
+
+enum {
+ TAPCLI_RX_NEXT_IP4_INPUT,
+ TAPCLI_RX_NEXT_IP6_INPUT,
+ TAPCLI_RX_NEXT_ETHERNET_INPUT,
+ TAPCLI_RX_NEXT_DROP,
+ TAPCLI_RX_N_NEXT,
+};
+
+static uword
+tapcli_rx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ tapcli_main_t * tm = &tapcli_main;
+ vlib_buffer_t * b;
+ u32 bi;
+#if DPDK == 0
+ const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+ u32 free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+#else
+ dpdk_main_t * dm = &dpdk_main;
+ const uword buffer_size = MBUF_SIZE;
+ u32 free_list_index = dm->vlib_buffer_free_list_index;
+#endif
+ static u32 * ready_interface_indices;
+ tapcli_interface_t * ti;
+ int i;
+
+ vec_reset_length (ready_interface_indices);
+
+ clib_bitmap_foreach (i, tm->pending_read_bitmap,
+ ({
+ vec_add1 (ready_interface_indices, i);
+ }));
+
+ if (vec_len (ready_interface_indices) == 0)
+ return 1;
+
+ for (i = 0; i < vec_len(ready_interface_indices); i++)
+ {
+ /* Clear the "interrupt" bit */
+ tm->pending_read_bitmap =
+ clib_bitmap_set (tm->pending_read_bitmap,
+ ready_interface_indices[i], 0);
+
+ ti = vec_elt_at_index (tm->tapcli_interfaces, ready_interface_indices[i]);
+
+ /* Make sure we have some RX buffers. */
+ {
+ uword n_left = vec_len (tm->rx_buffers);
+ uword n_alloc;
+
+ if (n_left < VLIB_FRAME_SIZE / 2)
+ {
+ if (! tm->rx_buffers)
+ vec_alloc (tm->rx_buffers, VLIB_FRAME_SIZE);
+
+ n_alloc = vlib_buffer_alloc_from_free_list
+ (vm, tm->rx_buffers + n_left, VLIB_FRAME_SIZE - n_left,
+ free_list_index);
+ _vec_len (tm->rx_buffers) = n_left + n_alloc;
+ }
+ }
+
+ /* Allocate RX buffers from end of rx_buffers.
+ Turn them into iovecs to pass to readv. */
+ {
+ uword i_rx = vec_len (tm->rx_buffers) - 1;
+ vlib_buffer_t * b;
+ word j, n_bytes_left, n_bytes_in_packet;
+#if DPDK == 1
+ u8 out_of_dpdk_buffers = 0;
+#endif
+
+ /* We need enough buffers left for an MTU sized packet. */
+ if (PREDICT_FALSE(vec_len (tm->rx_buffers) < tm->mtu_buffers))
+ {
+ clib_bitmap_set (tm->pending_read_bitmap,
+ ready_interface_indices[i], 1);
+ clib_warning ("buffer allocation failure");
+ continue;
+ }
+
+ vec_validate (tm->iovecs, tm->mtu_buffers - 1);
+ for (j = 0; j < tm->mtu_buffers; j++)
+ {
+ b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - j]);
+ tm->iovecs[j].iov_base = b->data;
+ tm->iovecs[j].iov_len = buffer_size;
+ }
+
+#if DPDK == 1
+ if (PREDICT_FALSE(out_of_dpdk_buffers == 1))
+ continue;
+#endif
+
+ n_bytes_left = readv (ti->unix_fd, tm->iovecs, tm->mtu_buffers);
+ n_bytes_in_packet = n_bytes_left;
+ if (n_bytes_left <= 0)
+ {
+ if (errno != EAGAIN)
+ clib_unix_warning ("readv %d", n_bytes_left);
+ return 0;
+ }
+
+ bi = tm->rx_buffers[i_rx];
+ while (1)
+ {
+ b = vlib_get_buffer (vm, tm->rx_buffers[i_rx]);
+
+ b->flags = 0;
+ b->current_data = 0;
+ b->current_length = n_bytes_left < buffer_size
+ ? n_bytes_left : buffer_size;
+
+ n_bytes_left -= buffer_size;
+
+ if (n_bytes_left <= 0)
+ {
+#if DPDK == 1
+ struct rte_mbuf *mb = (struct rte_mbuf *)(b - 1);
+ rte_pktmbuf_data_len (mb) = n_bytes_in_packet;
+ rte_pktmbuf_pkt_len (mb) = n_bytes_in_packet;
+#endif
+ break;
+ }
+
+ i_rx--;
+ b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b->next_buffer = tm->rx_buffers[i_rx];
+#if DPDK == 1
+ ASSERT(0); /* $$$$ fixme */
+ /* ((struct rte_pktmbuf *)(b->mb))->next =
+ vlib_get_buffer (vm, tm->rx_buffers[i_rx])->mb; */
+#endif
+ }
+
+ /* Interface counters for tapcli interface. */
+ vlib_increment_combined_counter
+ (vnet_main.interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ ti->sw_if_index,
+ 1, n_bytes_in_packet);
+
+ _vec_len (tm->rx_buffers) = i_rx;
+ }
+
+ b = vlib_get_buffer (vm, bi);
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited... See .../vlib/vlib/buffer.h
+ */
+ VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b);
+
+ {
+ u32 next_index;
+ uword n_trace = vlib_get_trace_count (vm, node);
+
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = ti->sw_if_index;
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ b->error = node->errors[0];
+
+ {
+ next_index = TAPCLI_RX_NEXT_ETHERNET_INPUT;
+
+ next_index = (ti->per_interface_next_index != ~0) ?
+ ti->per_interface_next_index : next_index;
+ }
+ {
+ vnet_main_t *vnm = vnet_get_main();
+ vnet_sw_interface_t * si;
+ si = vnet_get_sw_interface (vnm, ti->sw_if_index);
+ if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+ next_index = TAPCLI_RX_NEXT_DROP;
+ }
+
+
+ vlib_set_next_frame_buffer (vm, node, next_index, bi);
+
+ if (n_trace > 0)
+ {
+ vlib_trace_buffer (vm, node, next_index,
+ b, /* follow_chain */ 1);
+ vlib_set_trace_count (vm, node, n_trace - 1);
+ }
+ }
+ }
+
+ return 1;
+}
+
+static char * tapcli_rx_error_strings[] = {
+ "Interface down",
+};
+
+VLIB_REGISTER_NODE (tapcli_rx_node,static) = {
+ .function = tapcli_rx,
+ .name = "tapcli-rx",
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_INTERRUPT,
+ .vector_size = 4,
+ .n_errors = 1,
+ .error_strings = tapcli_rx_error_strings,
+
+ .n_next_nodes = TAPCLI_RX_N_NEXT,
+ .next_nodes = {
+ [TAPCLI_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+ [TAPCLI_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [TAPCLI_RX_NEXT_DROP] = "error-drop",
+ [TAPCLI_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ },
+};
+
+/* Gets called when file descriptor is ready from epoll. */
+static clib_error_t * tapcli_read_ready (unix_file_t * uf)
+{
+ vlib_main_t * vm = vlib_get_main();
+ tapcli_main_t * tm = &tapcli_main;
+ uword * p;
+
+ /* Schedule the rx node */
+ vlib_node_set_interrupt_pending (vm, tapcli_rx_node.index);
+
+ p = hash_get (tm->tapcli_interface_index_by_unix_fd, uf->file_descriptor);
+
+ /* Mark the specific tap interface ready-to-read */
+ if (p)
+ tm->pending_read_bitmap = clib_bitmap_set (tm->pending_read_bitmap,
+ p[0], 1);
+ else
+ clib_warning ("fd %d not in hash table", uf->file_descriptor);
+
+ return 0;
+}
+
+static clib_error_t *
+tapcli_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ tapcli_main_t *tm = &tapcli_main;
+#if DPDK == 0
+ const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+#else
+ const uword buffer_size = MBUF_SIZE;
+#endif
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "mtu %d", &tm->mtu_bytes))
+ ;
+ else if (unformat (input, "disable"))
+ tm->is_disabled = 1;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ if (tm->is_disabled)
+ return 0;
+
+ if (geteuid())
+ {
+ clib_warning ("tapcli disabled: must be superuser");
+ tm->is_disabled = 1;
+ return 0;
+ }
+
+ tm->mtu_buffers = (tm->mtu_bytes + (buffer_size - 1)) / buffer_size;
+
+ return 0;
+}
+
+static int tap_name_renumber (vnet_hw_interface_t * hi,
+ u32 new_dev_instance)
+{
+ tapcli_main_t *tm = &tapcli_main;
+
+ vec_validate_init_empty (tm->show_dev_instance_by_real_dev_instance,
+ hi->dev_instance, ~0);
+
+ tm->show_dev_instance_by_real_dev_instance [hi->dev_instance] =
+ new_dev_instance;
+
+ return 0;
+}
+
+VLIB_CONFIG_FUNCTION (tapcli_config, "tapcli");
+
+static void
+tapcli_nopunt_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ uword n_packets = frame->n_vectors;
+ vlib_buffer_free (vm, buffers, n_packets);
+ vlib_frame_free (vm, node, frame);
+}
+
+VNET_HW_INTERFACE_CLASS (tapcli_interface_class,static) = {
+ .name = "tapcli",
+};
+
+static u8 * format_tapcli_interface_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+ u32 show_dev_instance = ~0;
+ tapcli_main_t * tm = &tapcli_main;
+
+ if (i < vec_len (tm->show_dev_instance_by_real_dev_instance))
+ show_dev_instance = tm->show_dev_instance_by_real_dev_instance[i];
+
+ if (show_dev_instance != ~0)
+ i = show_dev_instance;
+
+ s = format (s, "tap-%d", i);
+ return s;
+}
+
+static u32 tapcli_flag_change (vnet_main_t * vnm,
+ vnet_hw_interface_t * hw,
+ u32 flags)
+{
+ tapcli_main_t *tm = &tapcli_main;
+ tapcli_interface_t *ti;
+ struct ifreq ifr;
+ u32 want_promisc;
+
+ ti = vec_elt_at_index (tm->tapcli_interfaces, hw->dev_instance);
+
+ memcpy (&ifr, &ti->ifr, sizeof (ifr));
+
+ /* get flags, modify to bring up interface... */
+ if (ioctl (ti->provision_fd, SIOCGIFFLAGS, &ifr) < 0)
+ {
+ clib_unix_warning ("Couldn't get interface flags for %s", hw->name);
+ return 0;
+ }
+
+ want_promisc = (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) != 0;
+
+ if (want_promisc == ti->is_promisc)
+ return 0;
+
+
+ if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL)
+ ifr.ifr_flags |= IFF_PROMISC;
+ else
+ ifr.ifr_flags &= ~(IFF_PROMISC);
+
+ /* get flags, modify to bring up interface... */
+ if (ioctl (ti->provision_fd, SIOCSIFFLAGS, &ifr) < 0)
+ {
+ clib_unix_warning ("Couldn't set interface flags for %s", hw->name);
+ return 0;
+ }
+
+ ti->is_promisc = want_promisc;
+
+ return 0;
+}
+
+static void tapcli_set_interface_next_node (vnet_main_t *vnm,
+ u32 hw_if_index,
+ u32 node_index)
+{
+ tapcli_main_t *tm = &tapcli_main;
+ tapcli_interface_t *ti;
+ vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+
+ ti = vec_elt_at_index (tm->tapcli_interfaces, hw->dev_instance);
+
+ /* Shut off redirection */
+ if (node_index == ~0)
+ {
+ ti->per_interface_next_index = node_index;
+ return;
+ }
+
+ ti->per_interface_next_index =
+ vlib_node_add_next (tm->vlib_main, tapcli_rx_node.index, node_index);
+}
+
+/*
+ * Mainly exists to set link_state == admin_state;
+ * otherwise, e.g., ip6 neighbor discovery breaks.
+ */
+static clib_error_t *
+tapcli_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
+{
+ uword is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;
+ u32 hw_flags;
+ u32 speed_duplex = VNET_HW_INTERFACE_FLAG_FULL_DUPLEX
+ | VNET_HW_INTERFACE_FLAG_SPEED_1G;
+
+ if (is_admin_up)
+ hw_flags = VNET_HW_INTERFACE_FLAG_LINK_UP | speed_duplex;
+ else
+ hw_flags = speed_duplex;
+
+ vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags);
+ return 0;
+}
+
+VNET_DEVICE_CLASS (tapcli_dev_class,static) = {
+ .name = "tapcli",
+ .tx_function = tapcli_tx,
+ .format_device_name = format_tapcli_interface_name,
+ .rx_redirect_to_node = tapcli_set_interface_next_node,
+ .name_renumber = tap_name_renumber,
+ .admin_up_down_function = tapcli_interface_admin_up_down,
+};
+
+int vnet_tap_dump_ifs (tapcli_interface_details_t **out_tapids)
+{
+ tapcli_main_t * tm = &tapcli_main;
+ tapcli_interface_t * ti;
+
+ tapcli_interface_details_t * r_tapids = NULL;
+ tapcli_interface_details_t * tapid = NULL;
+
+ vec_foreach (ti, tm->tapcli_interfaces) {
+ if (!ti->active)
+ continue;
+ vec_add2(r_tapids, tapid, 1);
+ tapid->sw_if_index = ti->sw_if_index;
+ strncpy((char *)tapid->dev_name, ti->ifr.ifr_name, sizeof (ti->ifr.ifr_name)-1);
+ }
+
+ *out_tapids = r_tapids;
+
+ return 0;
+}
+
+/* get tap interface from inactive interfaces or create new */
+static tapcli_interface_t *tapcli_get_new_tapif()
+{
+ tapcli_main_t * tm = &tapcli_main;
+ tapcli_interface_t *ti = NULL;
+
+ int inactive_cnt = vec_len(tm->tapcli_inactive_interfaces);
+ // if there are any inactive ifaces
+ if (inactive_cnt > 0) {
+ // take last
+ u32 ti_idx = tm->tapcli_inactive_interfaces[inactive_cnt - 1];
+ if (vec_len(tm->tapcli_interfaces) > ti_idx) {
+ ti = vec_elt_at_index (tm->tapcli_interfaces, ti_idx);
+ clib_warning("reusing tap interface");
+ }
+ // "remove" from inactive list
+ _vec_len(tm->tapcli_inactive_interfaces) -= 1;
+ }
+
+ // ti was not retrieved from inactive ifaces - create new
+ if (!ti)
+ vec_add2 (tm->tapcli_interfaces, ti, 1);
+
+ return ti;
+}
+
+int vnet_tap_connect (vlib_main_t * vm, u8 * intfc_name, u8 *hwaddr_arg,
+ u32 * sw_if_indexp)
+{
+ tapcli_main_t * tm = &tapcli_main;
+ tapcli_interface_t * ti = NULL;
+ struct ifreq ifr;
+ int flags;
+ int dev_net_tun_fd;
+ int dev_tap_fd = -1;
+ clib_error_t * error;
+ u8 hwaddr [6];
+ int rv = 0;
+
+ if (tm->is_disabled)
+ {
+ return VNET_API_ERROR_FEATURE_DISABLED;
+ }
+
+ flags = IFF_TAP | IFF_NO_PI;
+
+ if ((dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0)
+ return VNET_API_ERROR_SYSCALL_ERROR_1;
+
+ memset (&ifr, 0, sizeof (ifr));
+ strncpy(ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1);
+ ifr.ifr_flags = flags;
+ if (ioctl (dev_net_tun_fd, TUNSETIFF, (void *)&ifr) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_2;
+ goto error;
+ }
+
+ /* Open a provisioning socket */
+ if ((dev_tap_fd = socket(PF_PACKET, SOCK_RAW,
+ htons(ETH_P_ALL))) < 0 )
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_3;
+ goto error;
+ }
+
+ /* Find the interface index. */
+ {
+ struct ifreq ifr;
+ struct sockaddr_ll sll;
+
+ memset (&ifr, 0, sizeof(ifr));
+ strncpy (ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1);
+ if (ioctl (dev_tap_fd, SIOCGIFINDEX, &ifr) < 0 )
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_4;
+ goto error;
+ }
+
+ /* Bind the provisioning socket to the interface. */
+ memset(&sll, 0, sizeof(sll));
+ sll.sll_family = AF_PACKET;
+ sll.sll_ifindex = ifr.ifr_ifindex;
+ sll.sll_protocol = htons(ETH_P_ALL);
+
+ if (bind(dev_tap_fd, (struct sockaddr*) &sll, sizeof(sll)) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_5;
+ goto error;
+ }
+ }
+
+ /* non-blocking I/O on /dev/tapX */
+ {
+ int one = 1;
+ if (ioctl (dev_net_tun_fd, FIONBIO, &one) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_6;
+ goto error;
+ }
+ }
+ ifr.ifr_mtu = tm->mtu_bytes;
+ if (ioctl (dev_tap_fd, SIOCSIFMTU, &ifr) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_7;
+ goto error;
+ }
+
+ /* get flags, modify to bring up interface... */
+ if (ioctl (dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_8;
+ goto error;
+ }
+
+ ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+
+ if (ioctl (dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_9;
+ goto error;
+ }
+
+ if (ioctl (dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0)
+ {
+ rv = VNET_API_ERROR_SYSCALL_ERROR_1;
+ goto error;
+ }
+
+ ti = tapcli_get_new_tapif();
+
+ if (hwaddr_arg != 0)
+ memcpy(hwaddr, hwaddr_arg, 6);
+
+ error = ethernet_register_interface
+ (tm->vnet_main,
+ tapcli_dev_class.index,
+ ti - tm->tapcli_interfaces /* device instance */,
+ hwaddr_arg != 0 ? hwaddr :
+ (u8 *) ifr.ifr_hwaddr.sa_data /* ethernet address */,
+ &ti->hw_if_index,
+ tapcli_flag_change);
+
+ if (error)
+ {
+ clib_error_report (error);
+ rv = VNET_API_ERROR_INVALID_REGISTRATION;
+ goto error;
+ }
+
+ {
+ unix_file_t template = {0};
+ template.read_function = tapcli_read_ready;
+ template.file_descriptor = dev_net_tun_fd;
+ ti->unix_file_index = unix_file_add (&unix_main, &template);
+ ti->unix_fd = dev_net_tun_fd;
+ ti->provision_fd = dev_tap_fd;
+ memcpy (&ti->ifr, &ifr, sizeof (ifr));
+ }
+
+ {
+ vnet_hw_interface_t * hw;
+ hw = vnet_get_hw_interface (tm->vnet_main, ti->hw_if_index);
+ ti->sw_if_index = hw->sw_if_index;
+ if (sw_if_indexp)
+ *sw_if_indexp = hw->sw_if_index;
+ }
+
+ ti->active = 1;
+
+ hash_set (tm->tapcli_interface_index_by_sw_if_index, ti->sw_if_index,
+ ti - tm->tapcli_interfaces);
+
+ hash_set (tm->tapcli_interface_index_by_unix_fd, ti->unix_fd,
+ ti - tm->tapcli_interfaces);
+
+ return rv;
+
+ error:
+ close (dev_net_tun_fd);
+  if (dev_tap_fd >= 0)
+    close (dev_tap_fd);
+
+ return rv;
+}
+
+int vnet_tap_connect_renumber (vlib_main_t * vm, u8 * intfc_name,
+ u8 *hwaddr_arg, u32 * sw_if_indexp,
+ u8 renumber, u32 custom_dev_instance)
+{
+ int rv = vnet_tap_connect(vm, intfc_name, hwaddr_arg, sw_if_indexp);
+
+ if (!rv && renumber)
+ vnet_interface_name_renumber (*sw_if_indexp, custom_dev_instance);
+
+ return rv;
+}
+
+static int tapcli_tap_disconnect (tapcli_interface_t *ti)
+{
+ int rv = 0;
+ vnet_main_t * vnm = vnet_get_main();
+ tapcli_main_t * tm = &tapcli_main;
+ u32 sw_if_index = ti->sw_if_index;
+
+ // bring interface down
+ vnet_sw_interface_set_flags (vnm, sw_if_index, 0);
+
+ if (ti->unix_file_index != ~0) {
+ unix_file_del (&unix_main, unix_main.file_pool + ti->unix_file_index);
+ ti->unix_file_index = ~0;
+ }
+
+ hash_unset (tm->tapcli_interface_index_by_unix_fd, ti->unix_fd);
+ hash_unset (tm->tapcli_interface_index_by_sw_if_index, ti->sw_if_index);
+ close(ti->unix_fd);
+ close(ti->provision_fd);
+ ti->unix_fd = -1;
+ ti->provision_fd = -1;
+
+ return rv;
+}
+
+int vnet_tap_delete(vlib_main_t *vm, u32 sw_if_index)
+{
+ int rv = 0;
+ tapcli_main_t * tm = &tapcli_main;
+ tapcli_interface_t *ti;
+ uword *p = NULL;
+
+ p = hash_get (tm->tapcli_interface_index_by_sw_if_index,
+ sw_if_index);
+ if (p == 0) {
+ clib_warning ("sw_if_index %d unknown", sw_if_index);
+ return VNET_API_ERROR_INVALID_SW_IF_INDEX;
+ }
+ ti = vec_elt_at_index (tm->tapcli_interfaces, p[0]);
+
+ // inactive
+ ti->active = 0;
+ tapcli_tap_disconnect(ti);
+ // add to inactive list
+ vec_add1(tm->tapcli_inactive_interfaces, ti - tm->tapcli_interfaces);
+
+ // reset renumbered iface
+ if (p[0] < vec_len (tm->show_dev_instance_by_real_dev_instance))
+ tm->show_dev_instance_by_real_dev_instance[p[0]] = ~0;
+
+ ethernet_delete_interface (tm->vnet_main, ti->hw_if_index);
+ return rv;
+}
+
+static clib_error_t *
+tap_delete_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ tapcli_main_t * tm = &tapcli_main;
+ u32 sw_if_index = ~0;
+
+ if (tm->is_disabled)
+ {
+ return clib_error_return (0, "device disabled...");
+ }
+
+ if (unformat (input, "%U", unformat_vnet_sw_interface, tm->vnet_main,
+ &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+
+ int rc = vnet_tap_delete (vm, sw_if_index);
+
+ if (!rc) {
+ vlib_cli_output (vm, "Deleted.");
+ } else {
+ vlib_cli_output (vm, "Error during deletion of tap interface. (rc: %d)", rc);
+ }
+
+ return 0;
+}
+
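+/* CLI usage, e.g. "tap delete tapcli-0" (interface name illustrative) */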
+VLIB_CLI_COMMAND (tap_delete_command, static) = {
+ .path = "tap delete",
+ .short_help = "tap delete <vpp-tap-intfc-name>",
+ .function = tap_delete_command_fn,
+};
+
+/* modifies tap interface - can result in new interface being created */
+int vnet_tap_modify (vlib_main_t * vm, u32 orig_sw_if_index,
+ u8 * intfc_name, u8 *hwaddr_arg,
+ u32 * sw_if_indexp,
+ u8 renumber, u32 custom_dev_instance)
+{
+ int rv = vnet_tap_delete (vm, orig_sw_if_index);
+
+ if (rv)
+ return rv;
+
+ rv = vnet_tap_connect_renumber(vm, intfc_name, hwaddr_arg, sw_if_indexp,
+ renumber, custom_dev_instance);
+
+ return rv;
+}
+
+static clib_error_t *
+tap_modify_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u8 * intfc_name;
+ tapcli_main_t * tm = &tapcli_main;
+ u32 sw_if_index = ~0;
+ u32 new_sw_if_index = ~0;
+ int user_hwaddr = 0;
+ u8 hwaddr[6];
+
+ if (tm->is_disabled)
+ {
+ return clib_error_return (0, "device disabled...");
+ }
+
+ if (unformat (input, "%U", unformat_vnet_sw_interface, tm->vnet_main,
+ &sw_if_index))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ if (unformat (input, "%s", &intfc_name))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ if (unformat(input, "hwaddr %U", unformat_ethernet_address,
+ &hwaddr))
+ user_hwaddr = 1;
+
+
+ int rc = vnet_tap_modify (vm, sw_if_index, intfc_name,
+ (user_hwaddr == 1 ? hwaddr : 0),
+ &new_sw_if_index, 0, 0);
+
+ if (!rc) {
+ vlib_cli_output (vm, "Modified %U for Linux tap '%s'",
+ format_vnet_sw_if_index_name, tm->vnet_main,
+ new_sw_if_index, intfc_name);
+ } else {
+ vlib_cli_output (vm, "Error during modification of tap interface. (rc: %d)", rc);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (tap_modify_command, static) = {
+ .path = "tap modify",
+ .short_help = "tap modify <vpp-tap-intfc-name> <linux-intfc-name> [hwaddr [<addr> | random]]",
+ .function = tap_modify_command_fn,
+};
+
+static clib_error_t *
+tap_connect_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ u8 * intfc_name;
+ tapcli_main_t * tm = &tapcli_main;
+ tapcli_interface_t * ti;
+ struct ifreq ifr;
+ int flags;
+ int dev_net_tun_fd;
+ int dev_tap_fd = -1;
+ clib_error_t * error;
+ int user_hwaddr = 0;
+ u8 hwaddr[6];
+
+ if (tm->is_disabled)
+ {
+ return clib_error_return (0, "device disabled...");
+ }
+
+ if (unformat (input, "%s", &intfc_name))
+ ;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ if (unformat(input, "hwaddr %U", unformat_ethernet_address,
+ &hwaddr))
+ user_hwaddr = 1;
+
+ flags = IFF_TAP | IFF_NO_PI;
+
+ if ((dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0)
+ {
+ vlib_cli_output (vm, "Couldn't open /dev/net/tun");
+ return 0;
+ }
+
+ memset (&ifr, 0, sizeof (ifr));
+ strncpy(ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1);
+ ifr.ifr_flags = flags;
+ if (ioctl (dev_net_tun_fd, TUNSETIFF, (void *)&ifr) < 0)
+ {
+ vlib_cli_output (vm, "Error setting flags on '%s'", intfc_name);
+ goto error;
+ }
+
+ /* Open a provisioning socket */
+ if ((dev_tap_fd = socket(PF_PACKET, SOCK_RAW,
+ htons(ETH_P_ALL))) < 0 )
+ {
+ vlib_cli_output (vm, "Couldn't open provisioning socket");
+ goto error;
+ }
+
+ /* Find the interface index. */
+ {
+ struct ifreq ifr;
+ struct sockaddr_ll sll;
+
+ memset (&ifr, 0, sizeof(ifr));
+ strncpy (ifr.ifr_name, (char *) intfc_name, sizeof (ifr.ifr_name)-1);
+ if (ioctl (dev_tap_fd, SIOCGIFINDEX, &ifr) < 0 )
+ {
+ vlib_cli_output (vm, "Couldn't get if_index");
+ goto error;
+ }
+
+ /* Bind the provisioning socket to the interface. */
+ memset(&sll, 0, sizeof(sll));
+ sll.sll_family = AF_PACKET;
+ sll.sll_ifindex = ifr.ifr_ifindex;
+ sll.sll_protocol = htons(ETH_P_ALL);
+
+ if (bind(dev_tap_fd, (struct sockaddr*) &sll, sizeof(sll)) < 0)
+ {
+ vlib_cli_output (vm, "Couldn't bind provisioning socket");
+ goto error;
+ }
+ }
+
+ /* non-blocking I/O on /dev/tapX */
+ {
+ int one = 1;
+ if (ioctl (dev_net_tun_fd, FIONBIO, &one) < 0)
+ {
+ vlib_cli_output (0, "Couldn't set device non-blocking flag");
+ goto error;
+ }
+ }
+ ifr.ifr_mtu = tm->mtu_bytes;
+ if (ioctl (dev_tap_fd, SIOCSIFMTU, &ifr) < 0)
+ {
+ vlib_cli_output (0, "Couldn't set device MTU");
+ goto error;
+ }
+
+ /* get flags, modify to bring up interface... */
+ if (ioctl (dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0)
+ {
+ vlib_cli_output (0, "Couldn't get interface flags");
+ goto error;
+ }
+
+ ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+
+ if (ioctl (dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0)
+ {
+ vlib_cli_output (0, "Couldn't set intfc admin state up");
+ goto error;
+ }
+
+ if (ioctl (dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0)
+ {
+ vlib_cli_output (0, "Couldn't get intfc MAC address");
+ goto error;
+ }
+
+ ti = tapcli_get_new_tapif();
+ ti->per_interface_next_index = ~0;
+
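+  /* "hwaddr random": build a locally administered unicast MAC
+     (02:fe:xx:xx:xx:xx) from a time-seeded random value */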
+ if (unformat(input, "hwaddr random"))
+ {
+ f64 now = vlib_time_now(vm);
+ u32 rnd;
+ rnd = (u32) (now * 1e6);
+ rnd = random_u32 (&rnd);
+
+ memcpy (hwaddr+2, &rnd, sizeof(rnd));
+ hwaddr[0] = 2;
+ hwaddr[1] = 0xfe;
+ user_hwaddr = 1;
+ }
+
+ error = ethernet_register_interface
+ (tm->vnet_main,
+ tapcli_dev_class.index,
+ ti - tm->tapcli_interfaces /* device instance */,
+ user_hwaddr ? hwaddr :
+ (u8 *) ifr.ifr_hwaddr.sa_data /* ethernet address */,
+ &ti->hw_if_index,
+ tapcli_flag_change);
+
+ if (error)
+ clib_error_report (error);
+
+ {
+ unix_file_t template = {0};
+ template.read_function = tapcli_read_ready;
+ template.file_descriptor = dev_net_tun_fd;
+ ti->unix_file_index = unix_file_add (&unix_main, &template);
+ ti->unix_fd = dev_net_tun_fd;
+ ti->provision_fd = dev_tap_fd;
+ memcpy (&ti->ifr, &ifr, sizeof (ifr));
+ }
+
+ {
+ vnet_hw_interface_t * hw;
+ hw = vnet_get_hw_interface (tm->vnet_main, ti->hw_if_index);
+ ti->sw_if_index = hw->sw_if_index;
+ }
+
+ ti->active = 1;
+
+ hash_set (tm->tapcli_interface_index_by_sw_if_index, ti->sw_if_index,
+ ti - tm->tapcli_interfaces);
+
+ hash_set (tm->tapcli_interface_index_by_unix_fd, ti->unix_fd,
+ ti - tm->tapcli_interfaces);
+
+ vlib_cli_output (vm, "Created %U for Linux tap '%s'",
+ format_vnet_sw_if_index_name, tm->vnet_main,
+ ti->sw_if_index, intfc_name);
+
+ return 0;
+
+ error:
+ close (dev_net_tun_fd);
+  if (dev_tap_fd >= 0)
+    close (dev_tap_fd);
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (tap_connect_command, static) = {
+ .path = "tap connect",
+ .short_help = "tap connect <intfc-name> [hwaddr [<addr> | random]]",
+ .function = tap_connect_command_fn,
+};
+
+clib_error_t *
+tapcli_init (vlib_main_t * vm)
+{
+ tapcli_main_t * tm = &tapcli_main;
+
+ tm->vlib_main = vm;
+ tm->vnet_main = vnet_get_main();
+ tm->unix_main = &unix_main;
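+  /* Default MTU; matches the tuntap driver default of 4096 + 256 bytes */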
+ tm->mtu_bytes = 4096 + 256;
+ tm->tapcli_interface_index_by_sw_if_index = hash_create (0, sizeof(uword));
+ tm->tapcli_interface_index_by_unix_fd = hash_create (0, sizeof (uword));
+ vm->os_punt_frame = tapcli_nopunt_frame;
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (tapcli_init);
+
+
diff --git a/vnet/vnet/unix/tapcli.h b/vnet/vnet/unix/tapcli.h
new file mode 100644
index 00000000000..1f5f4c3ee73
--- /dev/null
+++ b/vnet/vnet/unix/tapcli.h
@@ -0,0 +1,29 @@
+/*
+ * tapcli.h : tap support
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_tapcli_h__
+#define __included_tapcli_h__
+
+
+typedef struct {
+ u32 sw_if_index;
+ u8 dev_name[64];
+} tapcli_interface_details_t;
+
+int vnet_tap_dump_ifs (tapcli_interface_details_t **out_tapids);
+
+#endif /* __included_tapcli_h__ */
diff --git a/vnet/vnet/unix/tuntap.c b/vnet/vnet/unix/tuntap.c
new file mode 100644
index 00000000000..77c60fd6ee2
--- /dev/null
+++ b/vnet/vnet/unix/tuntap.c
@@ -0,0 +1,907 @@
+/*
+ *------------------------------------------------------------------
+ * tuntap.c - kernel stack (reverse) punt/inject path
+ *
+ * Copyright (c) 2009 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <fcntl.h> /* for open */
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h> /* for iovec */
+#include <netinet/in.h>
+
+#include <linux/if_arp.h>
+#include <linux/if_tun.h>
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <vnet/ip/ip.h>
+
+#include <vnet/ethernet/ethernet.h>
+
+#if DPDK == 1
+#include <vnet/devices/dpdk/dpdk.h>
+#endif
+
+static vnet_device_class_t tuntap_dev_class;
+static vnet_hw_interface_class_t tuntap_interface_class;
+
+static void tuntap_punt_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame);
+static void tuntap_nopunt_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame);
+
+/*
+ * This driver runs in one of two distinct modes:
+ * "punt/inject" mode, where we send pkts not otherwise processed
+ * by the forwarding to the Linux kernel stack, and
+ * "normal interface" mode, where we treat the Linux kernel stack
+ * as a peer.
+ *
+ * By default, we select punt/inject mode.
+ */
+
+typedef struct {
+ u32 sw_if_index;
+ u8 is_v6;
+ u8 addr[16];
+} subif_address_t;
+
+typedef struct {
+ /* Vector of iovecs for readv/writev calls. */
+ struct iovec * iovecs;
+
+ /* Vector of VLIB rx buffers to use. We allocate them in blocks
+ of VLIB_FRAME_SIZE (256). */
+ u32 * rx_buffers;
+
+ /* File descriptors for /dev/net/tun and provisioning socket. */
+ int dev_net_tun_fd, dev_tap_fd;
+
+ /* Create a "tap" [ethernet] encaps device */
+ int is_ether;
+
+ /* 1 if a "normal" routed intfc, 0 if a punt/inject interface */
+
+ int have_normal_interface;
+
+ /* tap device destination MAC address. Required, or Linux drops pkts */
+ u8 ether_dst_mac[6];
+
+ /* Interface MTU in bytes and # of default sized buffers. */
+ u32 mtu_bytes, mtu_buffers;
+
+ /* Linux interface name for tun device. */
+ char * tun_name;
+
+ /* Pool of subinterface addresses */
+ subif_address_t *subifs;
+
+ /* Hash for subif addresses */
+ mhash_t subif_mhash;
+
+ u32 unix_file_index;
+
+ /* For the "normal" interface, if configured */
+ u32 hw_if_index, sw_if_index;
+
+} tuntap_main_t;
+
+static tuntap_main_t tuntap_main = {
+ .tun_name = "vnet",
+
+ /* Suitable defaults for an Ethernet-like tun/tap device */
+ .mtu_bytes = 4096 + 256,
+};
+
+/*
+ * tuntap_tx
+ * Output node, writes the buffers comprising the incoming frame
+ * to the tun/tap device, aka hands them to the Linux kernel stack.
+ *
+ */
+static uword
+tuntap_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ uword n_packets = frame->n_vectors;
+ tuntap_main_t * tm = &tuntap_main;
+ int i;
+
+ for (i = 0; i < n_packets; i++)
+ {
+ struct iovec * iov;
+ vlib_buffer_t * b;
+ uword l;
+
+ b = vlib_get_buffer (vm, buffers[i]);
+
+ if (tm->is_ether && (!tm->have_normal_interface))
+ {
+ vlib_buffer_reset(b);
+ memcpy (vlib_buffer_get_current (b), tm->ether_dst_mac, 6);
+ }
+
+ /* Re-set iovecs if present. */
+ if (tm->iovecs)
+ _vec_len (tm->iovecs) = 0;
+
+ /* VLIB buffer chain -> Unix iovec(s). */
+ vec_add2 (tm->iovecs, iov, 1);
+ iov->iov_base = b->data + b->current_data;
+ iov->iov_len = l = b->current_length;
+
+ if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ do {
+ b = vlib_get_buffer (vm, b->next_buffer);
+
+ vec_add2 (tm->iovecs, iov, 1);
+
+ iov->iov_base = b->data + b->current_data;
+ iov->iov_len = b->current_length;
+ l += b->current_length;
+ } while (b->flags & VLIB_BUFFER_NEXT_PRESENT);
+ }
+
+ if (writev (tm->dev_net_tun_fd, tm->iovecs, vec_len (tm->iovecs)) < l)
+ clib_unix_warning ("writev");
+ }
+
+ /* The normal interface path flattens the buffer chain */
+ if (tm->have_normal_interface)
+ vlib_buffer_free_no_next (vm, buffers, n_packets);
+ else
+ vlib_buffer_free (vm, buffers, n_packets);
+
+ return n_packets;
+}
+
+VLIB_REGISTER_NODE (tuntap_tx_node,static) = {
+ .function = tuntap_tx,
+ .name = "tuntap-tx",
+ .type = VLIB_NODE_TYPE_INTERNAL,
+ .vector_size = 4,
+};
+
+enum {
+ TUNTAP_RX_NEXT_IP4_INPUT,
+ TUNTAP_RX_NEXT_IP6_INPUT,
+ TUNTAP_RX_NEXT_ETHERNET_INPUT,
+ TUNTAP_RX_NEXT_DROP,
+ TUNTAP_RX_N_NEXT,
+};
+
+static uword
+tuntap_rx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ tuntap_main_t * tm = &tuntap_main;
+ vlib_buffer_t * b;
+ u32 bi;
+#if DPDK == 0
+ const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+ u32 free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+#else
+ dpdk_main_t * dm = &dpdk_main;
+ const uword buffer_size = MBUF_SIZE;
+ u32 free_list_index = dm->vlib_buffer_free_list_index;
+#endif
+
+ /* Make sure we have some RX buffers. */
+ {
+ uword n_left = vec_len (tm->rx_buffers);
+ uword n_alloc;
+
+ if (n_left < VLIB_FRAME_SIZE / 2)
+ {
+ if (! tm->rx_buffers)
+ vec_alloc (tm->rx_buffers, VLIB_FRAME_SIZE);
+
+ n_alloc = vlib_buffer_alloc_from_free_list
+ (vm, tm->rx_buffers + n_left, VLIB_FRAME_SIZE - n_left,
+ free_list_index);
+ _vec_len (tm->rx_buffers) = n_left + n_alloc;
+ }
+ }
+
+ /* Allocate RX buffers from end of rx_buffers.
+ Turn them into iovecs to pass to readv. */
+ {
+ uword i_rx = vec_len (tm->rx_buffers) - 1;
+ vlib_buffer_t * b;
+ word i, n_bytes_left, n_bytes_in_packet;
+
+ /* We should have enough buffers left for an MTU sized packet. */
+ ASSERT (vec_len (tm->rx_buffers) >= tm->mtu_buffers);
+
+ vec_validate (tm->iovecs, tm->mtu_buffers - 1);
+ for (i = 0; i < tm->mtu_buffers; i++)
+ {
+ b = vlib_get_buffer (vm, tm->rx_buffers[i_rx - i]);
+ tm->iovecs[i].iov_base = b->data;
+ tm->iovecs[i].iov_len = buffer_size;
+ }
+
+ n_bytes_left = readv (tm->dev_net_tun_fd, tm->iovecs, tm->mtu_buffers);
+ n_bytes_in_packet = n_bytes_left;
+ if (n_bytes_left <= 0)
+ {
+ if (errno != EAGAIN)
+ clib_unix_warning ("readv %d", n_bytes_left);
+ return 0;
+ }
+
+ bi = tm->rx_buffers[i_rx];
+
+ while (1)
+ {
+#if DPDK == 1
+ struct rte_mbuf * mb;
+#endif
+ b = vlib_get_buffer (vm, tm->rx_buffers[i_rx]);
+#if DPDK == 1
+ mb = (((struct rte_mbuf *)b)-1);
+#endif
+ b->flags = 0;
+ b->current_data = 0;
+ b->current_length = n_bytes_left < buffer_size ? n_bytes_left : buffer_size;
+
+ n_bytes_left -= buffer_size;
+#if DPDK == 1
+ rte_pktmbuf_data_len (mb) = b->current_length;
+#endif
+
+ if (n_bytes_left <= 0)
+ {
+#if DPDK == 1
+ rte_pktmbuf_pkt_len (mb) = n_bytes_in_packet;
+#endif
+ break;
+ }
+
+ i_rx--;
+ b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+ b->next_buffer = tm->rx_buffers[i_rx];
+#if DPDK == 1
+ ASSERT(0);
+ // ((struct rte_pktmbuf *)(b->mb))->next =
+ // vlib_get_buffer (vm, tm->rx_buffers[i_rx])->mb;
+#endif
+ }
+
+ /* Interface counters for tuntap interface. */
+ vlib_increment_combined_counter
+ (vnet_main.interface_main.combined_sw_if_counters
+ + VNET_INTERFACE_COUNTER_RX,
+ os_get_cpu_number(),
+ tm->sw_if_index,
+ 1, n_bytes_in_packet);
+
+ _vec_len (tm->rx_buffers) = i_rx;
+ }
+
+ b = vlib_get_buffer (vm, bi);
+
+ {
+ u32 next_index;
+ uword n_trace = vlib_get_trace_count (vm, node);
+
+ vnet_buffer (b)->sw_if_index[VLIB_RX] = tm->sw_if_index;
+ vnet_buffer (b)->sw_if_index[VLIB_TX] = (u32)~0;
+
+ /*
+ * Turn this on if you run into
+ * "bad monkey" contexts, and you want to know exactly
+ * which nodes they've visited...
+ */
+ if (VLIB_BUFFER_TRACE_TRAJECTORY)
+ b->pre_data[0] = 0;
+
+ b->error = node->errors[0];
+
+ if (tm->is_ether)
+ {
+ next_index = TUNTAP_RX_NEXT_ETHERNET_INPUT;
+ }
+ else
+ switch (b->data[0] & 0xf0)
+ {
+ case 0x40:
+ next_index = TUNTAP_RX_NEXT_IP4_INPUT;
+ break;
+ case 0x60:
+ next_index = TUNTAP_RX_NEXT_IP6_INPUT;
+ break;
+ default:
+ next_index = TUNTAP_RX_NEXT_DROP;
+ break;
+ }
+
+    /* The Linux kernel couldn't care less if our interface is up */
+ if (tm->have_normal_interface)
+ {
+ vnet_main_t *vnm = vnet_get_main();
+ vnet_sw_interface_t * si;
+ si = vnet_get_sw_interface (vnm, tm->sw_if_index);
+ if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP))
+ next_index = TUNTAP_RX_NEXT_DROP;
+ }
+
+ vlib_set_next_frame_buffer (vm, node, next_index, bi);
+
+ if (n_trace > 0)
+ {
+ vlib_trace_buffer (vm, node, next_index,
+ b, /* follow_chain */ 1);
+ vlib_set_trace_count (vm, node, n_trace - 1);
+ }
+ }
+
+ return 1;
+}
+
+static char * tuntap_rx_error_strings[] = {
+ "unknown packet type",
+};
+
+VLIB_REGISTER_NODE (tuntap_rx_node,static) = {
+ .function = tuntap_rx,
+ .name = "tuntap-rx",
+ .type = VLIB_NODE_TYPE_INPUT,
+ .state = VLIB_NODE_STATE_INTERRUPT,
+ .vector_size = 4,
+ .n_errors = 1,
+ .error_strings = tuntap_rx_error_strings,
+
+ .n_next_nodes = TUNTAP_RX_N_NEXT,
+ .next_nodes = {
+ [TUNTAP_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
+ [TUNTAP_RX_NEXT_IP6_INPUT] = "ip6-input",
+ [TUNTAP_RX_NEXT_DROP] = "error-drop",
+ [TUNTAP_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ },
+};
+
+/* Gets called when file descriptor is ready from epoll. */
+static clib_error_t * tuntap_read_ready (unix_file_t * uf)
+{
+ vlib_main_t * vm = vlib_get_main();
+ vlib_node_set_interrupt_pending (vm, tuntap_rx_node.index);
+ return 0;
+}
+
+/*
+ * tuntap_exit
+ * Clean up the tun/tap device
+ */
+
+static clib_error_t *
+tuntap_exit (vlib_main_t * vm)
+{
+ tuntap_main_t *tm = &tuntap_main;
+ struct ifreq ifr;
+ int sfd;
+
+ /* Not present. */
+ if (! tm->dev_net_tun_fd || tm->dev_net_tun_fd < 0)
+ return 0;
+
+ sfd = socket (AF_INET, SOCK_STREAM, 0);
+ if (sfd < 0)
+ clib_unix_warning("provisioning socket");
+
+ memset(&ifr, 0, sizeof (ifr));
+ strncpy (ifr.ifr_name, tm->tun_name, sizeof (ifr.ifr_name)-1);
+
+ /* get flags, modify to bring down interface... */
+ if (ioctl (sfd, SIOCGIFFLAGS, &ifr) < 0)
+ clib_unix_warning ("SIOCGIFFLAGS");
+
+ ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
+
+ if (ioctl (sfd, SIOCSIFFLAGS, &ifr) < 0)
+ clib_unix_warning ("SIOCSIFFLAGS");
+
+ /* Turn off persistence */
+ if (ioctl (tm->dev_net_tun_fd, TUNSETPERSIST, 0) < 0)
+ clib_unix_warning ("TUNSETPERSIST");
+ close(tm->dev_tap_fd);
+ close(tm->dev_net_tun_fd);
+ close (sfd);
+
+ return 0;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (tuntap_exit);
+
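+/* Enabled from the startup config, e.g. (values illustrative):
+ *   tuntap { enable ethernet name vpp1 mtu 9000 }
+ * keywords are parsed in tuntap_config below. */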
+static clib_error_t *
+tuntap_config (vlib_main_t * vm, unformat_input_t * input)
+{
+ tuntap_main_t *tm = &tuntap_main;
+ clib_error_t * error = 0;
+ struct ifreq ifr;
+ u8 * name;
+ int flags = IFF_TUN | IFF_NO_PI;
+ int is_enabled = 0, is_ether = 0, have_normal_interface = 0;
+#if DPDK == 0
+ const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+#else
+ const uword buffer_size = MBUF_SIZE;
+#endif
+
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "mtu %d", &tm->mtu_bytes))
+ ;
+ else if (unformat (input, "enable"))
+ is_enabled = 1;
+ else if (unformat (input, "disable"))
+ is_enabled = 0;
+ else if (unformat (input, "ethernet") ||
+ unformat (input, "ether"))
+ is_ether = 1;
+ else if (unformat (input, "have-normal-interface") ||
+ unformat (input, "have-normal"))
+ have_normal_interface = 1;
+ else if (unformat (input, "name %s", &name))
+ tm->tun_name = (char *) name;
+ else
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+ }
+
+ tm->dev_net_tun_fd = -1;
+ tm->dev_tap_fd = -1;
+
+ if (is_enabled == 0)
+ return 0;
+
+ if (geteuid())
+ {
+ clib_warning ("tuntap disabled: must be superuser");
+ return 0;
+ }
+
+ tm->is_ether = is_ether;
+ tm->have_normal_interface = have_normal_interface;
+
+ if (is_ether)
+ flags = IFF_TAP | IFF_NO_PI;
+
+ if ((tm->dev_net_tun_fd = open ("/dev/net/tun", O_RDWR)) < 0)
+ {
+ error = clib_error_return_unix (0, "open /dev/net/tun");
+ goto done;
+ }
+
+ memset (&ifr, 0, sizeof (ifr));
+ strncpy(ifr.ifr_name, tm->tun_name, sizeof(ifr.ifr_name)-1);
+ ifr.ifr_flags = flags;
+ if (ioctl (tm->dev_net_tun_fd, TUNSETIFF, (void *)&ifr) < 0)
+ {
+ error = clib_error_return_unix (0, "ioctl TUNSETIFF");
+ goto done;
+ }
+
+ /* Make it persistent, at least until we split. */
+ if (ioctl (tm->dev_net_tun_fd, TUNSETPERSIST, 1) < 0)
+ {
+ error = clib_error_return_unix (0, "TUNSETPERSIST");
+ goto done;
+ }
+
+ /* Open a provisioning socket */
+ if ((tm->dev_tap_fd = socket(PF_PACKET, SOCK_RAW,
+ htons(ETH_P_ALL))) < 0 )
+ {
+ error = clib_error_return_unix (0, "socket");
+ goto done;
+ }
+
+ /* Find the interface index. */
+ {
+ struct ifreq ifr;
+ struct sockaddr_ll sll;
+
+ memset (&ifr, 0, sizeof(ifr));
+ strncpy (ifr.ifr_name, tm->tun_name, sizeof(ifr.ifr_name)-1);
+ if (ioctl (tm->dev_tap_fd, SIOCGIFINDEX, &ifr) < 0 )
+ {
+ error = clib_error_return_unix (0, "ioctl SIOCGIFINDEX");
+ goto done;
+ }
+
+ /* Bind the provisioning socket to the interface. */
+ memset(&sll, 0, sizeof(sll));
+ sll.sll_family = AF_PACKET;
+ sll.sll_ifindex = ifr.ifr_ifindex;
+ sll.sll_protocol = htons(ETH_P_ALL);
+
+ if (bind(tm->dev_tap_fd, (struct sockaddr*) &sll, sizeof(sll)) < 0)
+ {
+ error = clib_error_return_unix (0, "bind");
+ goto done;
+ }
+ }
+
+ /* non-blocking I/O on /dev/tapX */
+ {
+ int one = 1;
+ if (ioctl (tm->dev_net_tun_fd, FIONBIO, &one) < 0)
+ {
+ error = clib_error_return_unix (0, "ioctl FIONBIO");
+ goto done;
+ }
+ }
+
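+  /* Buffers needed per MTU-sized packet, rounded up; e.g. the default
+     4352-byte MTU needs 3 buffers when buffer_size is 2048 */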
+ tm->mtu_buffers = (tm->mtu_bytes + (buffer_size - 1)) / buffer_size;
+
+ ifr.ifr_mtu = tm->mtu_bytes;
+ if (ioctl (tm->dev_tap_fd, SIOCSIFMTU, &ifr) < 0)
+ {
+ error = clib_error_return_unix (0, "ioctl SIOCSIFMTU");
+ goto done;
+ }
+
+ /* get flags, modify to bring up interface... */
+ if (ioctl (tm->dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0)
+ {
+ error = clib_error_return_unix (0, "ioctl SIOCGIFFLAGS");
+ goto done;
+ }
+
+ ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+
+ if (ioctl (tm->dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0)
+ {
+ error = clib_error_return_unix (0, "ioctl SIOCSIFFLAGS");
+ goto done;
+ }
+
+ if (is_ether)
+ {
+ if (ioctl (tm->dev_tap_fd, SIOCGIFHWADDR, &ifr) < 0)
+ {
+ error = clib_error_return_unix (0, "ioctl SIOCGIFHWADDR");
+ goto done;
+ }
+ else
+ memcpy (tm->ether_dst_mac, ifr.ifr_hwaddr.sa_data, 6);
+ }
+
+ if (have_normal_interface)
+ {
+ vnet_main_t *vnm = vnet_get_main();
+ error = ethernet_register_interface
+ (vnm,
+ tuntap_dev_class.index,
+ 0 /* device instance */,
+ tm->ether_dst_mac /* ethernet address */,
+ &tm->hw_if_index,
+ 0 /* flag change */);
+ if (error)
+ clib_error_report (error);
+ tm->sw_if_index = tm->hw_if_index;
+ vm->os_punt_frame = tuntap_nopunt_frame;
+ }
+ else
+ {
+ vnet_main_t *vnm = vnet_get_main();
+ vnet_hw_interface_t * hi;
+
+ vm->os_punt_frame = tuntap_punt_frame;
+
+ tm->hw_if_index = vnet_register_interface
+ (vnm,
+ tuntap_dev_class.index, 0 /* device instance */,
+ tuntap_interface_class.index, 0);
+ hi = vnet_get_hw_interface (vnm, tm->hw_if_index);
+ tm->sw_if_index = hi->sw_if_index;
+
+ /* Interface is always up. */
+ vnet_hw_interface_set_flags (vnm, tm->hw_if_index,
+ VNET_HW_INTERFACE_FLAG_LINK_UP);
+ vnet_sw_interface_set_flags (vnm, tm->sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ }
+
+ {
+ unix_file_t template = {0};
+ template.read_function = tuntap_read_ready;
+ template.file_descriptor = tm->dev_net_tun_fd;
+ tm->unix_file_index = unix_file_add (&unix_main, &template);
+ }
+
+ done:
+ if (error)
+ {
+ if (tm->dev_net_tun_fd >= 0)
+ close (tm->dev_net_tun_fd);
+ if (tm->dev_tap_fd >= 0)
+ close (tm->dev_tap_fd);
+ }
+
+ return error;
+}
+
+VLIB_CONFIG_FUNCTION (tuntap_config, "tuntap");
+
+void
+tuntap_ip4_add_del_interface_address (ip4_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip4_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_delete)
+{
+ tuntap_main_t * tm = &tuntap_main;
+ struct ifreq ifr;
+ subif_address_t subif_addr, * ap;
+ uword * p;
+
+ /* Tuntap disabled, or using a "normal" interface. */
+ if (tm->have_normal_interface || tm->dev_tap_fd < 0)
+ return;
+
+ /* See if we already know about this subif */
+ memset (&subif_addr, 0, sizeof (subif_addr));
+ subif_addr.sw_if_index = sw_if_index;
+ memcpy (&subif_addr.addr, address, sizeof (*address));
+
+ p = mhash_get (&tm->subif_mhash, &subif_addr);
+
+ if (p)
+ ap = pool_elt_at_index (tm->subifs, p[0]);
+ else
+ {
+ pool_get (tm->subifs, ap);
+ *ap = subif_addr;
+ mhash_set (&tm->subif_mhash, ap, ap - tm->subifs, 0);
+ }
+
+ /* Use subif pool index to select alias device. */
+ memset (&ifr, 0, sizeof (ifr));
+ snprintf (ifr.ifr_name, sizeof(ifr.ifr_name),
+ "%s:%d", tm->tun_name, (int)(ap - tm->subifs));
+
+ if (! is_delete)
+ {
+ struct sockaddr_in * sin;
+
+ sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+ /* Set ipv4 address, netmask. */
+ sin->sin_family = AF_INET;
+ memcpy (&sin->sin_addr.s_addr, address, 4);
+ if (ioctl (tm->dev_tap_fd, SIOCSIFADDR, &ifr) < 0)
+ clib_unix_warning ("ioctl SIOCSIFADDR");
+
+ sin->sin_addr.s_addr = im->fib_masks[address_length];
+ if (ioctl (tm->dev_tap_fd, SIOCSIFNETMASK, &ifr) < 0)
+ clib_unix_warning ("ioctl SIOCSIFNETMASK");
+ }
+ else
+ {
+ mhash_unset (&tm->subif_mhash, &subif_addr, 0 /* old value ptr */);
+ pool_put (tm->subifs, ap);
+ }
+
+ /* get flags, modify to bring up interface... */
+ if (ioctl (tm->dev_tap_fd, SIOCGIFFLAGS, &ifr) < 0)
+ clib_unix_warning ("ioctl SIOCGIFFLAGS");
+
+ if (is_delete)
+ ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
+ else
+ ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+
+ if (ioctl (tm->dev_tap_fd, SIOCSIFFLAGS, &ifr) < 0)
+ clib_unix_warning ("ioctl SIOCSIFFLAGS");
+}
+
+/*
+ * $$$$ gross workaround for a known #include bug
+ * #include <linux/ipv6.h> causes multiple definitions if
+ * netinet/in.h is also included.
+ */
+struct in6_ifreq {
+ struct in6_addr ifr6_addr;
+ u32 ifr6_prefixlen;
+ int ifr6_ifindex;
+};
+
+/*
+ * Both the v6 interface address API and the way ifconfig
+ * displays subinterfaces differ from their v4 couterparts.
+ * The code given here seems to work but YMMV.
+ */
+void
+tuntap_ip6_add_del_interface_address (ip6_main_t * im,
+ uword opaque,
+ u32 sw_if_index,
+ ip6_address_t * address,
+ u32 address_length,
+ u32 if_address_index,
+ u32 is_delete)
+{
+ tuntap_main_t * tm = &tuntap_main;
+ struct ifreq ifr;
+ struct in6_ifreq ifr6;
+ subif_address_t subif_addr, * ap;
+ uword * p;
+
+ /* Tuntap disabled, or using a "normal" interface. */
+ if (tm->have_normal_interface || tm->dev_tap_fd < 0)
+ return;
+
+ /* See if we already know about this subif */
+ memset (&subif_addr, 0, sizeof (subif_addr));
+ subif_addr.sw_if_index = sw_if_index;
+ subif_addr.is_v6 = 1;
+ memcpy (&subif_addr.addr, address, sizeof (*address));
+
+ p = mhash_get (&tm->subif_mhash, &subif_addr);
+
+ if (p)
+ ap = pool_elt_at_index (tm->subifs, p[0]);
+ else
+ {
+ pool_get (tm->subifs, ap);
+ *ap = subif_addr;
+ mhash_set (&tm->subif_mhash, ap, ap - tm->subifs, 0);
+ }
+
+ /* Use subif pool index to select alias device. */
+ memset (&ifr, 0, sizeof (ifr));
+ memset (&ifr6, 0, sizeof (ifr6));
+ snprintf (ifr.ifr_name, sizeof(ifr.ifr_name),
+ "%s:%d", tm->tun_name, (int)(ap - tm->subifs));
+
+ if (! is_delete)
+ {
+ int sockfd = socket (AF_INET6, SOCK_STREAM, 0);
+ if (sockfd < 0)
+ clib_unix_warning ("get ifindex socket");
+
+ if (ioctl (sockfd, SIOGIFINDEX, &ifr) < 0)
+ clib_unix_warning ("get ifindex");
+
+ ifr6.ifr6_ifindex = ifr.ifr_ifindex;
+ ifr6.ifr6_prefixlen = address_length;
+ memcpy (&ifr6.ifr6_addr, address, 16);
+
+ if (ioctl (sockfd, SIOCSIFADDR, &ifr6) < 0)
+ clib_unix_warning ("set address");
+
+ close (sockfd);
+ }
+ else
+ {
+ int sockfd = socket (AF_INET6, SOCK_STREAM, 0);
+ if (sockfd < 0)
+ clib_unix_warning ("get ifindex socket");
+
+ if (ioctl (sockfd, SIOGIFINDEX, &ifr) < 0)
+ clib_unix_warning ("get ifindex");
+
+ ifr6.ifr6_ifindex = ifr.ifr_ifindex;
+ ifr6.ifr6_prefixlen = address_length;
+ memcpy (&ifr6.ifr6_addr, address, 16);
+
+ if (ioctl (sockfd, SIOCDIFADDR, &ifr6) < 0)
+ clib_unix_warning ("del address");
+
+ close (sockfd);
+
+ mhash_unset (&tm->subif_mhash, &subif_addr, 0 /* old value ptr */);
+ pool_put (tm->subifs, ap);
+ }
+}
+
+static void
+tuntap_punt_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ tuntap_tx (vm, node, frame);
+ vlib_frame_free (vm, node, frame);
+}
+
+static void
+tuntap_nopunt_frame (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 * buffers = vlib_frame_args (frame);
+ uword n_packets = frame->n_vectors;
+ vlib_buffer_free (vm, buffers, n_packets);
+ vlib_frame_free (vm, node, frame);
+}
+
+VNET_HW_INTERFACE_CLASS (tuntap_interface_class,static) = {
+ .name = "tuntap",
+};
+
+static u8 * format_tuntap_interface_name (u8 * s, va_list * args)
+{
+ u32 i = va_arg (*args, u32);
+
+ s = format (s, "tuntap-%d", i);
+ return s;
+}
+
+static uword
+tuntap_intfc_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ tuntap_main_t * tm = &tuntap_main;
+ u32 * buffers = vlib_frame_args (frame);
+ uword n_buffers = frame->n_vectors;
+
+ /* Normal interface transmit happens only on the normal interface... */
+ if (tm->have_normal_interface)
+ return tuntap_tx (vm, node, frame);
+
+ vlib_buffer_free (vm, buffers, n_buffers);
+ return n_buffers;
+}
+
+VNET_DEVICE_CLASS (tuntap_dev_class,static) = {
+ .name = "tuntap",
+ .tx_function = tuntap_intfc_tx,
+ .format_device_name = format_tuntap_interface_name,
+};
+
+static clib_error_t *
+tuntap_init (vlib_main_t * vm)
+{
+ clib_error_t * error;
+ ip4_main_t * im4 = &ip4_main;
+ ip6_main_t * im6 = &ip6_main;
+ ip4_add_del_interface_address_callback_t cb4;
+ ip6_add_del_interface_address_callback_t cb6;
+ tuntap_main_t * tm = &tuntap_main;
+
+ error = vlib_call_init_function (vm, ip4_init);
+ if (error)
+ return error;
+
+ mhash_init (&tm->subif_mhash, sizeof (u32), sizeof(subif_address_t));
+
+ cb4.function = tuntap_ip4_add_del_interface_address;
+ cb4.function_opaque = 0;
+ vec_add1 (im4->add_del_interface_address_callbacks, cb4);
+
+ cb6.function = tuntap_ip6_add_del_interface_address;
+ cb6.function_opaque = 0;
+ vec_add1 (im6->add_del_interface_address_callbacks, cb6);
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (tuntap_init);
diff --git a/vnet/vnet/unix/tuntap.h b/vnet/vnet/unix/tuntap.h
new file mode 100644
index 00000000000..ba0b77938e8
--- /dev/null
+++ b/vnet/vnet/unix/tuntap.h
@@ -0,0 +1,37 @@
+/*
+ *------------------------------------------------------------------
+ * tuntap.h - kernel stack (reverse) punt/inject path
+ *
+ * Copyright (c) 2009 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+/*
+ * Call from some VLIB_INIT_FUNCTION to set the Linux kernel
+ * inject node name.
+ */
+void register_tuntap_inject_node_name (char *name);
+
+int vnet_tap_connect (vlib_main_t * vm, u8 * intfc_name,
+ u8 *hwaddr_arg, u32 * sw_if_indexp);
+int vnet_tap_connect_renumber (vlib_main_t * vm, u8 * intfc_name,
+ u8 *hwaddr_arg, u32 * sw_if_indexp,
+ u8 renumber, u32 custom_dev_instance);
+
+int vnet_tap_delete(vlib_main_t *vm, u32 sw_if_index);
+
+int vnet_tap_modify (vlib_main_t * vm, u32 orig_sw_if_index,
+ u8 * intfc_name, u8 *hwaddr_arg,
+ u32 * sw_if_indexp,
+ u8 renumber, u32 custom_dev_instance);
diff --git a/vnet/vnet/vcgn/cgn_bitmap.h b/vnet/vnet/vcgn/cgn_bitmap.h
new file mode 100644
index 00000000000..6c46b75a608
--- /dev/null
+++ b/vnet/vnet/vcgn/cgn_bitmap.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Modifications to this file
+ * Copyright (c) 2006-2009 by cisco Systems, Inc.
+ * All rights reserved.
+ */
+
+/*
+ Copyright (c) 2001, 2002, 2003, 2005 Eliot Dresselhaus
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef __CGN_BITMAP_H__
+#define __CGN_BITMAP_H__
+
+/* Bitmaps built as vectors of machine words. */
+
+#include <string.h>
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/random.h>
+
+#define clib_bitmap_dup(v) vec_dup(v)
+#define clib_bitmap_free(v) vec_free(v)
+#define clib_bitmap_bytes(v) vec_bytes(v)
+#define clib_bitmap_zero(v) vec_zero(v)
+
+/* Allocate bitmap with given number of bits. */
+#define clib_bitmap_alloc(v,n_bits) \
+ v = vec_new (uword, ((n_bits) + BITS (uword) - 1) / BITS (uword))
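+/* e.g. clib_bitmap_alloc (v, 1000) on a 64-bit machine yields
+   a vector of 16 uwords (1024 bits) */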
+
+/* Sets given bit. Returns old value. */
+static inline uword
+cgn_clib_bitmap_set_no_check (uword * a, uword i)
+{
+ uword i0 = i / BITS (a[0]);
+ uword bit = (uword) 1 << (i % BITS (a[0]));
+ uword ai;
+
+/* ASSERT (i0 < vec_len (a)); */
+ ai = a[i0];
+ a[i0] = ai | bit;
+
+ return (ai & bit) != 0;
+}
+
+/* Clears given bit. Returns old value. */
+static inline
+uword cgn_clib_bitmap_clear_no_check (uword * a, uword i)
+{
+ uword i0 = i / BITS (a[0]);
+ uword bit = (uword) 1 << (i % BITS (a[0]));
+ uword ai;
+
+/* ASSERT (i0 < vec_len (a)); */
+ ai = a[i0];
+ a[i0] = ai & ~bit;
+
+ return (ai & bit) != 0;
+}
+
+/* Gets num_bits from ai starting at start. Assumes that all bits are
+ * in the same uword.
+ */
+static inline uword cgn_clib_bitmap_get_bits (uword *ai, u16 start,
+ unsigned char num_bits)
+{
+ uword i0 = start / BITS (ai[0]);
+ uword i1 = start % BITS (ai[0]);
+ uword result = ai[i0] >> i1;
+ if(num_bits >= BITS(ai[0])) return result;
+ /* Else, we have to trim the bits */
+ result = result & (((uword)1 << num_bits) - 1);
+ return result;
+}
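+/* Example: cgn_clib_bitmap_get_bits (bm, 6, 4) returns bits 6..9 of bm[0]
+   in the low 4 bits of the result */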
+
+/* Check if all of the bits from start through start + num_bits are available */
+static inline uword cgn_clib_bitmap_check_if_all (uword *ai, u16 start,
+ i16 num_bits)
+{
+ /* Now check if any bits are zero.. if yes, return false */
+ uword bitmask;
+ if(num_bits >= BITS(ai[0])) {
+    /* assume that it's going to be a multiple of BITS(ai[0]) */
+ uword i0 = start / BITS (ai[0]);
+ bitmask = ~0; /* set all bits to 1 */
+ do {
+ if(ai[i0] ^ bitmask) return 0;
+ num_bits = num_bits - BITS (ai[0]);
+ i0++;
+ } while (num_bits > 0);
+ return 1;
+ }
+ else {
+ uword result = cgn_clib_bitmap_get_bits (ai, start, num_bits);
+ bitmask = ((uword)1 << num_bits) -1; /* set only num_bits */
+ return (!(result ^ bitmask));
+ }
+}
+
+#endif
diff --git a/vnet/vnet/vcgn/cgse_defs.h b/vnet/vnet/vcgn/cgse_defs.h
new file mode 100644
index 00000000000..08255875fec
--- /dev/null
+++ b/vnet/vnet/vcgn/cgse_defs.h
@@ -0,0 +1,88 @@
+/*
+ *------------------------------------------------------------------
+ * cgse_defs.h - CGSE specific definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CGSE_DEFS_H__
+#define __CGSE_DEFS_H__
+
+#include "spp_platform_common.h"
+#include <cnat_cli.h>
+
+
+#define CGSE_SVI_TYPE_CNAT 1
+#define CGSE_SVI_TYPE_XLAT 2
+#define CGSE_SVI_TYPE_NAT64_STATEFUL 3
+#define CGSE_SVI_TYPE_V6RD 4
+#define CGSE_SVI_TYPE_INFRA 5
+#define CGSE_SVI_TYPE_DS_LITE 7
+#define CGSE_SVI_TYPE_MAPE 9
+
+#define CGSE_SET_TX_PKT_TYPE(type) PLATFORM_SET_CTX_RU_TX_PKT_TYPE(ctx, type)
+
+#define CGSE_INVALID_UIDX 0xffff /* invalid svi app uidb index */
+#define CGSE_INVALID_VRFID 0xffffffff /* invalid vrf id */
+
+#define CGSE_VRF_MASK 0x3fff
+#define CGSE_MAX_VRFMAP_ENTRIES (CGSE_VRF_MASK + 1)
+
+#define CGSE_VRFMAP_ENTRY_INVALID 0xffff
+
+
+#define CGSE_INVALID_CGSE_ID (0)
+
+#define CGSE_TABLE_ENTRY_DELETED 0
+#define CGSE_TABLE_ENTRY_ACTIVE 1
+#define CGSE_TABLE_ENTRY_DORMANT 2
+#define CGSE_TABLE_ENTRY_INVALID_UIDB 3
+
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF1(level, a) \
+ if (cgse_config_debug_level > level) printf(a);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF2(level, a, b) \
+ if (cgse_config_debug_level > level) printf(a, b);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF3(level, a, b, c) \
+ if (cgse_config_debug_level > level) printf(a, b, c);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF4(level, a, b, c, d) \
+ if (cgse_config_debug_level > level) printf(a, b, c, d);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF5(level, a, b, c, d, e) \
+ if (cgse_config_debug_level > level) printf(a, b, c, d, e);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF6(level, a, b, c, d, e, f) \
+ if (cgse_config_debug_level > level) printf(a, b, c, d, e, f);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF7(level, a, b, c, d, e, f, g) \
+ if (cgse_config_debug_level > level) printf(a, b, c, d, e, f, g);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF8(level, a, b, c, d, e, f, g, h) \
+ if (cgse_config_debug_level > level) printf(a, b, c, d, e, f, g, h);
+
+#define CGSE_CONFIG_HANDLER_DEBUG_PRINTF9(level, a, b, c, d, e, f, g, h, i) \
+ if (cgse_config_debug_level > level) printf(a, b, c, d, e, f, g, h, i);
+
+extern u16 *cgse_uidb_index_cgse_id_mapping_ptr;
+
+#define CGSE_ADD_UIDB_INDEX_CGSE_ID_MAPPING(uidb_index, cgse_id) \
+ *(cgse_uidb_index_cgse_id_mapping_ptr + uidb_index) = cgse_id;
+
+extern u8 my_instance_number;
+
+#endif
diff --git a/vnet/vnet/vcgn/cnat_bulk_port.c b/vnet/vnet/vcgn/cnat_bulk_port.c
new file mode 100644
index 00000000000..67ddd255e0f
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_bulk_port.c
@@ -0,0 +1,964 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_bulk_port.c - wrappers for bulk port allocation
+ *
+ * Copyright (c) 2011-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/bitmap.h>
+
+#include "cnat_db.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_logging.h"
+#include "spp_timers.h"
+#include "platform_common.h"
+#include "cgn_bitmap.h"
+#include "spp_platform_trace_log.h"
+#include "cnat_ports.h"
+
+#ifndef NO_BULK_LOGGING
+
+#define PORT_TO_CACHE(y, z) ((y)/(z))
+/* The last bit (MSB) is used to indicate whether the cache entry is full */
+#define CACHE_TO_PORT(x, z) (((x)& 0x7FFF) * (z))
+#define IS_CACHE_ENTRY_FULL(x) ((x) & 0x8000)
+#define MARK_CACHE_ENTRY_AS_FULL(x) ((x) = ((x) | 0x8000))
+#define UNMARK_CACHE_ENTRY_AS_FULL(x) ((x) = ((x) & 0x7FFF))
+#define CACHE_ENTRY_WITHOUT_FULL_STAT(x) ((x) & 0x7FFF)
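+/* Example: with a bulk size of 64, PORT_TO_CACHE(1024, 64) == 16 and
+ * CACHE_TO_PORT(16, 64) == 1024; 0x8000 is the "entry full" flag. */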
+
+
+#define NUM_BULK_CHECK 128 /* max number of previous caches to check;
+    * somewhat arbitrary. Assuming a bulk size of 64, this can handle up
+    * to 128*64 ports allocated by a single subscriber */
+
+/* #define DEBUG_BULK_PORT 1 */
+/* #define DEBUG_BULK_PORT_DETAIL 1 */
+#define HAVE_BULK_PORT_STATS 1
+
+#ifdef HAVE_BULK_PORT_STATS
+static uword bulk_cache_hit_count;
+static uword bulk_port_use_count;
+static uword bulk_port_alloc_count;
+static uword mapped_port_alloc_count;
+#endif /* HAVE_BULK_PORT_STATS */
+
+static u32 bulk_port_rand_across;
+
+void show_bulk_port_allocation(u16 in_vrfid, u32 inside_ip)
+{
+ cnat_db_key_bucket_t u_ki;
+ cnat_user_db_entry_t *udb;
+ int i;
+ u32 head;
+ cnat_main_db_entry_t *db = NULL;
+ i16 printed_so_far = 0; /* entries printed so far */
+ u16 prev_bulks[NUM_BULK_CHECK];
+ cnat_vrfmap_t *my_vrfmap = 0;
+ cnat_vrfmap_t *vrfmap = 0;
+ bulk_alloc_size_t bulk_size;
+
+ u_ki.k.k.vrf = in_vrfid;
+ u_ki.k.k.ipv4 = inside_ip;
+ u_ki.k.k.port = 0;
+
+ PLATFORM_DEBUG_PRINT("Searching for user %x in invrf %d\n",
+ inside_ip, in_vrfid);
+ udb = cnat_user_db_lookup_entry(&u_ki);
+ if(!udb) {
+ PLATFORM_DEBUG_PRINT("No such user\n"); return;
+ }
+
+ pool_foreach (vrfmap, cnat_map_by_vrf, ({
+ if(vrfmap->i_vrf == in_vrfid) {
+ my_vrfmap = vrfmap;
+ break;
+ }}));
+
+ if(!my_vrfmap) {
+ PLATFORM_DEBUG_PRINT("Vrf map not found\n");
+ return;
+ }
+ bulk_size = BULKSIZE_FROM_VRFMAP(my_vrfmap);
+
+ if(bulk_size == BULK_ALLOC_SIZE_NONE) {
+ PLATFORM_DEBUG_PRINT("Bulk allocation not enabled\n");
+ return;
+ }
+
+ PLATFORM_DEBUG_PRINT("\nBulk cache for subscriber 0x%x: ", inside_ip);
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ PLATFORM_DEBUG_PRINT("%d , ",
+ CACHE_TO_PORT(udb->bulk_port_range_cache[i], bulk_size));
+ }
+ PLATFORM_DEBUG_PRINT("\nNon cached bulk allocation for subscriber 0x%x:\n",
+ inside_ip);
+ ASSERT(udb);
+ memset(prev_bulks, 0,sizeof(prev_bulks));
+
+ head = udb->translation_list_head_index;
+ if(PREDICT_FALSE(head == EMPTY)) {
+ return;
+ }
+ db = cnat_main_db + head;
+ while (1) {
+ /* skip static ports - static ports may not belong to bulk pool*/
+ if(db->out2in_key.k.port < cnat_static_port_range) goto next_entry;
+
+ u16 bm_index = PORT_TO_CACHE(db->out2in_key.k.port, bulk_size);
+
+ /*Check if we have already tested this bulk */
+ for(i=0; i < printed_so_far; i++) {
+ if(prev_bulks[i] == bm_index) goto next_entry;
+ }
+
+ /*Check if this base port is already part of cache */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(CACHE_ENTRY_WITHOUT_FULL_STAT(udb->bulk_port_range_cache[i])
+ == bm_index) {
+ goto next_entry;
+ }
+ }
+        /* this bulk is not in the cache already */
+ PLATFORM_DEBUG_PRINT("%d ", CACHE_TO_PORT(bm_index, bulk_size));
+ if(printed_so_far < NUM_BULK_CHECK) {
+ prev_bulks[printed_so_far] = bm_index;
+ printed_so_far++;
+ }
+
+next_entry:
+ db = cnat_main_db + db->user_ports.next;
+ /*
+ * its a circular list, so if we have reached the head again
+ * all the entries for that user have been read
+ */
+ if (db == (cnat_main_db + head)) {
+ break;
+ }
+ } /* while loop for db entries */
+
+ PLATFORM_DEBUG_PRINT("\n");
+ return;
+}
+
+void show_bulk_port_stats()
+{
+
+ cnat_vrfmap_t *my_vrfmap = 0;
+ PLATFORM_DEBUG_PRINT("Bulk size settings of each inside vrf ...\n");
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ PLATFORM_DEBUG_PRINT("vrf id %d, bulk size %d\n", my_vrfmap->i_vrf,
+ BULKSIZE_FROM_VRFMAP(my_vrfmap));
+ }));
+
+#ifdef HAVE_BULK_PORT_STATS
+ PLATFORM_DEBUG_PRINT("\nBulk port allocation, use and cache hit statistics\n");
+ PLATFORM_DEBUG_PRINT("Number of times bulk ports allocated %lld\n",
+ bulk_port_alloc_count);
+ PLATFORM_DEBUG_PRINT("Number of times pre-allocated ports used %lld\n",
+ bulk_port_use_count);
+ PLATFORM_DEBUG_PRINT(
+ "Number of times pre-allocated bulk port found from cache %lld\n",
+ bulk_cache_hit_count);
+ PLATFORM_DEBUG_PRINT(
+ "Number of times mapped port (static) allocations made %lld\n",
+ mapped_port_alloc_count);
+#else
+ PLATFORM_DEBUG_PRINT("\nNat44 bulk port statistics not turned on\n");
+#endif /* HAVE_BULK_PORT_STATS */
+}
+
+void clear_bulk_port_stats()
+{
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_alloc_count = 0;
+ bulk_port_use_count = 0;
+ bulk_cache_hit_count = 0;
+ mapped_port_alloc_count = 0;
+#endif /* HAVE_BULK_PORT_STATS */
+ return;
+}
+
+void cnat_update_bulk_range_cache(cnat_user_db_entry_t *udb, u16 o_port,
+ bulk_alloc_size_t bulk_size)
+{
+ i16 i;
+ if(!udb) {
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("%s, null udb!\n", __func__);
+#endif
+ return;
+ }
+ if(BULK_ALLOC_SIZE_NONE == bulk_size) { /* no bulk logging */
+ return;
+ }
+
+ /* Take care of caching */
+ if(o_port & 0x1) {
+ o_port--;
+ }
+ if(PREDICT_FALSE(o_port <= 0)) {
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("%s invalid port: %d\n", __func__, o_port);
+#endif
+ return;
+ }
+
+ /* First preference is for the cache entry's that are not used yet */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(PREDICT_FALSE(
+ udb->bulk_port_range_cache[i] == (i16)BULK_RANGE_INVALID)) {
+ udb->bulk_port_range_cache[i] = PORT_TO_CACHE(o_port, bulk_size);
+ return;
+ }
+ }
+
+ /* Now check if any cache entry is full and if it can be replaced */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(PREDICT_FALSE(IS_CACHE_ENTRY_FULL(udb->bulk_port_range_cache[i]))) {
+ udb->bulk_port_range_cache[i] = PORT_TO_CACHE(o_port, bulk_size);
+ return;
+ }
+ }
+
+ return;
+}
+
+
+void cnat_port_free_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ int index,
+ port_pair_t ptype,
+ u16 base_port,
+ cnat_user_db_entry_t *udb,
+ u16 static_port_range,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req)
+{
+ cnat_portmap_v2_t *my_pm;
+ i16 bm_index;
+ i16 i;
+ int unmark_full_status = 0;
+
+ *nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+
+ /* First free up the port */
+ cnat_port_free_v2(pm, index, ptype, base_port, static_port_range);
+ if(BULK_ALLOC_SIZE_NONE == bulk_size) /* no bulk logging */
+ return;
+ if(PREDICT_FALSE(!udb)) {
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("%s udb is null\n", __func__);
+#endif
+        return; /* no user entry, so no bulk cache to maintain */
+    }
+
+ if(PREDICT_FALSE(base_port < static_port_range)) {
+ return;
+ }
+ /* Now check if cache needs to be removed */
+ my_pm = pm + index;
+ base_port = base_port/bulk_size;
+ base_port = base_port * bulk_size; /*Align it to multiples of bulk_size */
+ if(PREDICT_TRUE(!cgn_clib_bitmap_check_if_all(
+ my_pm->bm, base_port, bulk_size))) {
+ *nfv9_log_req = CACHE_ALLOC_NO_LOG_REQUIRED;
+ unmark_full_status = 1;
+ /* One or more ports are still in use */
+ } else {
+ *nfv9_log_req = base_port; /* logging required now. indicate base port*/
+ }
+ bm_index = PORT_TO_CACHE(base_port, bulk_size);
+ /* Now check if this is in the cache */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(PREDICT_FALSE(
+ CACHE_ENTRY_WITHOUT_FULL_STAT(udb->bulk_port_range_cache[i]))
+ == bm_index) {
+ if(unmark_full_status) {
+ /* Unmark full stat.. if it was marked so..*/
+ UNMARK_CACHE_ENTRY_AS_FULL(udb->bulk_port_range_cache[i]);
+ } else {
+ udb->bulk_port_range_cache[i] = (i16)BULK_RANGE_INVALID;
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT(
+ "Clearing cache for client 0x%x, bulk port %d\n",
+ my_pm->ipv4_address, base_port);
+#endif
+ }
+ break;
+ }
+ }
+ return;
+}
+
+
+/* Get suitable port from range */
+static i16 get_suiting_port_pos_from_range(cnat_portmap_v2_t *my_pm,
+ u16 bulk_start, i16 bulk_size, port_pair_t pair_type)
+{
+ i16 num_pos, num_bits, iterations;
+ uword bulk_ports;
+ i16 inc;
+ i16 num_uwords = bulk_size/BITS(my_pm->bm[0]);
+
+ if(PREDICT_FALSE(!num_uwords)) {
+ iterations = 0;
+ num_bits = bulk_size;
+ bulk_size = 0;
+ } else {
+ bulk_port_rand_across = randq1(bulk_port_rand_across);
+ iterations = bulk_port_rand_across % num_uwords;
+ num_bits = BITS(my_pm->bm[0]);
+ }
+
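+    /* Scan the bulk's bitmap one uword at a time, starting at a random
+     * uword so allocations spread across the range; odd-only and
+     * even-only requests step by 2 to preserve parity. */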
+ do {
+ bulk_ports = cgn_clib_bitmap_get_bits(my_pm->bm,
+ (bulk_start + iterations * BITS(my_pm->bm[0])), num_bits);
+#ifdef DEBUG_BULK_PORT_DETAIL
+ PLATFORM_DEBUG_PRINT("%s %d, bulk start %d, num_bits %d, ports %lld \n",
+ __func__, __LINE__, bulk_start, num_bits, bulk_ports);
+#endif /* DEBUG_BULK_PORT_DETAIL */
+ if(PREDICT_FALSE(!bulk_ports)) goto next_uword;
+ if(PREDICT_TRUE((pair_type == PORT_SINGLE)
+ || (pair_type == PORT_PAIR))) {
+ num_pos =0;
+ inc = 1;
+ } else if(pair_type == PORT_S_ODD) {
+ num_pos = 1;
+ inc = 2;
+ } else if(pair_type == PORT_S_EVEN) {
+ num_pos =0;
+ inc = 2;
+ }
+
+ for(; num_pos < num_bits; num_pos = num_pos + inc) {
+ if(!((bulk_ports >> num_pos) & 1))
+ continue; /* In use */
+ /* Check if the available port meets our
+             * criteria such as odd, even, pair etc */
+ else if(PREDICT_FALSE(
+ (pair_type == PORT_PAIR) && ((num_pos & 0x1) ||
+ (!((bulk_ports >> (num_pos + 1)) & 1)))))
+ continue;
+ else break; /* Found one that meets the criteria */
+ }
+ if(num_pos < num_bits)
+ return (num_pos + iterations * BITS(my_pm->bm[0]));
+next_uword:
+ num_bits = BITS(my_pm->bm[0]);
+ bulk_size -= BITS(my_pm->bm[0]);
+ iterations++;
+ if(iterations >= num_uwords) iterations = 0;
+ } while (bulk_size > 0);
+
+ return -2; /* nothing found */
+}
+
+static cnat_errno_t try_bulk_port_from_non_cache(
+ cnat_user_db_entry_t *udb,
+ cnat_portmap_v2_t *my_pm,
+ port_pair_t pair_type,
+ bulk_alloc_size_t bulk_size,
+ u16 *port_available,
+ u16 static_port_range
+ )
+{
+    /****
+     1. The user should have existing translations; otherwise we wouldn't get here.
+     2. For each, get the outside port and its base port;
+        check whether that bulk is already in the cache.
+     3. If not, we stand a chance.
+     4. Check for availability in this non-cached pool.
+     5. If found, replace one of the cache entries that is invalid or full.
+     6. If we are replacing a cache entry, it has to be governed by the user
+        preference to prefer the oldest or the newest pool.
+    ********/
+ u32 head;
+ cnat_main_db_entry_t *db = NULL;
+ u16 bulk_start; /* start point in 64 bitmap array to search for port */
+ i16 port_pos; /* indicates the position of available port in bulk */
+ i16 i; /* just a counter */
+    i16 attempts_so_far = 0; /* (futile ;-) attempts so far.. */
+ u16 prev_bulks[NUM_BULK_CHECK];
+ ASSERT(udb);
+ memset(prev_bulks, 0,sizeof(prev_bulks));
+
+ head = udb->translation_list_head_index;
+ if(PREDICT_FALSE(head == EMPTY)) return CNAT_NO_PRE_ALLOCATED_BULK_PORTS;
+
+ db = cnat_main_db + head;
+ while (1) { //what should be the limit??
+
+ /* skip static ports - static ports may not belong to bulk pool*/
+ if(db->out2in_key.k.port < static_port_range) goto next_entry;
+
+ u16 bm_index = PORT_TO_CACHE(db->out2in_key.k.port, bulk_size);
+
+ /*Check if we have already tested this bulk */
+ for(i=0; i < attempts_so_far; i++) {
+ if(prev_bulks[i] == bm_index) {
+ goto next_entry;
+ }
+ }
+
+ /*Check if this base port is already part of cache */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(CACHE_ENTRY_WITHOUT_FULL_STAT(udb->bulk_port_range_cache[i])
+ == bm_index)
+ goto next_entry;
+ }
+
+        /* this bulk is not in the cache already */
+ bulk_start = CACHE_TO_PORT(bm_index, bulk_size);
+ port_pos = get_suiting_port_pos_from_range(my_pm,
+ bulk_start, bulk_size, pair_type);
+
+ if(port_pos < 0) { /* no port available in this range */
+ /* Mark this bulk so that we don't have to try this again */
+ if(attempts_so_far < NUM_BULK_CHECK) {
+ prev_bulks[attempts_so_far] = bm_index;
+ attempts_so_far++;
+ }
+ goto next_entry;
+ }
+
+ /* Got one...Get the port number */
+ *port_available = bulk_start + port_pos;
+
+        /* Check to see if we should replace one of the cache entries */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(PREDICT_FALSE((udb->bulk_port_range_cache[i]
+ == (i16)BULK_RANGE_INVALID) || (
+ IS_CACHE_ENTRY_FULL(udb->bulk_port_range_cache[i])))) {
+ udb->bulk_port_range_cache[i] = bm_index;
+ return CNAT_SUCCESS;
+ }
+ }
+ /* Check to replace an existing (in use) entry */
+ /* TODO: enforce policy */
+ /* order of looping should depend on policy */
+
+ return CNAT_SUCCESS;
+
+next_entry:
+ db = cnat_main_db + db->user_ports.next;
+ /*
+ * its a circular list, so if we have reached the head again
+ * all the entries for that user have been read
+ */
+ if (db == (cnat_main_db + head)) {
+ break;
+ }
+ } /* while loop for db entries */
+ /* no ports available from pre allocated bulk pool */
+ return CNAT_NO_PORT_FROM_BULK;
+}
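+
+/*
+ * Illustrative sketch, not part of the original change: the circular
+ * translation-list walk used above, reduced to its bare idiom. Field
+ * names mirror those used in this file; the visit() callback is a
+ * hypothetical placeholder. Guarded out with #if 0 so it never builds.
+ */
+#if 0
+static void walk_user_translations (cnat_user_db_entry_t *udb,
+                                    void (*visit)(cnat_main_db_entry_t *))
+{
+    u32 head = udb->translation_list_head_index;
+    cnat_main_db_entry_t *db;
+
+    if (head == EMPTY)
+        return;                 /* user has no translations yet */
+    db = cnat_main_db + head;
+    do {
+        visit(db);              /* inspect one translation entry */
+        db = cnat_main_db + db->user_ports.next;
+    } while (db != (cnat_main_db + head)); /* back at head: done */
+}
+#endif /* illustrative sketch */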
+
+cnat_errno_t
+cnat_dynamic_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u16 ip_n_to_1,
+ u32 *rseed_ip
+ )
+{
+
+ cnat_errno_t rv;
+ u16 port_available = 0;
+ i16 i;
+ cnat_portmap_v2_t *my_pm;
+
+ if((BULK_ALLOC_SIZE_NONE != bulk_size) /* bulk logging enabled */
+ && (udb)) { /* This user does have translations already */
+ u16 bulk_start;
+ i16 port_pos;
+
+ my_pm = pm + *index;
+ /* We have a case to check if bulk allocated ports can be used */
+ /* TODO: order of looping to be based on policy
+ * like prefer older or prefer newer ??
+ * For now, start with most recent cache entry
+ * so that we stand a better chance of
+ * finding a port
+ */
+ for(i= 0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(PREDICT_TRUE((udb->bulk_port_range_cache[i] ==
+ (i16)BULK_RANGE_INVALID) ||
+ IS_CACHE_ENTRY_FULL(udb->bulk_port_range_cache[i]))) {
+ continue; /* This range is not initialized yet or it is full */
+ }
+ bulk_start = CACHE_TO_PORT(udb->bulk_port_range_cache[i],
+ bulk_size);
+ port_pos = get_suiting_port_pos_from_range(my_pm,
+ bulk_start, bulk_size, pair_type);
+ if(PREDICT_FALSE(port_pos < 0)) {
+ /* Mark this cache entry as full so that we do not
+ * waste time on this entry again */
+ MARK_CACHE_ENTRY_AS_FULL(udb->bulk_port_range_cache[i]);
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("Marked bulk cache entry %d as full for %x \n",
+ i, my_pm->ipv4_address);
+#endif /* #ifdef DEBUG_BULK_PORT */
+ continue;
+ }
+ /* Get the port number */
+ port_available = bulk_start + port_pos;
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT(
+ "Found port from cache : IP 0x%x, port %d, %d iterations\n",
+ my_pm->ipv4_address, port_available, i);
+#endif
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_cache_hit_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+ break;
+ } /* end of for loop for cache check */
+ /* If we have not found a port yet, check if we can use a
+ * pre-allocated bulk port from the non-cache pool */
+ if(PREDICT_FALSE(i == BULK_RANGE_CACHE_SIZE)) {
+ if( try_bulk_port_from_non_cache(udb, my_pm, pair_type,
+ bulk_size, &port_available,
+ static_port_range) != CNAT_SUCCESS ) {
+ goto ALLOCATE_NEW_BULK;
+ }
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("Found port from non-cache : IP 0x%x, port %d\n",
+ my_pm->ipv4_address, port_available);
+#endif
+ }
+ /* Assign the port, mark it as in use */
+ cgn_clib_bitmap_clear_no_check(my_pm->bm, port_available);
+ (my_pm->inuse)++;
+ if(PREDICT_FALSE(pair_type == PORT_PAIR)) {/* Mark the next one too */
+ cgn_clib_bitmap_clear_no_check(my_pm->bm, port_available + 1);
+ (my_pm->inuse)++;
+ }
+ *o_ipv4_address = my_pm->ipv4_address;
+ *o_port = port_available;
+ *nfv9_log_req = CACHE_ALLOC_NO_LOG_REQUIRED;
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_use_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+ return (CNAT_SUCCESS);
+ }
+ALLOCATE_NEW_BULK:
+#ifdef DEBUG_BULK_PORT
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ PLATFORM_DEBUG_PRINT(
+ "No port available from bulk cache, bulk size %d\n", bulk_size);
+ }
+#endif
+ /* For whatever reason, we have not got a port yet */
+ rv = cnat_dynamic_port_alloc_v2(pm, atype, pair_type, index,
+ o_ipv4_address, o_port, static_port_range, bulk_size, nfv9_log_req,
+ ip_n_to_1, rseed_ip);
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ return rv;
+ }
+ /* Take care of caching */
+ if(PREDICT_FALSE(udb != NULL)) {
+ /* Predict false because we usually allocate for new users */
+ cnat_update_bulk_range_cache(udb, *o_port, bulk_size);
+ }
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_alloc_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+ return (CNAT_SUCCESS);
+}
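+
+/*
+ * Illustrative caller sketch, not from the original patch: how a data
+ * path node might invoke the bulk allocator above. PORT_ALLOC_ANY and
+ * PORT_SINGLE are assumed members of port_alloc_t / port_pair_t (only
+ * PORT_PAIR is visible in this file). Guarded out with #if 0.
+ */
+#if 0
+static cnat_errno_t example_bulk_alloc (cnat_vrfmap_t *my_vrfmap,
+                                        cnat_user_db_entry_t *udb,
+                                        u32 *rseed_ip)
+{
+    u32 index = 0;        /* in/out: portmap index to start from */
+    u32 o_ip;             /* out: allocated outside IPv4 address */
+    u16 o_port;           /* out: allocated outside port */
+    int nfv9_log_req;     /* out: whether this alloc must be logged */
+
+    cnat_errno_t rv = cnat_dynamic_port_alloc_v2_bulk(
+        my_vrfmap->portmap_list, PORT_ALLOC_ANY, PORT_SINGLE,
+        &index, &o_ip, &o_port, cnat_static_port_range, udb,
+        BULKSIZE_FROM_VRFMAP(my_vrfmap), &nfv9_log_req,
+        0 /* ip_n_to_1 */, rseed_ip);
+    /* On CNAT_SUCCESS, (o_ip, o_port) is usable; log only if
+     * nfv9_log_req is not CACHE_ALLOC_NO_LOG_REQUIRED. */
+    return rv;
+}
+#endif /* illustrative sketch */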
+
+
+cnat_errno_t
+cnat_static_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 i_ipv4_address,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u16 ip_n_to_1
+ )
+{
+
+ /***
+ * Requirements -
+ * 1. If the port allocated is below the dynamic start, it should be
+ * an individual port (not bulk)
+ * 2. If NOT, it should be bulk allocated
+ * 3. Try to keep the inside port the same as the outside port in
+ * both cases (best effort)
+
+ * Algorithm
+ * 1. Check if the port is below the static port start, the user is new,
+ * or bulk is disabled. If so, call the existing function.
+ * 2. If not, see if we can pick from a bulk and still keep the port the
+ * same - the difficult part - check if the port is free, then check if
+ * the entire bulk is free; if not, check if the bulk is already owned
+ * by the user. If all of these fail, call the existing function to
+ * allocate a new bulk.
+ * 3. Update the cache etc. and return the logging requirements.
+ *****/
+
+ cnat_errno_t rv;
+ i16 i;
+ u32 head;
+ cnat_portmap_v2_t *my_pm;
+ uword bit_test_result, start_bit;
+ cnat_main_db_entry_t *db = NULL;
+
+ if((BULK_ALLOC_SIZE_NONE != bulk_size) /* bulk logging enabled */
+ && (udb) && /* This user does have translations already */
+ i_port >= static_port_range ) { /* It is outside stat port range*/
+
+ my_pm = pm + *index;
+ /* We have a case to check if bulk allocated ports can be used */
+
+ /* First check if the required port is available. */
+ if(PREDICT_FALSE(clib_bitmap_get_no_check(my_pm->bm, i_port) == 0)) {
+ goto ALLOCATE_NEW_BULK_STATIC;
+ }
+
+ /* Port is free.. check if the bulk is also free */
+ start_bit = (i_port / bulk_size) * bulk_size;
+ bit_test_result = cgn_clib_bitmap_check_if_all(my_pm->bm,
+ start_bit, bulk_size);
+ if(PREDICT_TRUE(bit_test_result)) { /* bulk is available, grab it */
+ goto ALLOCATE_NEW_BULK_STATIC;
+ }
+
+ /* else, bulk is taken by someone. check if it is me */
+ /* Check if we own the bulk by any chance */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(udb->bulk_port_range_cache[i] == start_bit) break;
+ }
+ if(i == BULK_RANGE_CACHE_SIZE) { /* no luck with cache */
+ head = udb->translation_list_head_index;
+ if(PREDICT_FALSE(head == EMPTY))
+ goto ALLOCATE_NEW_BULK_STATIC;
+ db = cnat_main_db + head;
+ i = 0;
+ while(1) {
+ if((db->out2in_key.k.port/bulk_size) * bulk_size == start_bit) {
+ i = 1; /* Just to indicate it is found */
+ break;
+ }
+ db = cnat_main_db + db->user_ports.next;
+ /*
+ * it's a circular list, so if we have reached the head again
+ * all the entries for that user have been read
+ */
+ if (db == (cnat_main_db + head)) break;
+ } /* while loop for db entries */
+ if(!i) {
+ goto ALLOCATE_NEW_BULK_STATIC;
+ }
+ }
+ /* Assign the port, mark it as in use */
+ cgn_clib_bitmap_clear_no_check(my_pm->bm, i_port);
+ (my_pm->inuse)++;
+ *o_ipv4_address = my_pm->ipv4_address;
+ *o_port = i_port;
+ *nfv9_log_req = CACHE_ALLOC_NO_LOG_REQUIRED;
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_use_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("%s, %d, found stat port from bulk: %x, %d\n",
+ __func__,
+ __LINE__, *o_ipv4_address, *o_port);
+#endif /* DEBUG_BULK_PORT */
+ return (CNAT_SUCCESS);
+ }
+
+ALLOCATE_NEW_BULK_STATIC:
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("%s No port available from bulk cache, bulk size %d\n",
+ __func__,bulk_size);
+#endif
+ /* For whatever reason, we have not got a port yet */
+ rv = cnat_static_port_alloc_v2(pm, atype, pair_type, i_ipv4_address,
+ i_port, index, o_ipv4_address, o_port, static_port_range,
+ bulk_size, nfv9_log_req,ip_n_to_1);
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ return rv;
+ }
+ /* Take care of caching only if it was a bulk alloc */
+ if(PREDICT_FALSE(udb && (BULK_ALLOC_NOT_ATTEMPTED != *nfv9_log_req))) {
+ cnat_update_bulk_range_cache(udb, *o_port, bulk_size);
+ }
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_alloc_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+ return (CNAT_SUCCESS);
+
+}
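+
+/*
+ * Illustrative sketch, not from the original patch: the "is the whole
+ * bulk free" test used above, spelled out on its own. It assumes
+ * cgn_clib_bitmap_check_if_all() returns non-zero when every bit in
+ * [start_bit, start_bit + bulk_size) is set, i.e. every port is free.
+ */
+#if 0
+static int bulk_is_entirely_free (cnat_portmap_v2_t *my_pm, u16 port,
+                                  bulk_alloc_size_t bulk_size)
+{
+    uword start_bit = (port / bulk_size) * bulk_size; /* bulk base port */
+    return cgn_clib_bitmap_check_if_all(my_pm->bm, start_bit, bulk_size);
+}
+#endif /* illustrative sketch */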
+
+cnat_errno_t
+cnat_mapped_static_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ u32 *index,
+ u32 ipv4_address,
+ u16 port,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u16 ip_n_to_1
+ )
+{
+ /* Requirements :
+ * 1. Check if bulk allocation is required.
+ * 2. Call cnat_mapped_static_port_alloc_v2 to allocate
+ * 3. Decide if alloc has to be cached
+ * 4. Update nfv9_log_req
+ */
+ cnat_errno_t rv;
+ rv = cnat_mapped_static_port_alloc_v2 (pm,
+ atype, index, ipv4_address, port, nfv9_log_req, bulk_size, ip_n_to_1);
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ return rv;
+ }
+ /* Take care of caching only if it was a bulk alloc */
+ if(PREDICT_FALSE(udb && (BULK_ALLOC_NOT_ATTEMPTED != *nfv9_log_req))) {
+ int i;
+ port = port/bulk_size;
+ port = port*bulk_size; /* align it to the bulk size boundary */
+ for(i=0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(CACHE_ENTRY_WITHOUT_FULL_STAT(udb->bulk_port_range_cache[i])
+ == PORT_TO_CACHE(port, bulk_size))
+ break;
+ }
+ if(i == BULK_RANGE_CACHE_SIZE) { /* else, it is already in the cache */
+ cnat_update_bulk_range_cache(udb, port, bulk_size);
+ }
+ }
+#ifdef HAVE_BULK_PORT_STATS
+ mapped_port_alloc_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+ return (CNAT_SUCCESS);
+}
+
+
+cnat_errno_t
+cnat_dynamic_port_alloc_rtsp_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u32 *rseed_ip)
+{
+
+ /***
+ * Algorithm
+ * 1. Compute the range of ports required based on the number of digits
+ * in the port request made by the client.
+ * 2. Check if bulk logging is enabled. If not, use the existing method.
+ * 3. Check if there are 2 adjacent ports available that meet the above
+ * criteria in any of the bulk allocations made already.
+ * 4. If yes, mark them in use and return.
+ * 5. If not, allocate a new bulk and pick 2 ports in it.
+ ***/
+
+ i16 i;
+ cnat_portmap_v2_t *my_pm = 0;
+ u32 start_port1, end_port1, start_port2, end_port2;
+ int range_loop;
+ u16 bulk_start;
+ i16 port_pos;
+ u16 port_available = 0;
+
+ ASSERT(index);
+ ASSERT(o_ipv4_address);
+ ASSERT(o_port);
+
+ /*
+ * Check if the port is 4-digit or 5-digit. We assume we do not get
+ * 3 (or 2 or 1) digit ports, for which we could not allocate
+ * same-sized outside ports anyway, as outside ports start from 1024.
+ *
+ * Static ports have their own reserved range. Ensure that the range
+ * leaves at least a few 4-digit ports available for RTSP. If not,
+ * it does not make sense to do special allocation for RTSP.
+ */
+ if (PREDICT_TRUE(static_port_range < MIN_STATIC_PORT_RANGE_FOR_RTSP)) {
+ /*
+ * 4 digit port or less
+ */
+ if (i_port <= 9999) {
+ start_port1 = static_port_range;
+ end_port1 = 9999;
+
+ start_port2 = 10000;
+ end_port2 = PORTS_PER_ADDR - 1;
+ } else { /* 5 digit port */
+ start_port1 = 10000;
+ end_port1 = PORTS_PER_ADDR - 1;
+
+ start_port2 = static_port_range;
+ end_port2 = 9999;
+ }
+ } else { /* Static port range is too big */
+ start_port1 = static_port_range;
+ end_port1 = PORTS_PER_ADDR - 1;
+
+ /*
+ * PORTS_PER_ADDR is just a placeholder for
+ * INVALID_PORT, valid ports are between 1 and PORTS_PER_ADDR
+ */
+ start_port2 = PORTS_PER_ADDR;
+ end_port2 = PORTS_PER_ADDR;
+ }
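+
+ /*
+ * Worked example (illustrative, assuming PORTS_PER_ADDR is 65536):
+ * with static_port_range 1024, a 4-digit request such as 6970 gives
+ * range 1 = [1024, 9999] and range 2 = [10000, 65535]; a 5-digit
+ * request such as 48000 swaps the two. The preferred outside port
+ * thus keeps the same number of digits as the inside port, which
+ * avoids resizing the RTSP payload when the port is rewritten.
+ */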
+
+
+ if(PREDICT_TRUE(udb != NULL)) {
+ my_pm = pm + *index;
+ }
+
+ /* Now check if this user already owns a bulk range that is
+ * within the range currently being tried (range 1 first, then range 2)
+ */
+
+ u32 start_range = start_port1;
+ u32 end_range = end_port1;
+ for(range_loop = 0; range_loop < 2; range_loop++) {
+ if((BULK_ALLOC_SIZE_NONE == bulk_size) || (!udb)) {
+ goto ALLOCATE_NEW_RTSP_PORTS;
+ }
+ for(i= 0; i < BULK_RANGE_CACHE_SIZE; i++) {
+ if(PREDICT_TRUE((udb->bulk_port_range_cache[i] ==
+ (i16)BULK_RANGE_INVALID) ||
+ IS_CACHE_ENTRY_FULL(udb->bulk_port_range_cache[i]))) {
+ continue; /* This range is not initialized yet or it is full */
+ }
+
+ bulk_start = CACHE_TO_PORT(udb->bulk_port_range_cache[i],
+ bulk_size);
+ if(bulk_start < start_range || bulk_start >= end_range) {
+ continue; /* Not in the range */
+ }
+
+ port_pos = get_suiting_port_pos_from_range(my_pm,
+ bulk_start, bulk_size, pair_type);
+ if(PREDICT_FALSE(port_pos < 0)) {
+ /* Not Marking this cache entry as full as it failed
+ * for pair type. It might have individual entries
+ */
+ continue;
+ }
+ /* Get the port number */
+ port_available = bulk_start + port_pos;
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT(
+ "Found port from cache : IP 0x%x, port %d, %d iterations\n",
+ my_pm->ipv4_address, port_available, i);
+#endif
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_cache_hit_count += 2;
+#endif /* HAVE_BULK_PORT_STATS */
+ break;
+ } /* end of for loop for cache check */
+
+ if(PREDICT_FALSE(i == BULK_RANGE_CACHE_SIZE)) {
+ /* We have not found a port yet, but we do not want to try
+ * non-cache bulks, because the probability of success is very
+ * low and we do not want to tweak that code for this special
+ * case. The impact of not checking the non-cache bulks is that
+ * we may give this user a few extra ports, which is OK.
+ */
+ goto ALLOCATE_NEW_RTSP_PORTS;
+ }
+#ifdef DEBUG_BULK_PORT
+ PLATFORM_DEBUG_PRINT("RTSP: Found port from non-cache : IP 0x%x, port %d\n",
+ my_pm->ipv4_address, port_available);
+#endif
+
+ /* Assign the port, mark it as in use */
+ cgn_clib_bitmap_clear_no_check(my_pm->bm, port_available);
+ (my_pm->inuse)++;
+ cgn_clib_bitmap_clear_no_check(my_pm->bm, port_available + 1);
+ (my_pm->inuse)++;
+
+ *o_ipv4_address = my_pm->ipv4_address;
+ *o_port = port_available;
+ *nfv9_log_req = CACHE_ALLOC_NO_LOG_REQUIRED;
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_use_count += 2;
+#endif /* HAVE_BULK_PORT_STATS */
+ return (CNAT_SUCCESS);
+
+ALLOCATE_NEW_RTSP_PORTS:
+ /* No luck. Let's try allocating a new bulk. */
+ if(PREDICT_TRUE(CNAT_SUCCESS == cnat_dynamic_port_alloc_rtsp
+ (pm, atype, pair_type,
+ start_range, end_range,index, o_ipv4_address,
+ o_port, bulk_size, nfv9_log_req,rseed_ip))) {
+ if(PREDICT_FALSE(udb &&
+ (BULK_ALLOC_NOT_ATTEMPTED != *nfv9_log_req))) {
+ cnat_update_bulk_range_cache(udb, *o_port, bulk_size);
+ }
+#ifdef HAVE_BULK_PORT_STATS
+ bulk_port_alloc_count++;
+#endif /* HAVE_BULK_PORT_STATS */
+ return CNAT_SUCCESS;
+ }
+
+ /* Could not allocate in range 1.. so move to range 2. */
+ start_range = start_port2;
+ end_range = end_port2;
+
+ }
+
+ return (CNAT_NOT_FOUND_DIRECT); /* if we are here, we could not get any ports */
+
+}
+
+#else /* Dummy definitions */
+void show_bulk_port_stats()
+{
+ PLATFORM_DEBUG_PRINT("\nBulk logging feature not included\n");
+}
+
+void clear_bulk_port_stats()
+{
+ PLATFORM_DEBUG_PRINT("\nBulk logging feature not included\n");
+}
+#endif /* NO_BULK_LOGGING */
diff --git a/vnet/vnet/vcgn/cnat_bulk_port.h b/vnet/vnet/vcgn/cnat_bulk_port.h
new file mode 100644
index 00000000000..3e48b9a7794
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_bulk_port.h
@@ -0,0 +1,157 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_bulk_port.h - bulk port allocation definitions
+ *
+ * Copyright (c) 2011-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_BULK_PORT_H__
+#define __CNAT_BULK_PORT_H__
+
+#ifndef NO_BULK_LOGGING
+#include "cnat_bulk_port_defs.h"
+
+cnat_errno_t
+cnat_dynamic_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u16 ip_n_to_1,
+ u32 *rseed_ip);
+
+void cnat_update_bulk_range_cache(cnat_user_db_entry_t *udb, u16 o_port,
+ bulk_alloc_size_t bulk_size);
+
+void cnat_port_free_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ int index,
+ port_pair_t ptype,
+ u16 base_port,
+ cnat_user_db_entry_t *udb,
+ u16 static_port_range,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req);
+
+cnat_errno_t cnat_static_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 i_ipv4_address,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u16 ip_n_to_1
+ );
+
+cnat_errno_t cnat_dynamic_port_alloc_rtsp_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u32 *rseed_ip);
+
+cnat_errno_t
+cnat_mapped_static_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ u32 *index,
+ u32 ipv4_address,
+ u16 port,
+ cnat_user_db_entry_t *udb,
+ bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req,
+ u16 ip_n_to_1
+ );
+
+#else /* NO_BULK_LOGGING */
+/* use older code */
+inline cnat_errno_t
+cnat_dynamic_port_alloc_v2_bulk (
+ cnat_vrfmap_t *vrf_map,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range,
+ u16 ip_n_to_1,
+ u32 *rseed_ip
+ )
+{
+ return cnat_dynamic_port_alloc_v2(vrf_map->portmap_list, atype,
+ pair_type, index, o_ipv4_address, o_port, static_port_range,
+ ip_n_to_1, rseed_ip);
+}
+
+inline void cnat_port_free_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ int index,
+ port_pair_t ptype,
+ u16 base_port,
+ cnat_user_db_entry_t *udb,
+ u16 static_port_range)
+{
+ return cnat_port_free_v2(pm, index, ptype, base_port,
+ static_port_range);
+}
+
+inline cnat_errno_t cnat_static_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 i_ipv4_address,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range)
+{
+ return cnat_static_port_alloc_v2 (pm, atype, pair_type,
+ i_ipv4_address, i_port, index, o_ipv4_address, o_port);
+}
+
+inline cnat_errno_t
+cnat_mapped_static_port_alloc_v2_bulk (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ u32 *index,
+ u32 ipv4_address,
+ u16 port)
+{
+ return cnat_mapped_static_port_alloc_v2(pm, atype, index,
+ ipv4_address, port);
+}
+
+#endif /* NO_BULK_LOGGING */
+#endif /* __CNAT_BULK_PORT_H__ */
diff --git a/vnet/vnet/vcgn/cnat_bulk_port_defs.h b/vnet/vnet/vcgn/cnat_bulk_port_defs.h
new file mode 100644
index 00000000000..edb47b0a8e1
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_bulk_port_defs.h
@@ -0,0 +1,57 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_bulk_port_defs.h - bulk port allocation definitions
+ *
+ * Copyright (c) 2011 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_BULK_PORT_DEFS_H__
+#define __CNAT_BULK_PORT_DEFS_H__
+
+
+#ifndef NO_BULK_LOGGING
+
+typedef enum {
+ BULK_ALLOC_SIZE_NONE = 1,
+ BULK_ALLOC_SIZE_16 = 16,
+ BULK_ALLOC_SIZE_32 = 32,
+ BULK_ALLOC_SIZE_64 = 64,
+ BULK_ALLOC_SIZE_128 = 128,
+ BULK_ALLOC_SIZE_256 = 256,
+ BULK_ALLOC_SIZE_512 = 512,
+ BULK_ALLOC_SIZE_1024 = 1024,
+ BULK_ALLOC_SIZE_2048 = 2048,
+ BULK_ALLOC_SIZE_4096 = 4096
+} bulk_alloc_size_t;
+
+/* #define DEBUG_BULK_PORT 1 TODO: remove this later */
+
+#define CACHE_ALLOC_NO_LOG_REQUIRED -1
+#define BULK_ALLOC_NOT_ATTEMPTED -2
+
+#define BULK_RANGE_INVALID 0xFFFF
+#define BULK_RANGE_CACHE_SIZE 4
+
+#define BULKSIZE_FROM_VRFMAP(vrfmap) ((vrfmap)->bulk_size)
+
+#define INIT_BULK_CACHE(udb) \
+ { \
+ int i; \
+ for(i = 0; i < BULK_RANGE_CACHE_SIZE; i++) \
+ (udb)->bulk_port_range_cache[i] = (i16)BULK_RANGE_INVALID; \
+ }
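+
+/*
+ * Illustrative note, not part of the original patch: PORT_TO_CACHE()
+ * and CACHE_TO_PORT() are used in cnat_bulk_port.c as inverses that
+ * map a port to its bulk index and a bulk index back to its base
+ * port. If they are defined along these lines (an assumption):
+ *
+ * #define PORT_TO_CACHE(p, s) ((p) / (s))
+ * #define CACHE_TO_PORT(c, s) ((u16)((c) * (s)))
+ *
+ * then with bulk_size 64, port 1050 belongs to bulk 16, whose base
+ * port is CACHE_TO_PORT(16, 64) == 1024, covering ports 1024-1087.
+ */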
+
+#endif /* NO_BULK_LOGGING */
+#endif /* __CNAT_BULK_PORT_DEFS_H__ */
diff --git a/vnet/vnet/vcgn/cnat_cli.h b/vnet/vnet/vcgn/cnat_cli.h
new file mode 100644
index 00000000000..e9d190a577a
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_cli.h
@@ -0,0 +1,206 @@
+/* *------------------------------------------------------------------
+ * cnat_cli.h - CLI definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_CLI_H__
+#define __CNAT_CLI_H__
+
+#include "cnat_config_api.h"
+#include "cnat_show_api.h"
+
+/* from iox cli error */
+typedef enum {
+ CNAT_SUCCESS = 0,
+ CNAT_NO_CONFIG,
+ CNAT_NO_VRF_RUN,
+ CNAT_NO_POOL_ANY,
+ CNAT_NO_PORT_ANY,
+#ifndef NO_BULK_LOGGING
+ CNAT_NO_PORT_FROM_BULK,
+ CNAT_NO_PRE_ALLOCATED_BULK_PORTS,
+#endif /* NO_BULK_LOGGING */
+ CNAT_BAD_INUSE_ANY,
+ CNAT_NOT_FOUND_ANY,
+ CNAT_INV_PORT_DIRECT,
+ CNAT_DEL_PORT_DIRECT,
+ CNAT_BAD_INUSE_DIRECT,
+ CNAT_NOT_FOUND_DIRECT,
+ CNAT_OUT_LIMIT,
+ CNAT_MAIN_DB_LIMIT,
+ CNAT_USER_DB_LIMIT,
+ CNAT_NOT_STATIC_PORT,
+ CNAT_BAD_STATIC_PORT_REQ,
+ CNAT_NOT_THIS_CORE,
+ CNAT_ERR_PARSER,
+ CNAT_ERR_INVALID_MSG_ID,
+ CNAT_ERR_INVALID_MSG_SIZE,
+ CNAT_ERR_INVALID_PAYLOAD_SIZE,
+ CNAT_ERR_BAD_TCP_UDP_PORT,
+ CNAT_ERR_BULK_SINGLE_FAILURE,
+ CNAT_ERR_XLAT_ID_INVALID,
+ CNAT_ERR_XLAT_V6_PREFIX_INVALID,
+ CNAT_ERR_XLAT_V4_PREFIX_INVALID,
+ CNAT_ERR_XLAT_TCP_MSS_INVALID,
+ CNAT_ERR_6RD_ID_INVALID,
+ CNAT_ERR_6RD_V4_TUNNEL_SRC_INVALID,
+ CNAT_ERR_6RD_V6_PREFIX_INVALID,
+ CNAT_ERR_6RD_V6_BR_UNICAST_INVALID,
+ CNAT_ERR_6RD_V4_PREFIX_MASK_LEN_INVALID,
+ CNAT_ERR_6RD_V4_SUFFIX_MASK_LEN_INVALID,
+ CNAT_ERR_6RD_V4_COMBO_MASK_LEN_INVALID,
+ CNAT_ERR_6RD_TUNNEL_MTU_INVALID,
+ CNAT_ERR_6RD_TUNNEL_TTL_INVALID,
+ CNAT_ERR_6RD_TUNNEL_TOS_INVALID,
+ CNAT_ERR_NAT64_NO_VRF_RUN,
+ CNAT_ERR_NAT64_ID_INVALID,
+ CNAT_ERR_NAT64_V6_PREFIX_INVALID,
+ CNAT_ERR_NAT64_V4_PREFIX_INVALID,
+ CNAT_ERR_NAT64_TCP_MSS_INVALID,
+#ifdef CGSE_DS_LITE
+ CNAT_ERR_DS_LITE_ID_INVALID,
+#endif /* CGSE_DS_LITE */
+ CNAT_ERR_NO_SESSION_DB,
+ CNAT_ERR_MAPE_ID_INVALID,
+ CNAT_ERR_MAX
+} cnat_errno_t;
+
+#define CNAT_TRUE 1
+#define CNAT_FALSE 0
+
+
+#define CNAT_DEBUG_NONE (0)
+#define CNAT_DEBUG_GLOBAL_ERR (1 << 0)
+#define CNAT_DEBUG_DROP_TCP (1 << 0)
+#define CNAT_DEBUG_DROP_UDP (1 << 1)
+#define CNAT_DEBUG_DROP_ICMP (1 << 2)
+#define CNAT_DEBUG_ERR_TCP (1 << 3)
+#define CNAT_DEBUG_ERR_UDP (1 << 4)
+#define CNAT_DEBUG_ERR_ICMP (1 << 5)
+#define CNAT_DEBUG_ERR_ALG (1 << 6)
+#define CNAT_DEBUG_GLOBAL_ALL (1 << 7)
+#define CNAT_DEBUG_FTP_ALG (1 << 8)
+
+
+
+#define CNAT_DEBUG_ALL 0x1FF /*all of above*/
+#define CNAT_DEBUG_ERR_ALL 0x38
+
+#define CNAT_DB_CLEAR_SPECIFIC (0)
+#define CNAT_DB_CLEAR_ALL (1 << 0)
+#define CNAT_DB_CLEAR_VRF (1 << 1)
+#define CNAT_DB_CLEAR_ADDR (1 << 2)
+#define CNAT_DB_CLEAR_PROTO (1 << 3)
+#define CNAT_DB_CLEAR_PORT (1 << 4)
+
+
+#define MAX_UIDX 0x3fff /* the max SVI app uidb index */
+/* address mask per core */
+#define ADDR_MASK_PER_CORE PLATFORM_ADDR_MASK_PER_CORE
+#define ADDR_MASK_PER_CORE_PER_PARTITION \
+ PLATFORM_ADDR_MASK_PER_CORE_PER_PARTITION
+
+#define MAX_CORES PLATFORM_MAX_CORES
+#define MAX_CORES_PER_PARTITION PLATFORM_MAX_CORES_PER_PARTITION
+
+/*
+ * Maximum pool size that is supported by platform
+ */
+#define CNAT_MAX_ADDR_POOL_SIZE PLATFORM_CNAT_MAX_ADDR_POOL_SIZE
+#define CNAT_MAX_ADDR_POOL_SIZE_PER_CORE \
+ (CNAT_MAX_ADDR_POOL_SIZE / MAX_CORES_PER_PARTITION)
+
+#define BOUNDARY_VALUE 256
+
+#define BOUNDARY_VALUE_MASK 0xff
+
+#define NUM_ADDR_IN_RANGE(range, value, instance) \
+ ((range / value) + ((instance % MAX_CORES_PER_PARTITION) < (range%value) ? 1 : 0))
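+
+/*
+ * Worked example (illustrative): NUM_ADDR_IN_RANGE splits `range`
+ * addresses across cores, giving the first (range % value) cores one
+ * extra address. With range = 10, value = 4 and 4 cores per
+ * partition, cores 0 and 1 get 10/4 + 1 = 3 addresses and cores 2
+ * and 3 get 2, so all 10 addresses are covered.
+ */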
+
+typedef enum {
+ CNAT_DEBUG_FLAGS_DUMP = 0,
+ CNAT_DEBUG_FLAG_UDP_INSIDE_CHECKSUM_DISABLE,
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_CHECKSUM_DISABLE,
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_PKT_DUMP_ENABLE,
+ CNAT_DEBUG_FLAG_UDP_INSIDE_PKT_DUMP_ENABLE,
+ CNAT_DEBUG_FLAG_ICMP_PKT_DUMP_ENABLE,
+ CNAT_DEBUG_FLAG_FRAG_PKT_DUMP_ENABLE,
+ CNAT_DEBUG_FLAG_CONFIG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_GLOBAL_DEBUG_ALL_ENABLE,
+ CNAT_DEBUG_FLAG_SUMMARY_STATS_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_SHOW_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_XLAT_CONFIG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_XLAT_DATA_PATH_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_TCP_LOGGING_ENABLE,
+ CNAT_DEBUG_FLAG_NFV9_LOGGING_DUMP_ENABLE,
+ CNAT_DEBUG_FLAG_SYSLOG_LOGGING_DUMP_ENABLE,
+ CNAT_DEBUG_SET_STATIC_PORT_RANGE,
+ CNAT_DEBUG_FLAG_V6RD_DATA_PATH_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_V6RD_CONFIG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_V6RD_DEFRAG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_NAT64_CONFIG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_NAT64_DATA_PATH_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_DSLITE_DP_ENABLE,
+ CNAT_DEBUG_FLAG_DSLITE_CONFIG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_CONFIG_PPTP_ENABLE = 24,
+ CNAT_DEBUG_FLAG_CONFIG_PCP_ENABLE = 25,
+ CNAT_DEBUG_FLAG_MAPE_CONFIG_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_MAPE_DATA_PATH_DEBUG_ENABLE,
+ CNAT_DEBUG_FLAG_MAX,
+} cnat_debug_variable_value;
+
+/*
+ * Don't use too small values for PATH MTU
+ */
+#define MIN_NFV9_PATH_MTU 100
+
+extern u32 global_debug_flag;
+extern u16 debug_i_vrf;
+extern u32 debug_i_flag;
+extern u32 debug_i_addr_start;
+extern u32 debug_i_addr_end;
+extern u16 debug_o_vrf;
+extern u32 debug_o_flag;
+extern u32 debug_o_addr_start;
+extern u32 debug_o_addr_end;
+extern u32 tcp_logging_enable_flag;
+extern u32 nfv9_logging_debug_flag;
+
+extern u32 udp_inside_checksum_disable;
+extern u32 udp_outside_checksum_disable;
+extern u32 udp_inside_packet_dump_enable;
+extern u32 udp_outside_packet_dump_enable;
+
+extern u32 icmp_debug_flag;
+extern u32 frag_debug_flag;
+
+extern u32 summary_stats_debug_flag;
+
+extern u32 config_debug_level;
+extern u32 show_debug_level;
+
+
+/* CLI API prototypes called from vcgn_classify.c */
+extern void cnat_nat44_add_vrf_map_t_handler(spp_api_cnat_v4_add_vrf_map_t *mp,
+ vlib_main_t *vm);
+extern void cnat_nat44_handle_show_stats(vlib_main_t *vm);
+extern void cnat_nat44_handle_show_config(vlib_main_t *vm);
+extern void cnat_nat44_set_protocol_timeout_value(u16 active,
+ u16 init, u8 *proto, u8 reset, vlib_main_t *vm);
+extern void cnat_v4_show_inside_entry_req_t_handler
+(spp_api_cnat_v4_show_inside_entry_req_t *mp, vlib_main_t *vm);
+
+#endif /* __CNAT_CLI_H__ */
diff --git a/vnet/vnet/vcgn/cnat_cli_handler.c b/vnet/vnet/vcgn/cnat_cli_handler.c
new file mode 100644
index 00000000000..d50f522a2e0
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_cli_handler.c
@@ -0,0 +1,947 @@
+/* *------------------------------------------------------------------
+ * cnat_cli_handler.c - CLI handler definitions
+ *
+ * Copyright (c) 2007-2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+#include <vppinfra/error.h>
+
+#include "cnat_db.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_cli.h"
+#include "cnat_logging.h"
+#include "cnat_syslog.h"
+#include "cnat_config_api.h"
+#include "cnat_show_api.h"
+#include "cnat_show_response.h"
+
+#include <arpa/inet.h>
+
+u32 show_debug_level = 0;
+
+u32
+cnat_get_vrfmap_nfv9_logging_index (u32 i_vrf_id)
+{
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info = 0;
+ u32 logging_index = EMPTY;
+
+ /*
+ * Start with global logging index if available
+ */
+ if (cnat_nfv9_global_info.cnat_nfv9_init_done) {
+ logging_index = cnat_nfv9_global_info.cnat_nfv9_global_collector_index;
+
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ if (my_nfv9_logging_info->i_vrf_id == i_vrf_id) {
+ logging_index = my_nfv9_logging_info -
+ cnat_nfv9_logging_info_pool;
+ break;
+ }
+ }));
+ }
+ return (logging_index);
+}
+
+u32
+cnat_get_vrfmap_syslog_logging_index (u32 i_vrf_id)
+{
+ cnat_syslog_logging_info_t *my_syslog_info = NULL;
+ u32 logging_index = EMPTY;
+
+ /*
+ * Start with global logging index if available
+ */
+ if(PREDICT_TRUE(cnat_syslog_global_info.cnat_syslog_init_done)) {
+
+ pool_foreach (my_syslog_info, cnat_syslog_logging_info_pool, ({
+ if (my_syslog_info->i_vrf_id == i_vrf_id) {
+ logging_index = my_syslog_info -
+ cnat_syslog_logging_info_pool;
+ break;
+ }
+ }));
+ }
+ return (logging_index);
+}
+
+void
+cnat_set_vrf_params_with_default(cnat_vrfmap_t *my_vrfmap, u32 i_vrf, u32 i_vrf_id)
+{
+
+ my_vrfmap->status = S_WAO;
+
+ my_vrfmap->i_vrf = i_vrf;
+ my_vrfmap->i_vrf_id = i_vrf_id;
+
+ my_vrfmap->o_vrf = INVALID_UIDX;
+ my_vrfmap->o_vrf_id = INVALID_VRFID;
+
+#ifndef NO_BULK_LOGGING
+ BULKSIZE_FROM_VRFMAP(my_vrfmap) = BULK_ALLOC_SIZE_NONE;
+#endif /* #ifndef NO_BULK_LOGGING */
+ my_vrfmap->tcp_mss = V4_TCP_MSS_NOT_CONFIGURED_VALUE;
+ my_vrfmap->frag_tout = CNAT_IPV4_FRAG_TIMEOUT_DEF;
+ my_vrfmap->port_limit = V4_DEF_VRF_MAX_PORTS;
+ my_vrfmap->nfv9_logging_index =
+ cnat_get_vrfmap_nfv9_logging_index(i_vrf_id);
+ my_vrfmap->syslog_logging_index =
+ cnat_get_vrfmap_syslog_logging_index(i_vrf_id);
+
+ /* Copy logging policy from nfv9 info. */
+ if(my_vrfmap->nfv9_logging_index != EMPTY) {
+ cnat_nfv9_logging_info_t *nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + my_vrfmap->nfv9_logging_index;
+ my_vrfmap->nf_logging_policy = nfv9_logging_info->logging_policy;
+ }
+ if(my_vrfmap->syslog_logging_index != EMPTY) {
+ cnat_syslog_logging_info_t *syslog_logging_info =
+ cnat_syslog_logging_info_pool + my_vrfmap->syslog_logging_index;
+ my_vrfmap->syslog_logging_policy = syslog_logging_info->logging_policy;
+ }
+ #if 0
+ printf("Initializing params in cnat_set_vrf_params_with_default\n"
+ "my_vrfmap->status = %u\n"
+ "my_vrfmap->tcp_mss = %u\n"
+ "my_vrfmap->i_vrf = %u\n"
+ "my_vrfmap->i_vrf_id = %u\n"
+ "my_vrfmap->o_vrf = %u\n"
+ "my_vrfmap->o_vrf_id = %u\n"
+ "my_vrfmap->bulk_size = %u\n"
+ "my_vrfmap->nfv9_logging_index = %u\n"
+ "my_vrfmap->syslog_logging_index = %u\n"
+ "my_vrfmap->frag_tout = %u\n"
+ "my_vrfmap->port_limit = %u\n"
+ "my_vrfmap->nf_logging_policy = %u\n"
+ "my_vrfmap->syslog_logging_policy = %u\n",
+ my_vrfmap->status,
+ my_vrfmap->tcp_mss,
+ my_vrfmap->i_vrf,
+ my_vrfmap->i_vrf_id,
+ my_vrfmap->o_vrf,
+ my_vrfmap->o_vrf_id,
+ my_vrfmap->bulk_size,
+ my_vrfmap->nfv9_logging_index,
+ my_vrfmap->syslog_logging_index,
+ my_vrfmap->frag_tout,
+ my_vrfmap->port_limit,
+ my_vrfmap->nf_logging_policy,
+ my_vrfmap->syslog_logging_policy);
+ #endif /* if 0 */
+}
+
+/* config command handlers */
+void cnat_nat44_add_vrf_map_t_handler(spp_api_cnat_v4_add_vrf_map_t *mp,
+ vlib_main_t *vm)
+{
+ void cnat_table_entry_fill_map(u32 start_addr, u32 end_addr,
+ cnat_portmap_v2_t **port_map_holder);
+ u32 start_addr, end_addr;
+ u32 pm_len __attribute__((unused));
+ cnat_vrfmap_t *my_vrfmap = 0;
+ cnat_portmap_v2_t *pm = 0;
+ u16 i_vrf, o_vrf;
+ u32 ivrf_id, ovrf_id;
+ u16 my_vrfmap_index;
+ u8 i = 0;
+
+ start_addr = mp->start_addr[0];
+ end_addr = mp->end_addr[0];
+ i_vrf = mp->i_vrf;
+ o_vrf = mp->o_vrf;
+ ovrf_id = mp->o_vrf_id;
+ ivrf_id = mp->i_vrf_id;
+
+#if DEBUG_NOT_COMMENTED
+ vlib_cli_output(vm, "%s: saddr[0x%x], eaddr[0x%x], i_vrf[0x%x], o_vrf[0x%x], "
+ "ovrf_id[0x%x], ivrf_id[0x%x]\n", __func__, start_addr, end_addr,
+ i_vrf, o_vrf, ovrf_id, ivrf_id);
+#endif
+ if (start_addr > end_addr) {
+ vlib_cli_output(vm, "Add VRF Map failed start addr 0x%x > end addr 0x%x\n",
+ start_addr, end_addr);
+ return;
+ }
+ if ((end_addr - start_addr) > CNAT_MAX_ADDR_POOL_SIZE) {
+ vlib_cli_output(vm, "Add VRF Map failed start addr 0x%x - end addr "
+ "0x%x range > 65536\n", start_addr, end_addr);
+ return;
+ }
+ my_vrfmap_index = vrf_map_array[i_vrf];
+
+ if (my_vrfmap_index != VRF_MAP_ENTRY_EMPTY) {
+
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+
+ my_vrfmap->o_vrf = o_vrf;
+ my_vrfmap->i_vrf_id = ivrf_id;
+ my_vrfmap->o_vrf_id = ovrf_id;
+ } else {
+ /*
+ * first time add
+ */
+ pool_get(cnat_map_by_vrf, my_vrfmap);
+ memset(my_vrfmap, 0, sizeof(*my_vrfmap));
+ /* waiting for outside vrf */
+ cnat_set_vrf_params_with_default(my_vrfmap, i_vrf, ivrf_id);
+ my_vrfmap->i_vrf = i_vrf;
+ my_vrfmap->o_vrf = o_vrf;
+ my_vrfmap->i_vrf_id = ivrf_id;
+ my_vrfmap->o_vrf_id = ovrf_id;
+#ifndef NO_BULK_LOGGING
+ BULKSIZE_FROM_VRFMAP(my_vrfmap) = BULK_ALLOC_SIZE_NONE;
+#endif /* #ifndef NO_BULK_LOGGING */
+
+ my_vrfmap->tcp_mss = V4_TCP_MSS_NOT_CONFIGURED_VALUE;
+ my_vrfmap->status = S_WA;
+ my_vrfmap->frag_tout = 0; /* currently setting it to 0 */
+ my_vrfmap->port_limit = V4_DEF_VRF_MAX_PORTS;
+ vrf_map_array[i_vrf] = (my_vrfmap - cnat_map_by_vrf);
+ }
+ pm = my_vrfmap->portmap_list;
+ pm_len = vec_len(pm);
+ for(i=0; i < 1 ; i++) {
+ start_addr = mp->start_addr[i];
+ end_addr = mp->end_addr[i];
+ if((start_addr == 0) || (end_addr == 0))
+ break;
+
+ cnat_table_entry_fill_map(start_addr, end_addr,
+ &(my_vrfmap->portmap_list));
+ }
+ my_vrfmap->status = S_RUN;
+ vlib_cli_output(vm, "Address Pool Config Successful !!\n");
+ return;
+}
+
+void cnat_nat44_set_protocol_timeout_value(u16 active,
+ u16 init, u8 *proto, u8 reset, vlib_main_t *vm)
+{
+ if (!strncmp((char *) proto, "tcp", 3)) {
+ tcp_initial_setup_timeout = (reset) ? V4_DEF_TCP_IS_TO : init;
+ tcp_active_timeout = (reset) ? V4_DEF_TCP_AS_TO : active;
+
+ } else if (!strncmp((char *) proto, "udp", 3)) {
+ udp_init_session_timeout = (reset) ? V4_DEF_UDP_IS_TO : init;
+ udp_act_session_timeout = (reset) ? V4_DEF_UDP_AS_TO : active;
+
+ } else if (!strncmp((char *) proto, "icmp", 4)) {
+ icmp_session_timeout = (reset) ? V4_DEF_ICMP_S_TO : active;
+
+ } else {
+ vlib_cli_output(vm, "Error !! Unsupported protocol %s\n", proto);
+ }
+ return;
+}
+
+
+
+
+/* Show command handlers */
+void cnat_nat44_handle_show_stats(vlib_main_t *vm)
+{
+ pool_header_t *h;
+ u32 used, free;
+ cnat_vrfmap_t *my_vrfmap =0;
+ cnat_portmap_v2_t *pm =0, *my_pm = 0;
+ u32 i, pm_len;
+ struct in_addr ip;
+ void cnat_nfv9_show_collector
+ (vlib_main_t *vm, cnat_nfv9_logging_info_t *my_nfv9_logging_info);
+
+ /* active translations */
+ h = pool_header(cnat_main_db);
+ free = vec_len(h->free_indices);
+ used = vec_len(cnat_main_db) - free;
+
+ vlib_cli_output(vm, "vCGN NAT44 Statistics :\n");
+ vlib_cli_output(vm, "\tActive Translations : %u\n",
+ NAT44_COMMON_STATS.active_translations);
+ vlib_cli_output(vm, "\tTotal free address : %u\n", free);
+ vlib_cli_output(vm, "\tTotal used address : %u\n", used);
+ vlib_cli_output(vm, "\ti2o drops due to port limit exceeded : %lu\n",
+ in2out_drops_port_limit_exceeded);
+ vlib_cli_output(vm, "\ti2o drops due to system limit reached : %lu\n",
+ in2out_drops_system_limit_reached);
+ vlib_cli_output(vm, "\ti2o drops due to resource depletion : %lu\n",
+ in2out_drops_resource_depletion);
+ vlib_cli_output(vm, "\ti2o drops due to no translations : %lu\n",
+ NAT44_COMMON_STATS.no_translation_entry_drops);
+
+ vlib_cli_output(vm, "\tPool address usage:\n");
+ vlib_cli_output(vm, "\t-------------------------------------------------\n");
+ vlib_cli_output(vm, "\tExternal Address \tPorts Used\n");
+ vlib_cli_output(vm, "\t-------------------------------------------------\n");
+
+ used = 0;
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ pm = my_vrfmap->portmap_list;
+ pm_len = vec_len(pm);
+ for (i = 0; i < pm_len; i++) {
+ my_pm = pm + i;
+ if (my_pm->inuse) {
+ used++;
+ /* maximum of 200 addresses to be returned */
+ if (used <= 200) {
+ ip.s_addr = ntohl(my_pm->ipv4_address);
+ vlib_cli_output(vm, "\t%s \t\t%u\n", inet_ntoa(ip), my_pm->inuse);
+ }
+ }
+ }
+ }));
+ return;
+}
+
+void cnat_nat44_handle_show_config(vlib_main_t *vm)
+{
+ cnat_vrfmap_t * my_vrfmap;
+ cnat_portmap_v2_t *pm = 0;
+ cnat_portmap_v2_t *my_pm = 0;
+ u32 pm_len;
+ struct in_addr ip_addr;
+ u8 status_str[20];
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info,
+ *global_nfv9_logging_info = 0;
+ void cnat_nfv9_show_collector
+ (vlib_main_t *vm, cnat_nfv9_logging_info_t *my_nfv9_logging_info);
+
+ vlib_cli_output(vm, "vCGN NAT44 Config:\n");
+ vlib_cli_output(vm, "\tPort Limit : %u\n", cnat_main_db_max_ports_per_user);
+ vlib_cli_output(vm, "\ttotal address pool : %u\n", total_address_pool_allocated);
+ vlib_cli_output(vm, "\tdynamic port start range : %u\n", cnat_static_port_range);
+
+ pool_foreach(my_vrfmap, cnat_map_by_vrf, ({
+ vlib_cli_output(vm, "\ti-intf-index : 0x%x\n", my_vrfmap->i_vrf);
+ vlib_cli_output(vm, "\to-intf-index : 0x%x\n", my_vrfmap->o_vrf);
+
+ memset(status_str, 0x00, sizeof(status_str));
+ switch(my_vrfmap->status) {
+ case S_WAO: memcpy(status_str, "S_WAO", 5); break;
+ case S_WA: memcpy(status_str, "S_WA", 4); break;
+ case S_WO: memcpy(status_str, "S_WO", 4); break;
+ case S_RUN: memcpy(status_str, "S_RUN", 5); break;
+ case S_DEL: memcpy(status_str, "S_DEL", 5); break;
+ default: memcpy(status_str, "Invalid state", 13);
+
+ }
+ vlib_cli_output(vm,
+ "\tvrf map table status : %s\n", status_str);
+
+ pm = my_vrfmap->portmap_list;
+ pm_len = vec_len(pm);
+ my_pm = pm;
+ ip_addr.s_addr = clib_net_to_host_u32(my_pm->ipv4_address);
+ vlib_cli_output(vm,
+ "\tStart Address : %s\n", inet_ntoa(ip_addr));
+ my_pm = pm + (pm_len - 1);
+ ip_addr.s_addr = clib_net_to_host_u32(my_pm->ipv4_address);
+ vlib_cli_output(vm,
+ "\tEnd Address : %s\n", inet_ntoa(ip_addr));
+
+ }));
+ vlib_cli_output(vm,
+ "\ttcp init timeout : %u sec\n", tcp_initial_setup_timeout);
+ vlib_cli_output(vm,
+ "\ttcp active timeout : %u sec\n", tcp_active_timeout);
+ vlib_cli_output(vm,
+ "\tudp init timeout : %u sec\n", udp_init_session_timeout);
+ vlib_cli_output(vm,
+ "\tudp active timeout : %u sec\n", udp_act_session_timeout);
+ vlib_cli_output(vm,
+ "\ticmp session timeout: %u sec\n", icmp_session_timeout);
+
+#if 0
+ if (cnat_nfv9_global_info.cnat_nfv9_global_collector_index != EMPTY) {
+ vlib_cli_output(vm,"\nGloabal NFV9 Collector :");
+ global_nfv9_logging_info = cnat_nfv9_logging_info_pool +
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index;
+ cnat_nfv9_show_collector(vm, global_nfv9_logging_info);
+ }
+#endif
+
+ vlib_cli_output(vm, "\nNFV9 Collector :");
+ if (cnat_nfv9_logging_info_pool !=NULL) {
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ if (my_nfv9_logging_info != global_nfv9_logging_info) {
+ cnat_nfv9_show_collector(vm, my_nfv9_logging_info);
+ vlib_cli_output(vm, "\n");
+ }
+ }));
+ } else {
+ vlib_cli_output(vm, "\n");
+ }
+
+ return;
+}
+
+/*
+ * Check if the request flag matches the entry flags and
+ * if so return "1"
+ *
+ * entry_flag_ptr is an output parameter - it returns the flags
+ * corresponding to the translation entry
+ */
+static u8 cnat_v4_show_verify_display_entry (
+ u16 request_flag,
+ cnat_main_db_entry_t *db,
+ u16 *entry_flag_ptr)
+{
+ u8 display_entry = 0;
+
+ /*
+ * This should never happen
+ */
+ if (!entry_flag_ptr) {
+ return (display_entry);
+ }
+
+ *entry_flag_ptr = 0;
+
+ if ((db->flags & CNAT_DB_FLAG_STATIC_PORT)
+ &&(db->flags & CNAT_DB_FLAG_ALG_ENTRY)) {
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_STATIC;
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_ALG;
+ } else if (db->flags & CNAT_DB_FLAG_STATIC_PORT) {
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_STATIC;
+ } else if ((db->flags & CNAT_DB_FLAG_ALG_ENTRY) ||
+ (db->flags & CNAT_DB_FLAG_PPTP_GRE_ENTRY)) {
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_ALG;
+ } else if (db->flags & CNAT_DB_FLAG_PCPI) {
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_PCPI_DYNAMIC;
+ } else if (db->flags & CNAT_DB_FLAG_PCPE) {
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_PCPE_DYNAMIC;
+ } else {
+ *entry_flag_ptr |= CNAT_TRANSLATION_ENTRY_DYNAMIC;
+ }
+
+ if (request_flag == CNAT_TRANSLATION_ENTRY_ALL) {
+ display_entry = 1;
+ } else {
+ /*
+ * Check if the request_flag is STATIC or ALG
+ * and the entry is STATIC or ALG as well
+ */
+ if ((request_flag & CNAT_TRANSLATION_ENTRY_STATIC) &&
+ (*entry_flag_ptr & CNAT_TRANSLATION_ENTRY_STATIC)) {
+ display_entry = 1;
+ }
+
+ if ((request_flag & CNAT_TRANSLATION_ENTRY_ALG) &&
+ (*entry_flag_ptr & CNAT_TRANSLATION_ENTRY_ALG)) {
+ display_entry = 1;
+ }
+
+ if ((request_flag & CNAT_TRANSLATION_ENTRY_PCPI_DYNAMIC) &&
+ (*entry_flag_ptr & CNAT_TRANSLATION_ENTRY_PCPI_DYNAMIC)) {
+ display_entry = 1;
+ }
+
+ if ((request_flag & CNAT_TRANSLATION_ENTRY_PCPE_DYNAMIC) &&
+ (*entry_flag_ptr & CNAT_TRANSLATION_ENTRY_PCPE_DYNAMIC)) {
+ display_entry = 1;
+ }
+
+ /*
+ * For dynamic entry case, check if flags field is 0
+ */
+ if ((request_flag & CNAT_TRANSLATION_ENTRY_DYNAMIC) &&
+ (*entry_flag_ptr & CNAT_TRANSLATION_ENTRY_DYNAMIC)) {
+ display_entry = 1;
+ }
+ }
+
+ if (PREDICT_FALSE(show_debug_level > 2)) {
+ PLATFORM_DEBUG_PRINT("Entry (0x%x, %d) -> (0x%x, %d) request_flag 0x%x, entry_flag 0x%x, display_entry %d\n", db->in2out_key.k.ipv4, db->in2out_key.k.port, db->out2in_key.k.ipv4, db->out2in_key.k.port, request_flag, *entry_flag_ptr, display_entry);
+ }
+
+ return (display_entry);
+}
+void cnat_v4_show_inside_entry_req_t_handler
+(spp_api_cnat_v4_show_inside_entry_req_t *mp, vlib_main_t * vm)
+{
+ cnat_user_db_entry_t *udb = NULL;
+ cnat_main_db_entry_t *db = NULL;
+ cnat_db_key_bucket_t u_ki, ki;
+ u64 a, b, c;
+ u32 index;
+ u16 start_port, end_port, port;
+ u16 request_flag = 0;
+ u16 entry_flag = 0;
+ u8 num_entries = 0;
+ u8 proto, all;
+ u8 done = 0;
+ cnat_v4_show_translation_entry *entry_list;
+ cnat_v4_show_translation_entry entry[PLATFORM_MAX_TRANSLATION_ENTRIES];
+ u8 display_entry;
+ u8 flag_str[11];
+
+ ki.k.k.ipv4 = mp->ipv4_addr;
+ ki.k.k.vrf = mp->vrf_id;
+ start_port = mp->start_port;
+ end_port = mp->end_port;
+ //memset(flag_str,0x00,11);
+ //strncpy(flag_str,"NA",2);
+#if DEBUG
+ vlib_cli_output(vm, "## proto %d, inside-addr 0x%x, start_port %u, "
+ "end_port %u, vrf 0x%x, flag 0x%x\n",
+ mp->protocol,
+ mp->ipv4_addr,
+ mp->start_port,
+ mp->end_port,
+ mp->vrf_id,
+ mp->flags);
+#endif
+
+ proto = mp->protocol;
+ ki.k.k.vrf |= ((u16)proto << CNAT_PRO_SHIFT);
+
+ all = mp->all_entries; /* for no port range case */
+ request_flag = mp->flags; /* for all, alg, static entries case */
+ entry_list = entry;
+
+ /*
+ * check if the address is belonging to this core
+ */
+
+
+ /*
+ * first we check if the user exists in the udb, if he is not then
+ * it does not make sense to check the main db for translations
+ */
+ u_ki.k.k.vrf = ki.k.k.vrf & CNAT_VRF_MASK;
+ u_ki.k.k.ipv4 = ki.k.k.ipv4;
+ u_ki.k.k.port = 0;
+
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ vlib_cli_output(vm, "\nI_TRANS_CORE %d: IPv4 0x%x, VRF 0x%x, "
+ "start_port %d, end_port %d",
+ my_instance_number, ki.k.k.ipv4,
+ ki.k.k.vrf, start_port, end_port);
+ }
+
+ udb = cnat_user_db_lookup_entry(&u_ki);
+ if (!udb) {
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ vlib_cli_output(vm, "\nReturning %d entries",
+ num_entries);
+ }
+ return;
+ }
+
+ if (all) {
+ #if 0
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ PLATFORM_DEBUG_PRINT("\nI_TRANS: Printing ALL\n");
+ }
+
+ /*
+ * get the head of list of translation entries for that user
+ * from the user db
+ */
+ head = udb->translation_list_head_index;
+ db = cnat_main_db + head;
+
+ while (num_entries < PLATFORM_MAX_TRANSLATION_ENTRIES) {
+
+ if (((db->in2out_key.k.vrf & CNAT_PRO_MASK) >> CNAT_PRO_SHIFT)
+ != proto) {
+ goto next_entry;
+ }
+
+ display_entry =
+ spp_api_cnat_v4_show_verify_display_entry(request_flag, db,
+ &entry_flag);
+
+ if (display_entry) {
+ entry_list->ipv4_addr =
+ spp_host_to_net_byte_order_32(db->out2in_key.k.ipv4);
+ entry_list->cnat_port =
+ spp_host_to_net_byte_order_16(db->out2in_key.k.port);
+ entry_list->src_port =
+ spp_host_to_net_byte_order_16(db->in2out_key.k.port);
+
+ entry_list->protocol = proto;
+
+ /* in case of GRE, in2out is not accounted */
+ if(proto != CNAT_PPTP) {
+
+ entry_list->in2out_packets =
+ spp_host_to_net_byte_order_32(db->in2out_pkts);
+ } else {
+ entry_list->in2out_packets = 0;
+ }
+ entry_list->out2in_packets =
+ spp_host_to_net_byte_order_32(db->out2in_pkts);
+
+ entry_list->flags =
+ spp_host_to_net_byte_order_16(entry_flag);
+
+ num_entries++;
+ entry_list = entry_list + 1;
+ }
+next_entry:
+ db = cnat_main_db + db->user_ports.next;
+ /*
+ * it's a circular list, so if we have reached the head again
+ * all the entries for that user have been read
+ */
+ if (db == (cnat_main_db + head)) {
+ break;
+ }
+ }
+ resp->num_entries = num_entries;
+ #endif /* if 0 */
+ } else {
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ vlib_cli_output(vm, "\nI_TRANS: Printing range %d .. %d\n",
+ start_port, end_port);
+ }
+ /*
+ * port range is specified so for each port calculate the hash and
+ * check if the entry is present in main db
+ */
+ port = start_port;
+ done = 0;
+ while ((!done) && (num_entries < PLATFORM_MAX_TRANSLATION_ENTRIES)) {
+
+ ki.k.k.port = port;
+ if (port >= end_port) {
+ done = 1;
+ } else {
+ port++;
+ }
+ CNAT_V4_GET_HASH(ki.k.key64,
+ ki.bucket,
+ CNAT_MAIN_HASH_MASK);
+ index = cnat_in2out_hash[ki.bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ continue;
+ }
+
+ do {
+ db = cnat_main_db + index;
+ if (db->in2out_key.key64 == ki.k.key64) {
+ break;
+ }
+ index = db->in2out_hash.next;
+ } while (index != EMPTY);
+
+ if (index == EMPTY) {
+ continue;
+ } else {
+
+ display_entry =
+ cnat_v4_show_verify_display_entry(request_flag, db,
+ &entry_flag);
+ if (display_entry) {
+
+ entry_list->ipv4_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+ entry_list->cnat_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+ entry_list->src_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ entry_list->protocol = proto;
+ entry_list->nsessions = db->nsessions;
+ entry_list->flags = ((db->flags & CNAT_DB_FLAG_TCP_ACTIVE) ||
+ (db->flags & CNAT_DB_FLAG_UDP_ACTIVE)) ? 1:0;
+ /* in case of GRE, in2out is not accounted */
+ if(proto != CNAT_PPTP) {
+ entry_list->in2out_packets =
+ clib_host_to_net_u32(db->in2out_pkts);
+ } else {
+ entry_list->in2out_packets = 0;
+ }
+
+ entry_list->out2in_packets =
+ clib_host_to_net_u32(db->out2in_pkts);
+
+ if (PREDICT_FALSE(show_debug_level > 3)) {
+ vlib_cli_output(vm, "\n1. Entry: Addr 0x%x, port %d, num_entries %d",
+ clib_net_to_host_u32(entry_list->ipv4_addr),
+ clib_net_to_host_u16(entry_list->cnat_port),
+ num_entries);
+ }
+
+ entry_list = entry_list + 1;
+ num_entries++;
+ }
+ } /* if (index == EMPTY) */
+ } /* while() */
+ }
+
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ if (num_entries) {
+ vlib_cli_output(vm, "\nReturning %d entries\n",
+ num_entries);
+ }
+ }
+
+ entry_list = entry;
+ u8 i = 0;
+ struct in_addr ip;
+ u8 proto_str[10];
+ u8 transl_str[10];
+ memset(proto_str, 0x00, 10);
+ memset(transl_str, 0x00, 10);
+
+ if (proto == 1) strncpy((char *)proto_str, "udp", 3);
+ else if (proto == 2) strncpy((char *)proto_str, "tcp", 3);
+ else if (proto == 3) strncpy((char *)proto_str, "icmp", 4);
+ else strncpy((char *)proto_str, "unknown", 7);
+
+ if (request_flag == 0x04) strncpy((char *)transl_str, "Dynamic", 7);
+ else strncpy((char *)transl_str, "Unknown", 7); /* currently we are not supporting static/alg entries */
+
+ ip.s_addr = clib_net_to_host_u32(u_ki.k.k.ipv4);
+
+ vlib_cli_output (vm, "Inside-translation details\n");
+ vlib_cli_output (vm, "--------------------------\n");
+
+ vlib_cli_output (vm, "Inside interface index : 0x%x\n", u_ki.k.k.vrf);
+ vlib_cli_output (vm, "Inside address : %s\n", inet_ntoa(ip));
+ vlib_cli_output (vm, "Start port : %u\n", start_port);
+ vlib_cli_output (vm, "End port : %u\n", end_port);
+
+ vlib_cli_output (vm, "--------------------------------------------------------------------------------------"
+ "-----------------------\n");
+ vlib_cli_output (vm, "Outside Protocol Inside Outside Translation"
+ " I2O O2I Flag Num\n");
+ vlib_cli_output (vm, "Address Src Port Src Port Type "
+ " Pkts Pkts Sessions\n");
+ vlib_cli_output (vm, "--------------------------------------------------------------------------------------"
+ "-----------------------\n");
+
+ while ((num_entries) && (entry_list) && (i < 50)) {
+
+ ip.s_addr = entry_list->ipv4_addr;
+ memset(flag_str,0x00,11);
+ if((proto == 1) || (proto == 2)) {
+ if(entry_list->flags == 1) {
+ strncpy((char *)flag_str,"Active",6);
+ }
+ else {
+ strncpy((char *) flag_str,"Non Active",10);
+ }
+ } else {
+ strncpy((char *) flag_str, "NA", 2);
+ }
+ vlib_cli_output(vm, "%s %10s %11u %12u %13s %10u %10u %14s %6u\n",
+ inet_ntoa(ip), proto_str,
+ clib_net_to_host_u16(entry_list->src_port),
+ clib_net_to_host_u16(entry_list->cnat_port),
+ transl_str,
+ clib_net_to_host_u32(entry_list->in2out_packets),
+ clib_net_to_host_u32(entry_list->out2in_packets),
+ flag_str,
+ entry_list->nsessions);
+ entry_list++;
+ num_entries--; i++;
+ }
+
+ return;
+}
+
+void cnat_v4_show_outside_entry_req_t_handler
+(spp_api_cnat_v4_show_outside_entry_req_t *mp, vlib_main_t *vm)
+{
+ cnat_main_db_entry_t *db = NULL;
+ cnat_db_key_bucket_t ko;
+ u64 a, b, c;
+ u32 index;
+ u16 start_port, end_port, port;
+ u16 request_flag = 0;
+ u16 entry_flag = 0;
+ u8 num_entries = 0;
+ u8 proto;
+ cnat_v4_show_translation_entry *entry_list;
+ cnat_v4_show_translation_entry entry[PLATFORM_MAX_TRANSLATION_ENTRIES];
+ u8 done = 0;
+ u8 display_entry;
+ u8 flag_str[11];
+
+ ko.k.k.ipv4 = mp->ipv4_addr;
+ ko.k.k.vrf = mp->vrf_id;
+ start_port = mp->start_port;
+ end_port = mp->end_port;
+
+ proto = mp->protocol;
+ request_flag = mp->flags;
+
+ ko.k.k.vrf |= ((u16)proto << CNAT_PRO_SHIFT);
+
+ entry_list = entry;
+
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ vlib_cli_output(vm, "\nO_TRANS_CORE %d: IPv4 0x%x, VRF 0x%x, "
+ "start_port %d, end_port %d", my_instance_number,
+ ko.k.k.ipv4, ko.k.k.vrf, start_port, end_port);
+ }
+
+ /*
+ * for each ip and port combination we need to scan the main db
+ * and check if the entry is present in main db
+ */
+ port = start_port;
+ done = 0;
+ while ((!done) && (num_entries < PLATFORM_MAX_TRANSLATION_ENTRIES)) {
+ ko.k.k.port = port;
+
+ /*
+ * If we have reached the end_port, we are DONE
+ */
+ if (port >= end_port) {
+ done = 1;
+ } else {
+ port++;
+ }
+
+ CNAT_V4_GET_HASH(ko.k.key64,
+ ko.bucket,
+ CNAT_MAIN_HASH_MASK);
+
+ index = cnat_out2in_hash[ko.bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ continue;
+ }
+
+ do {
+ db = cnat_main_db + index;
+ if (db->out2in_key.key64 == ko.k.key64) {
+ break;
+ }
+ index = db->out2in_hash.next;
+ } while (index != EMPTY);
+
+ if (index == EMPTY) {
+ continue;
+ } else {
+ display_entry =
+ cnat_v4_show_verify_display_entry(request_flag, db,
+ &entry_flag);
+
+ if (display_entry) {
+ entry_list->ipv4_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ entry_list->cnat_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+ entry_list->src_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+ entry_list->protocol = proto;
+ entry_list->nsessions = db->nsessions;
+ entry_list->flags = ((db->flags & CNAT_DB_FLAG_TCP_ACTIVE) ||
+ (db->flags & CNAT_DB_FLAG_UDP_ACTIVE)) ? 1:0;
+ /* in case of GRE, in2out is not accounted */
+ if(proto != CNAT_PPTP) {
+ entry_list->in2out_packets =
+ clib_host_to_net_u32(db->in2out_pkts);
+ } else {
+ entry_list->in2out_packets = 0 ;
+ }
+ entry_list->out2in_packets =
+ clib_host_to_net_u32(db->out2in_pkts);
+ #if 0
+ entry_list->flags =
+ clib_host_to_net_u16(entry_flag);
+ #endif
+ entry_list = entry_list + 1;
+ num_entries++;
+ }
+ }
+ }
+
+ if (num_entries == 0) {
+ /* No point proceeding further */
+ return;
+ }
+
+ if (PREDICT_FALSE(show_debug_level > 0)) {
+ if (num_entries) {
+ vlib_cli_output(vm, "\nO_TRANS: Core %d returning %d entries",
+ num_entries);
+ }
+ }
+
+ entry_list = entry;
+ u8 i = 0;
+ struct in_addr ip;
+ u8 proto_str[10];
+ u8 transl_str[10];
+ memset(proto_str, 0x00, 10);
+ memset(transl_str, 0x00, 10);
+
+ if (proto == 1) strncpy((char *) proto_str, "udp", 3);
+ else if (proto == 2) strncpy((char *) proto_str, "tcp", 3);
+ else if (proto == 3) strncpy((char *) proto_str, "icmp", 4);
+ else strncpy((char *) proto_str, "unknown", 7);
+
+ if (request_flag == 0x04) strncpy((char *) transl_str, "Dynamic", 7);
+ else strncpy((char *)transl_str, "Unknown", 7); /* currently we are not supporting static/alg entries */
+
+ ip.s_addr = clib_net_to_host_u32(ko.k.k.ipv4);
+
+ vlib_cli_output (vm, "Outside-translation details\n");
+ vlib_cli_output (vm, "--------------------------\n");
+
+ vlib_cli_output (vm, "Outside interface index : 0x%x\n", (ko.k.k.vrf & CNAT_VRF_MASK));
+ vlib_cli_output (vm, "Outside address : %s\n", inet_ntoa(ip));
+ vlib_cli_output (vm, "Start port : %u\n", start_port);
+ vlib_cli_output (vm, "End port : %u\n", end_port);
+
+ vlib_cli_output (vm, "--------------------------------------------------------------------------------------"
+ "-----------------------\n");
+ vlib_cli_output (vm, "Inside Protocol Outside Inside Translation"
+ " I2O O2I Flag Num\n");
+ vlib_cli_output (vm, "Address Dst Port Dst Port Type "
+ " Pkts Pkts Sessions\n");
+ vlib_cli_output (vm, "--------------------------------------------------------------------------------------"
+ "-----------------------\n");
+
+ while ((num_entries) && (entry_list) && (i < 50)) {
+ ip.s_addr = entry_list->ipv4_addr;
+ memset(flag_str,0x00,11);
+ if((proto == 1) || (proto == 2)) {
+ if(entry_list->flags == 1) {
+ strncpy((char *) flag_str,"Active",6);
+ }
+ else {
+ strncpy((char *) flag_str,"Non Active",10);
+ }
+ } else {
+ strncpy((char *) flag_str, "NA", 2);
+ }
+ vlib_cli_output(vm, "%s %10s %11u %12u %13s %10u %10u %14s %6u\n",
+ inet_ntoa(ip), proto_str,
+ clib_net_to_host_u16(entry_list->cnat_port),
+ clib_net_to_host_u16(entry_list->src_port),
+ transl_str,
+ clib_net_to_host_u32(entry_list->in2out_packets),
+ clib_net_to_host_u32(entry_list->out2in_packets),
+ flag_str,
+ entry_list->nsessions);
+ entry_list++;
+ num_entries--; i++;
+
+ }
+ return;
+}
diff --git a/vnet/vnet/vcgn/cnat_common_api.h b/vnet/vnet/vcgn/cnat_common_api.h
new file mode 100644
index 00000000000..a4eb74432f2
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_common_api.h
@@ -0,0 +1,22 @@
+/*---------------------------------------------------------------------------
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#ifndef __CNAT_COMMON_API_H__
+#define __CNAT_COMMON_API_H__
+
+/* All common API prototypes */
+void cnat_scanner_db_process_turn_on(vlib_main_t *vm);
+
+#endif
diff --git a/vnet/vnet/vcgn/cnat_config.c b/vnet/vnet/vcgn/cnat_config.c
new file mode 100644
index 00000000000..87183dfa961
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_config.c
@@ -0,0 +1,77 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_config.c - configuration definitions
+ *
+ * Copyright (c) 2007-2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include "cnat_config.h"
+#include "cnat_cli.h"
+#include "cnat_v4_pptp_alg.h"
+#include "platform_common.h"
+
+/* session timeout */
+
+u16 tcp_initial_setup_timeout = V4_DEF_TCP_IS_TO; /* sec */
+u16 tcp_active_timeout = V4_DEF_TCP_AS_TO; /* sec */
+u16 udp_init_session_timeout = V4_DEF_UDP_IS_TO; /* 30 sec */
+u16 udp_act_session_timeout = V4_DEF_UDP_AS_TO; /* 2 min */
+u16 icmp_session_timeout = V4_DEF_ICMP_S_TO; /* 60 sec */
+
+cnat_pptp_config_t pptp_cfg =
+ {
+ .enable = PPTP_DISABLED,
+ .timeout = PPTP_GRE_TIMEOUT
+ } ;
+
+/* This flag indicates timeout-related config changes;
+ * when set, the db needs to be updated accordingly
+ */
+u8 timeout_dirty_flag = 0;
+
+/* mapping refresh direction,
+ * 0 outbound-only refresh,
+ * 1 inbound and outbound refresh
+ */
+u8 mapping_refresh_both_direction = V4_DEF_ENABLE;
+
+u16 cnat_main_db_max_ports_per_user = V4_DEF_MAX_PORTS;
+
+u32 cnat_main_db_icmp_rate_limit = DEF_RATE_LIMIT;
+u32 cnat_main_db_icmp_rate_limit_core = DEF_RATE_LIMIT_CORE;
+u32 crc_zero_udp_rate_limit_core = RATE_LIMIT_UDP_CORE;
+u16 cnat_static_port_range = CNAT_DEF_STATIC_PORT_RANGE;
+
+
+/*
+ * ftp alg enable
+ */
+u8 ftp_alg_enabled = V4_DEF_DISABLE;
+u16 rtsp_alg_port_num = 0;
+
+/*
+ * load balancing debug mode
+ */
+u8 lb_debug_enable = V4_DEF_DISABLE;
+
+
+/* good or evil mode
+ * 0 endpoint-independent filter, good mode
+ * 1 address-dependent filter, evil mode
+ */
+u8 address_dependent_filtering = V4_DEF_DISABLE;
+
+u16 per_user_icmp_msg_limit = ICMP_MSG_RATE_LIMIT;
+
+u16 config_delete_timeout = V4_CONFIG_DELETE_TO;
+
diff --git a/vnet/vnet/vcgn/cnat_config.h b/vnet/vnet/vcgn/cnat_config.h
new file mode 100644
index 00000000000..f104273716f
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_config.h
@@ -0,0 +1,582 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_config.h - configuration database definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_CONFIG_H__
+#define __CNAT_CONFIG_H__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+#include "cnat_bulk_port_defs.h"
+
+/* default policy value */
+#define V4_DEF_ICMP_S_TO 60 /*icmp session timeout */
+#define V4_DEF_UDP_IS_TO 30 /*udp init session timeout */
+#define V4_DEF_UDP_AS_TO 120 /*udp active session timeout */
+#define V4_DEF_TCP_IS_TO 120 /*tcp init session timeout */
+#define V4_DEF_TCP_AS_TO 1800 /*tcp active session timeout, 30 min */
+#define V4_DEF_TCP_MSS 1460 /*tcp mss */
+#define V4_DEF_MAX_PORTS 100 /*max port limit per user */
+#define DEF_RATE_LIMIT PLATFORM_MAX_CORES /* number of icmp packets generated per sec */
+#define DEF_RATE_LIMIT_CORE 1 /* number of icmp packets generated per sec (per core) */
+#define RATE_LIMIT_UDP_CORE 1000 /* max allowed udp crc-zero packets per sec per core */
+
+#define NAT44_RESERVED_INST_ID 1
+#define DSLITE_START_ID (NAT44_RESERVED_INST_ID + 1)
+#define V4_DEF_VRF_MAX_PORTS 0 /* max port limit per vrf user;
+ 0 means use the global port limit for the user */
+/* Hardcoded. TBD: can be made configurable */
+
+#define V4_DEF_ENABLE 1 /* feature enable */
+#define V4_DEF_DISABLE 0 /* feature disable */
+
+#define CNAT_DEF_STATIC_PORT_RANGE 1024 /* Default range for static ports */
+/*
+ * If TCP MSS is not configured, store the maximum possible value
+ */
+#define V4_TCP_MSS_NOT_CONFIGURED_VALUE 0xffff
+
+/* default timeout for fragments in seconds, set to 2
+ * in case it's not configured
+ */
+#define CNAT_IPV4_FRAG_TIMEOUT_DEF 2
+/* other */
+/* max db entries to be scanned */
+#define MAX_DB_ENTRY_PER_SCAN PLATFORM_MAX_DB_ENTRY_PER_SCAN
+/* max db entries selected per scan */
+#define MAX_DB_ENTRY_SELECTED_PER_SCAN PLATFORM_MAX_DB_ENTRY_SELECTED_PER_SCAN
+
+#define ICMP_MSG_RATE_LIMIT 3 /* rate limit for icmp message */
+#define V4_CONFIG_DELETE_TO 600 /* timeout for entry to be deleted */
+
+/* session timeout */
+
+extern u16 tcp_initial_setup_timeout;
+extern u16 tcp_active_timeout;
+extern u16 udp_init_session_timeout;
+extern u16 udp_act_session_timeout;
+extern u16 icmp_session_timeout;
+
+extern u8 timeout_dirty_flag;
+
+/* mapping refresh direction,
+ * 0 outbound only refresh,
+ * 1 inbound and outbound refresh
+ */
+extern u8 mapping_refresh_both_direction;
+
+
+extern u16 cnat_main_db_max_ports_per_user;
+extern u32 cnat_main_db_icmp_rate_limit;
+extern u32 cnat_main_db_icmp_rate_limit_core;
+extern u32 crc_zero_udp_rate_limit_core;
+
+extern u16 cnat_static_port_range;
+
+typedef enum {
+ LOG_FORMAT_UNDEFINED =0,
+ LOG_FORMAT_COMPACT,
+ LOG_FORMAT_NF9,
+ LOG_FORMAT_MAX, /* keep this as last */
+} log_format_t;
+
+typedef enum {
+ CNAT_CONFIG_DEL_OP = 0,
+ CNAT_CONFIG_ADD_OP,
+} cnat_op_flag_t;
+
+extern u8 ftp_alg_enabled;
+extern u16 rtsp_alg_port_num;
+
+/*
+ * load balancing debug mode
+ */
+extern u8 lb_debug_enable;
+
+/* good or evil mode
+ * 0 endpoint-independent filter, good mode
+ * 1 address-dependent filter, evil mode
+ */
+extern u8 address_dependent_filtering;
+
+extern u16 per_user_icmp_msg_limit;
+
+/* vrfmap or portmap holding time
+ * after delete
+ */
+extern u16 config_delete_timeout;
+
+/*
+ * Bit map for various configuration in the POLICY KNOB case
+ */
+#define BIDIR_REFRESH_ENABLE 0x01
+#define BIDIR_REFRESH_DISABLE 0x02
+#define FTP_ALG_ENABLE 0x04
+#define FTP_ALG_DISABLE 0x08
+#define DEFAULT_NFV9_LOGGING_SERVER_ENABLE 0x10
+#define DEFAULT_NFV9_LOGGING_SERVER_DISABLE 0x20
+
+
+/*
+ * This structure contains a single VRF map configuration
+ * from a bulk message. This structure is in conformance
+ * with the following structures defined in cnat_config_api.h
+ * - spp_api_cnat_v4_bulk_vrf_map_t
+ *
+ * Any change in the above structures should be propagated here
+ */
+typedef struct _spp_api_cnat_v4_single_vrf_map_req {
+ u32 i_vrf_id;
+ u32 o_vrf_id;
+
+ u16 i_vrf;
+ u16 o_vrf;
+
+ u32 start_addr;
+ u32 end_addr;
+
+ u16 vrf_policy_enable;
+#define TCP_MSS_ENABLE 0x0001
+#define TCP_MSS_DISABLE 0x0002
+#define NFV9_LOGGING_ENABLE 0x0004
+#define NFV9_LOGGING_DISABLE 0x0008
+#define VRF_MAP_DELETE 0x0010
+#define VRF_MAP_ADD 0x0020
+#define BULK_ALLOC_CHANGE 0x0040
+
+ u16 tcp_mss_value;
+ u32 vrf_nfv9_logging_ipv4_address;
+ u16 vrf_nfv9_logging_udp_port;
+ u16 vrf_nfv9_refresh_rate;
+ u16 vrf_nfv9_timeout_rate;
+ u16 vrf_nfv9_path_mtu;
+#ifndef NO_BULK_LOGGING
+ bulk_alloc_size_t bulk_size;
+#endif /* NO_BULK_LOGGING */
+} spp_api_cnat_v4_single_vrf_map_req;
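+
+/*
+ * Editor's illustrative note (not part of the original commit): the
+ * vrf_policy_enable field above is a bit map of the flags defined
+ * inside the structure, so a request that adds a VRF map and enables
+ * NFV9 logging in one shot would be populated as:
+ *
+ *     spp_api_cnat_v4_single_vrf_map_req req;
+ *     req.vrf_policy_enable = VRF_MAP_ADD | NFV9_LOGGING_ENABLE;
+ */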
+
+typedef struct _spp_api_cnat_v4_single_vrf_map_rc {
+ u8 vrf_map_rc;
+ u8 tcp_mss_rc;
+ u8 nfv9_logging_rc;
+ u8 pad;
+} spp_api_cnat_v4_single_vrf_map_rc;
+
+/*
+ * Bulk Response for the VRF map request
+ */
+typedef struct _spp_api_cnat_v4_bulk_vrf_map_resp {
+ u16 _spp_msg_id;
+ u8 bulk_rc;
+ u8 pad;
+
+ u32 num_vrfmap_entries;
+
+ spp_api_cnat_v4_single_vrf_map_rc vrf_map_rc;
+
+} spp_api_cnat_v4_bulk_vrf_map_resp;
+
+/*
+ * Bulk Response for the Policy Knob request
+ */
+typedef struct _spp_api_cnat_v4_bulk_policy_knob_resp {
+ u16 _spp_msg_id;
+ u8 bulk_rc; /* Global rc code */
+ u8 pad;
+
+ u8 port_limit_rc;
+ u8 icmp_timeout_rc;
+ u8 udp_init_timeout_rc;
+ u8 udp_act_timeout_rc;
+
+ u8 tcp_init_timeout_rc;
+ u8 tcp_act_timeout_rc;
+ u8 nfv9_logging_rc;
+ u8 pad2;
+} spp_api_cnat_v4_bulk_policy_knob_resp;
+
+
+/* PPTP ALG defs and structures */
+
+/* don't change the order;
+ maintained at offsets mapped to msg ids */
+
+typedef struct pptp_ctrl_msg_ctrs_t {
+ u64 dummy;
+ u64 sccr;
+ u64 sccrp;
+ u64 stccrq;
+ u64 stccrp;
+ u64 erq;
+ u64 erp;
+ u64 ocrq;
+ u64 ocrp;
+ u64 icrq;
+ u64 icrp;
+ u64 iccn;
+ u64 cclr;
+ u64 cdn;
+ u64 wen;
+ u64 sli;
+}pptp_ctrl_msg_ctrs_t;
+
+#define PPTP_INCR(ctr) pptp_cfg.counters.pptp_##ctr++
+#define PPTP_DECR(ctr) pptp_cfg.counters.pptp_##ctr--
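+
+/*
+ * Editor's note (not part of the original commit): these macros paste
+ * the "pptp_" prefix onto the counter name, so, for example,
+ * PPTP_INCR(active_tunnels) expands to
+ * pptp_cfg.counters.pptp_active_tunnels++.
+ */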
+
+typedef struct pptp_counters_t {
+
+ u64 pptp_ctrl_msg_drops;
+ u64 pptp_active_tunnels;
+ u64 pptp_active_channels;
+ u64 pptp_in2out_gre_drops;
+ u64 pptp_out2in_gre_drops;
+ u64 pptp_in2out_gre_fwds;
+ u64 pptp_out2in_gre_fwds;
+ pptp_ctrl_msg_ctrs_t ctrl_ctrs;
+
+} pptp_counters_t;
+
+#define CNAT_PPTP_ENABLE 1
+#define CNAT_PPTP_DEF_TIMEOUT 60 /* secs */
+
+typedef struct cnat_pptp_config_t {
+ u8 enable;
+ u16 timeout;
+ pptp_counters_t counters;
+
+} cnat_pptp_config_t;
+
+
+#define CNAT_PPTP_ENABLE_FLAG 0x01
+#define CNAT_PPTP_TIMEOUT_FLAG 0x02
+
+/* pptp config msg resp */
+typedef struct _spp_api_cnat_v4_config_pptp_alg_resp {
+ u16 _spp_msg_id;
+ u8 bulk_rc;
+ u8 pad;
+
+} spp_api_cnat_v4_config_pptp_alg_resp_t;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad[5];
+
+ /* better to have a grouped structure rather than individual
+ variables; any change in the counters will automatically
+ be reflected here */
+ pptp_counters_t counters;
+} pptp_show_counters_resp_t ;
+
+
+extern cnat_pptp_config_t pptp_cfg;
+
+
+/* ========= 6RD declarations =============================== */
+
+#define V6RD_ENTRY_DELETE 0x00
+#define IPV4_TUNNEL_SRC_CONFIG 0x04
+#define TUNNEL_MTU_CONFIG 0x08
+#define IPV4_PREFIXMASK_LEN_CONFIG 0x10
+#define IPV4_SUFFIXMASK_LEN_CONFIG 0x20
+#define TTL_CONFIG 0x40
+#define TOS_CONFIG 0x80
+#define V6RD_IPV6_PREFIX_CONFIG 0x100
+#define V6RD_RESET_DF_BIT_CONFIG 0x200
+#define V6RD_UNICAST_ADDR_CONFIG 0x400
+#define V6RD_REASSEMB_CONFIG 0x800
+
+#define TTL_ENABLE 0x1
+#define TOS_ENABLE 0x2
+#define RESET_DF_BIT 0x4
+#define REASSEMBLY_ENABLE 0x8
+
+/* ========= 6RD declarations =============================== */
+
+/*
+ * Single Request for XLAT config
+ */
+typedef struct _spp_api_cnat_v4_single_xlat_config_req {
+
+ /*
+ * Indicates the xlat instance id. How big will this value be?
+ * If we can restrict it to 0..255, the APP code
+ * can use an array to store the xlat instances.
+ */
+ u32 xlat_id;
+
+#define XLAT_ENTRY_DELETE 0x0000
+#define IPV6_SVI_IF_NUM_CONFIG 0x0001
+#define IPV4_SVI_IF_NUM_CONFIG 0x0002
+#define IPV4_TO_IPV6_TCP_MSS_CONFIG 0x0004
+#define IPV6_TO_IPV4_TCP_MSS_CONFIG 0x0008
+#define IPV6_PREFIX_CONFIG 0x0010
+#define IPV6_UBIT_ON_CONFIG 0x0020
+#define IPV6_NON_TRANSLATABLE_PREFIX_MAP_CONFIG 0x0040
+#define IPV4_TOS_SETTING_CONFIG 0x0080
+#define IPV6_TOS_SETTING_CONFIG 0x0100
+#define IPV4_DFBIT_CLEAR_CONFIG 0x0200
+#define ICMPV6_PTB_MTU_SET_CONFIG 0x0400
+#define IPV6_NON_TRANSLATABLE_PREFIX_MAP_ALG_CONFIG 0x0800
+#define CPE_V4_PREFIX_CONFIG 0x1000 /* for map-t */
+#define CPE_V6_PREFIX_CONFIG 0x2000 /* for map-t */
+#define EXTERNAL_V6_PREFIX_CONFIG 0x4000 /* for map-t */
+#define PORT_SHARING_RATIO_CONFIG 0x8000 /* for map-t */
+#define CONSECUTIVE_PORTS_CONFIG 0x10000 /* for map-t */
+
+ u32 xlat_config_fields_enable;
+
+ /*
+ * IF (interface) numbers of the IPv6 and IPv4 SVI interfaces
+ */
+ u32 ipv6_svi_if_num;
+ u32 ipv4_svi_if_num;
+
+ /*
+ * TCP MSS values for the 2 XLAT directions
+ */
+ u16 v4_to_v6_tcp_mss;
+ u16 v6_to_v4_tcp_mss;
+
+ /*
+ * XLAT IPv6 prefix
+ */
+ u32 v6_prefix[4];
+
+ /*
+ * XLAT IPv6 prefix mask
+ */
+ u8 v6_prefix_mask_len;
+
+ /*
+ * Set to non-zero if UBITs are reserved
+ */
+#define UBITS_ON 0x01
+#define IPV4_DF_BIT_CLEAR 0x02
+#define ICMPV6_MTU_SET 0x04
+#define IPV4_TOS_SET_ENABLED 0x08
+#define IPV6_TC_SET_ENABLED 0x10
+
+ u8 feature_enable_bits;
+
+ u8 v4_prefix_mask_len;
+
+#define IPV6_NON_TRANSLATABLE_PREFIX_MAP_ALG_HASH 0x1
+#define IPV6_NON_TRANSLATABLE_PREFIX_MAP_ALG_TTL 0x2
+#define IPV6_NON_TRANSLATABLE_PREFIX_MAP_ALG_RANDOM 0x3
+ u8 non_translatable_v6_prefix_v4_map_prefix_alg;
+
+ u8 ipv6_tos_value;
+
+ u8 ipv4_tos_value;
+
+ u8 pad2;
+
+ u8 pad3;
+
+ u32 v4_prefix;
+
+ /*
+ * MAP-T/MAP-E specific parameters
+ */
+ u8 xlat_type;
+
+ u32 cpe_domain_v6_prefix[4];
+ u8 cpe_domain_v6_prefix_len;
+
+ u32 cpe_domain_v4_prefix;
+ u8 cpe_domain_v4_prefix_len;
+
+ u32 external_domain_v6_prefix[4];
+ u8 external_domain_v6_prefix_len;
+
+ u8 port_sharing_ratio_bits;
+ u8 consecutive_ports_bits;
+
+} spp_api_cnat_v4_single_xlat_config_req;
+
+/*
+ * Single Response for the xlat config request
+ */
+typedef struct _spp_api_cnat_v4_single_xlat_config_resp {
+ u8 v4_if_num_rc;
+ u8 v6_if_num_rc;
+ u8 v4_to_v6_tcp_mss_rc;
+ u8 v6_to_v4_tcp_mss_rc;
+
+ u8 v6_prefix_rc;
+ u8 ubit_on_rc;
+ u8 v4_prefix_rc;
+ u8 xlat_id_rc;
+
+ u8 non_translatable_v6_prefix_v4_map_alg_rc;
+ u8 ipv4_dfbit_clear_rc;
+ u8 icmpv6_ptb_mtu_set_rc;
+ u8 ipv4_tos_set_rc;
+
+ u8 ipv6_tos_set_rc;
+ u8 pad1;
+ u8 pad2;
+ u8 pad3;
+} spp_api_cnat_v4_single_xlat_config_resp;
+
+/*
+ * Bulk Response for the xlat config request
+ */
+typedef struct _spp_api_cnat_v4_bulk_xlat_config_resp {
+ u16 _spp_msg_id;
+ u16 pad;
+
+ u32 bulk_rc;
+
+ u32 num_xlat_entries;
+
+ spp_api_cnat_v4_single_xlat_config_resp xlat_config_resp;
+
+} spp_api_cnat_v4_bulk_xlat_config_resp;
+
+typedef struct _spp_api_v6rd_v4_single_v6rd_config_resp {
+ u8 v6rd_id_rc;
+ u8 v4_if_num_rc;
+ u8 v6_if_num_rc;
+ u8 tunnel_source_rc;
+ u8 tunnel_mtu_rc;
+ u8 ipv4masklen_prefix_rc;
+ u8 ipv4masklen_suffix_rc;
+ u8 ttl_rc;
+ u8 tos_rc;
+ u8 anycast_rc;
+ u8 v6_prefix_rc;
+ u8 v6_br_unicast_rc;
+ u8 reassembly_rc;
+ u8 pad1;
+ u8 pad2;
+ u8 pad3;
+} spp_api_v6rd_v4_single_v6rd_config_resp_t;
+
+typedef struct _spp_api_v6rd_v4_bulk_v6rd_config_resp {
+ u16 _spp_msg_id;
+ u16 pad;
+ u32 bulk_rc;
+ u32 num_v6rd_entries;
+ spp_api_v6rd_v4_single_v6rd_config_resp_t v6rd_config_resp[0];
+} spp_api_v6rd_v4_bulk_v6rd_config_resp_t;
+
+/*
+ * Single Request for MAPE config
+ */
+typedef struct _spp_api_mape_single_config_req {
+
+ /*
+ * Indicates the mape instance id. How big will this value be?
+ * If we can restrict it to 0..255, the APP code
+ * can use an array to store the mape instances.
+ */
+ u32 mape_id;
+
+#define MAPE_ENTRY_DELETE 0x0000
+#define MAPE_IPV4_SVI_IF_NUM_CONFIG 0x0001
+#define MAPE_IPV6_SVI_IF_NUM_CONFIG 0x0002
+#define MAPE_IPV4_TO_IPV6_TCP_MSS_CONFIG 0x0004
+#define MAPE_IPV6_TO_IPV4_TCP_MSS_CONFIG 0x0008
+#define MAPE_CPE_V4_PREFIX_CONFIG 0x0010
+#define MAPE_CPE_V6_PREFIX_CONFIG 0x0020
+#define MAPE_PORT_SHARING_RATIO_CONFIG 0x0040
+#define MAPE_CONSECUTIVE_PORTS_CONFIG 0x0080
+#define MAPE_PATH_MTU 0x0100
+#define MAPE_TUNNEL_ENDPOINT_V6_CONFIG 0x0200
+
+ u32 mape_config_fields_enable;
+
+ /*
+ * IF (interface) numbers of the IPv6 and IPv4 SVI interfaces
+ */
+ u32 ipv6_svi_if_num;
+ u32 ipv4_svi_if_num;
+
+ /*
+ * TCP MSS values for the 2 XLAT directions
+ */
+ u16 v4_to_v6_tcp_mss;
+ u16 v6_to_v4_tcp_mss;
+
+ /*
+ * Path v6 MTU.
+ */
+ u32 path_mtu;
+
+ /*
+ * CPE IPv6 prefix and mask len.
+ */
+ u32 cpe_domain_v6_prefix[4];
+ u8 cpe_domain_v6_prefix_len;
+
+ /*
+ * CPE IPv4 prefix and mask len.
+ */
+ u32 cpe_domain_v4_prefix;
+ u8 cpe_domain_v4_prefix_len;
+
+ /*
+ * BR IPv6 tunnel end point V6 prefix and mask len.
+ */
+ u32 aftr_tunnel_endpoint_address_v6[4];
+ u8 aftr_tunnel_endpoint_address_v6_len;
+
+ /*
+ * Port sharing ratio and consecutive ports bits.
+ */
+ u8 port_sharing_ratio_bits;
+ u8 consecutive_ports_bits;
+
+} spp_api_mape_single_config_req;
+
+
+/*
+ * Single Response for the mape config response
+ */
+typedef struct _spp_api_mape_single_config_resp {
+ u8 v4_if_num_rc;
+ u8 v6_if_num_rc;
+ u8 v4_to_v6_tcp_mss_rc;
+ u8 v6_to_v4_tcp_mss_rc;
+ u8 mape_id_rc;
+ u8 path_mtu_rc;
+ u8 cpe_v6_prefix_rc;
+ u8 cpe_v4_prefix_rc;
+ u8 tunnel_endpoint_prefix_rc;
+ u8 port_sharing_ratio_rc;
+ u8 port_contiguous_rc;
+ u8 pad1;
+} spp_api_mape_single_config_resp;
+
+/*
+ * Bulk Response for the mape config request
+ */
+typedef struct _spp_api_mape_bulk_config_resp {
+ u16 _spp_msg_id;
+ u16 pad;
+ u32 bulk_rc;
+ u32 num_mape_entries;
+ spp_api_mape_single_config_resp mape_config_resp;
+} spp_api_mape_bulk_config_resp;
+
+
+#endif /* __CNAT_CONFIG_H__ */
diff --git a/vnet/vnet/vcgn/cnat_config_api.h b/vnet/vnet/vcgn/cnat_config_api.h
new file mode 100644
index 00000000000..0789d6a92af
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_config_api.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CNAT_CONFIG_API_H__
+#define __CNAT_CONFIG_API_H__
+
+typedef struct _spp_api_cnat_v4_add_vrf_map {
+ u16 _spp_msg_id;
+ u8 rc;
+ u8 pad;
+ u32 i_vrf_id;
+ u32 o_vrf_id;
+ u16 i_vrf;
+ u16 o_vrf;
+ u32 start_addr[8];
+ u32 end_addr[8];
+} spp_api_cnat_v4_add_vrf_map_t;
+
+typedef struct _spp_api_cnat_v4_config_nfv9_logging {
+ u16 _spp_msg_id;
+ u8 rc;
+ u8 enable;
+ u32 ipv4_address;
+ u32 i_vrf_id;
+ u16 i_vrf;
+ u16 port;
+ u16 refresh_rate;
+ u16 timeout_rate;
+ u16 path_mtu;
+ u8 nfv9_global_collector;
+ u8 session_logging;
+} spp_api_cnat_v4_config_nfv9_logging_t;
+
+
+#endif
diff --git a/vnet/vnet/vcgn/cnat_db.h b/vnet/vnet/vcgn/cnat_db.h
new file mode 100644
index 00000000000..3596e2384e6
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_db.h
@@ -0,0 +1,701 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_db.h - translation database definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_DB_H__
+#define __CNAT_DB_H__
+
+#include "cnat_cli.h"
+#include "cnat_ports.h"
+#include "index_list.h"
+
+#define VRF_NAME_LEN_STORED 12
+#define MAX_VRFID 400
+typedef struct _cnat_svi_params_entry {
+ u16 svi_type;
+ u16 pad;
+
+ u32 vrf_id;
+ u16 if_num;
+
+ u32 ipv6_addr[4];
+ u32 ipv4_addr;
+
+ u8 direction;
+ u32 tbl_id; /* vrf */
+ u32 vrf_override_id; /* tbl_id for override vrf */
+ u8 vrf_override_flag;
+ u8 partition_id;
+} cnat_svi_params_entry;
+
+typedef struct _cnat_ingress_vrfid_name_entry {
+ u32 vrf_id;
+ u16 ref_count; /* number of service apps under a single vrf */
+ u8 vrf_name[VRF_NAME_LEN_STORED];
+ u16 pad1;
+} cnat_ingress_vrfid_name_entry;
+#define HASH_ENHANCE 4
+
+#define CNAT_DB_SIZE (PLATFORM_MAX_NAT_ENTRIES / PLATFORM_CNAT_INSTS)
+#define CNAT_MAIN_HASH_SIZE (HASH_ENHANCE * PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE)
+#define CNAT_MAIN_HASH_MASK (CNAT_MAIN_HASH_SIZE-1)
+
+#define CNAT_USER_DB_SIZE (PLATFORM_MAX_USER_ENTRIES / PLATFORM_CNAT_INSTS)
+#define CNAT_USER_HASH_SIZE (HASH_ENHANCE * PLATFORM_CNAT_USER_PRELIM_HASH_SIZE)
+#define CNAT_USER_HASH_MASK (CNAT_USER_HASH_SIZE-1)
+
+#define CNAT_SESSION_DB_SIZE (PLATFORM_MAX_NAT_ENTRIES / PLATFORM_CNAT_INSTS)
+#define CNAT_SESSION_HASH_SIZE (HASH_ENHANCE * PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE)
+#define CNAT_SESSION_HASH_MASK (CNAT_SESSION_HASH_SIZE-1)
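+
+/*
+ * Editor's note (not part of the original commit): the _HASH_MASK
+ * values above are computed as size - 1, which yields a valid bit
+ * mask only when the corresponding _HASH_SIZE is a power of two;
+ * e.g. a size of 0x10000 gives a mask of 0xFFFF.
+ */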
+
+
+#define CNAT_MAX_SESSIONS_PER_BIB 0xFFFF
+
+#define NUM_BITS_IN_UWORD (8*sizeof(uword))
+
+/* Number of per-ip/port timeout configs is limited to 1024 */
+#define CNAT_TIMEOUT_HASH_SIZE 1024
+#define CNAT_TIMEOUT_HASH_MASK (CNAT_TIMEOUT_HASH_SIZE - 1)
+#define CNAT_TIMEOUT_FULL_MASK 0xFFFFFFFFFFFFFFFF
+#define CNAT_TIMEOUT_IPPROT_MASK PLATFORM_CNAT_TIMEOUT_IPPROT_MASK
+#define CNAT_TIMEOUT_PORTPROT_MASK PLATFORM_CNAT_TIMEOUT_PORTPROT_MASK
+
+#define TRUE 1
+#define FALSE 0
+
+/*
+ * The key structure. All fields are in NETWORK byte order!
+ */
+typedef struct {
+ u32 ipv4;
+ u16 port;
+ u16 vrf; //bit0-12:vrf, bit13:unused, bit14-15:protocol
+} cnat_db_key_t;
+
+/* bit14-15:protocol in cnat_db_key_t */
+#define CNAT_INVALID_PROTO 0x0000
+#define CNAT_PPTP 0x0000
+#define CNAT_UDP 0x4000
+#define CNAT_TCP 0x8000
+#define CNAT_ICMP 0xc000
+#define CNAT_VRF_MASK 0x3fff
+#define CNAT_PRO_MASK 0xc000
+#define CNAT_PRO_SHIFT 14
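+
+/*
+ * Editor's illustrative sketch (not part of the original commit):
+ * packing and unpacking the vrf field of cnat_db_key_t using the
+ * masks defined above. The helper names are hypothetical.
+ */
+static inline u16 cnat_example_pack_vrf_field (u16 vrf, u16 proto_bits)
+{
+    /* proto_bits is one of CNAT_UDP, CNAT_TCP, CNAT_ICMP or CNAT_PPTP */
+    return (vrf & CNAT_VRF_MASK) | (proto_bits & CNAT_PRO_MASK);
+}
+
+static inline u16 cnat_example_get_proto (u16 vrf_field)
+{
+    /* the result compares equal to CNAT_UDP, CNAT_TCP, etc. */
+    return vrf_field & CNAT_PRO_MASK;
+}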
+
+/*
+ * Maximum number of VRF entries supported
+ */
+#define CNAT_MAX_VRFMAP_ENTRIES (CNAT_VRF_MASK + 1)
+/*
+ * for hashing purposes, fetch the key in one instr.
+ */
+typedef union {
+ cnat_db_key_t k;
+ u64 key64;
+} cnat_key_t;
+
+typedef struct {
+ cnat_key_t k;
+ u32 bucket;
+} cnat_db_key_bucket_t;
+
+typedef struct {
+ u32 ipv6[4];
+ cnat_key_t ipv4_key;
+} dslite_key_t;
+
+typedef struct {
+/*
+ cnat_db_key_bucket_t ck;
+ u32 ipv6[4];
+*/
+ dslite_key_t dk;
+ u32 bucket;
+} dslite_db_key_bucket_t;
+
+
+/* Per port/ip timeout related structures */
+extern index_slist_t *cnat_timeout_hash;
+
+typedef struct {
+ cnat_key_t timeout_key;
+ u16 timeout_value;
+} cnat_timeout_t;
+
+typedef struct {
+ cnat_timeout_t t_key;
+ index_slist_t t_hash;
+} cnat_timeout_db_entry_t;
+
+extern cnat_timeout_db_entry_t *cnat_timeout_db;
+
+/*
+ * Main translation database entries. Currently 0x5A = 90 bytes in length.
+ * Given 20,000,000 entries, it saves nearly 1gb of SDRAM to pack the entries
+ * and pay the extra prefetch. So, that's what we do.
+ */
+
+typedef struct {
+ /* 0x00 */
+ index_slist_t out2in_hash; /* hash-and-chain, x2 */
+ index_slist_t in2out_hash;
+
+ /* 0x08 */
+ u16 flags; /* Always need flags... */
+#define CNAT_DB_FLAG_PORT_PAIR (1<<0)
+#define CNAT_DB_FLAG_TCP_ACTIVE (1<<1)
+#define CNAT_DB_FLAG_ENTRY_FREE (1<<2)
+#define CNAT_DB_FLAG_UDP_ACTIVE (1<<3)
+#define CNAT_DB_FLAG_STATIC_PORT (1<<4)
+/* This alg entry is set for FTP data connection */
+#define CNAT_DB_FLAG_ALG_ENTRY (1<<5)
+
+/* Will be set for a TCP connection with destination port 1723;
+ * note - here CNAT_DB_FLAG_TCP_ACTIVE is also set */
+#define CNAT_DB_FLAG_PPTP_TUNNEL_INIT (1<<6)
+#define CNAT_DB_FLAG_PPTP_TUNNEL_ACTIVE (1<<7)
+
+/* for PPTP GRE packets */
+#define CNAT_DB_FLAG_PPTP_GRE_ENTRY (1<<8)
+
+/* for PCP support */
+#define CNAT_DB_FLAG_PCPI (1<<9)
+#define CNAT_DB_FLAG_PCPE (1<<10)
+#define CNAT_PCP_FLAG (CNAT_DB_FLAG_PCPI | CNAT_DB_FLAG_PCPE)
+
+#define CNAT_TAC_SEQ_MISMATCH (1<<11)
+/* This alg entry is set for ftp control connection */
+#define CNAT_DB_FLAG_ALG_CTRL_FLOW (1<<12)
+
+/* This is for marking the state where connection is closing */
+#define CNAT_DB_FLAG_TCP_CLOSING (1<<13)
+
+#define CNAT_DB_DSLITE_FLAG (1<<14)
+#define CNAT_DB_NAT64_FLAG (1<<15)
+
+ /* 0x0A */
+ u16 vrfmap_index; /* index of vrfmap */
+
+ /* 0x0C */
+ u32 user_index; /* index of user that owns this entry */
+
+ /* 0x10 */
+ cnat_key_t out2in_key; /* network-to-user, outside-to-inside key */
+
+ /* 0x18 */
+ cnat_key_t in2out_key; /* user-to-network, inside-to-outside key */
+
+ /* 0x20 */
+ index_dlist_t user_ports; /* per-user translation list */
+
+ /* 0x28 */
+ u32 out2in_pkts; /* pkt counters */
+
+ /* 0x2C */
+ u32 in2out_pkts;
+
+ /* 0x30 */
+ u32 entry_expires; /* timestamp used to expire translations */
+
+ /* 0x34 */
+ union { /* used by FTP ALG, pkt len delta due to FTP PORT cmd */
+ u16 delta;
+ i8 alg_dlt[2]; /* two delta values, 0 for previous, 1 for current */
+ u16 il; /* Used to indicate if interleaved mode is used
+ in case of RTSP ALG */
+ } alg;
+
+ /* 0x36 */
+ u16 timeout;
+
+ /* 0x38 */
+ union {
+ struct seq_pcp_t {
+ u32 tcp_seq_num; /* last tcp (FTP) seq # that has pkt len change due to PORT */
+ u32 pcp_lifetime; /* peer and map life time value sent in reply*/
+ } seq_pcp;
+
+ /* This is for TCP seq check */
+ struct tcp_seq_chk_t {
+ u32 seq_no;
+ u32 ack_no;
+ } tcp_seq_chk;
+
+ /* used for pptp alg entries
+ 1. only tunnel : prev and next = 0xFFFFFFFF
+ 2. first gre entry : prev = tunnel db, next = next gre db
+ 3. last gre entry : prev = previous gre/tunnel db, next= 0xFFFFFFFF;
+
+ * while adding a gre entry - it is inserted at the beginning (head) of the list
+ * while deleting a gre entry - a hash lookup is done and prev and next are adjusted,
+ * so there is no need to traverse through the list as done in index_dlist_remelem
+
+ */
+ index_dlist_t pptp_list;
+
+ } proto_data;
+
+ /* 0x40 */
+ u32 dst_ipv4; /* pointer to ipv4 dst list, used in evil mode */
+
+ /* 0x44 */
+ u16 dst_port;
+
+ /* 0x46 */
+ u16 dslite_nat44_inst_id;
+
+ /* 0x48 */
+ u32 session_head_index;
+
+ /* 0x4C */
+ u16 nsessions;
+
+ /* 0x4E */
+ u8 unused;
+
+ /* 0x4F */
+ u8 scale;
+
+ /* 0x50 */
+ u32 diff_window;
+
+ /* Sizeof cnat_main_db_entry_t = 0x54 */
+} cnat_main_db_entry_t;
+
+/* Caution:
+ * 1. The size of this structure should be the same as that of
+ * nat64_bib_user_entry_t
+ * 2. Do not alter the position of the first four fields
+ */
+typedef struct {
+ /* 0x00 */
+ index_slist_t user_hash; /* hash 'n chain bucket chain */
+
+ /* 0x04 */
+ u16 ntranslations; /* translations hold by this user */
+
+ /* 0x06 */
+ u8 icmp_msg_count; /* used to rate-limit icmp sent to this user */
+
+ /* 0x07 */
+ u8 flags; /* To identify whether it is NAT64 or NAT44 etc. */
+#define CNAT_USER_DB_NAT44_FLAG 0
+#define CNAT_USER_DB_NAT64_FLAG 1
+#define CNAT_USER_DB_DSLITE_FLAG 2
+#define CNAT_USER_DB_PORT_LIMIT_EXCEEDED 0X80
+
+ /* 0x08 */
+ u32 translation_list_head_index;
+
+ /* 0x0C */
+ u32 portmap_index; /* index of bound port-map */
+
+ /* 0x10 */
+ cnat_key_t key; /* For dslite this should store IPv6 address */
+ u32 ipv6[4]; // B4 ipv6 address
+ /* 0x18 */
+#if 0
+ u32 temp1;
+ u32 temp2;
+ u32 temp3;
+#endif
+ /* 0x28 same as nat64_user_db */
+#ifndef NO_BULK_LOGGING
+ /* Now adding 8 more bytes for bulk allocation. This makes it
+ * 0x30 (48). Added the same to nat64_bib_user_entry_t to make
+ * the sizes equal. For nat64 stateful, we may support bulk
+ * allocation later.
+ */
+ /* Indicates the currently used bulk port range */
+ i16 bulk_port_range_cache[BULK_RANGE_CACHE_SIZE];
+#endif /* #ifndef NO_BULK_LOGGING */
+} cnat_user_db_entry_t;
+
+/*
+ * cnat_session_entry_t
+ * This structure represents the cnat session table. It maintains the
+ * information about the destination of a given translation (main db).
+ * There would be an entry here only if packets are sent to more than
+ * one destination from the same source.
+ */
+typedef struct {
+
+ /* 0x00 */
+ index_slist_t cnat_session_hash;
+
+ /* 0x04 */
+ u32 main_db_index; /* would point to v4 src transport address */
+
+ /* 0x08 */
+ cnat_key_t v4_dest_key;
+
+ /* 0x10 */
+ u16 flags; /* Same as cnat_main_db_t */
+
+ /* 0x12 */
+ u16 timeout;
+
+ /* 0x14 */
+ u32 entry_expires;
+ /* 0x18 */
+ index_dlist_t main_list;
+ /* 0x20 = 32 B */
+
+ union { /* alg same as cnat_main_db_t */
+ u16 delta;
+ i8 alg_dlt[2];
+ u16 il;
+ } alg;
+
+ /* 0x22 */
+ u16 tcp_flags;
+
+ /* 0x24 */
+ u32 tcp_seq_num;
+
+ /* 0x28 */
+ u32 ack_no;
+
+ /* 0x2C */
+ u32 window;
+
+ /* 0x30 */
+ u8 scale;
+
+ /* 0x31 */
+ u8 pad;
+
+ /* 0x32 */
+} cnat_session_entry_t;
+
+
+
+/*
+ * out2in and in2out hash bucket arrays are simply arrays of index_slist_t's
+ */
+
+typedef enum {
+ CNAT_DB_CREATE_DEFAULT=0, /* honor cnat_main_db_max_ports_per_user */
+ CNAT_DB_CREATE_OVERRIDE, /* just do it. */
+} cnat_db_create_policy_t;
+
+typedef struct {
+ cnat_key_t in2out_key;
+ cnat_key_t out2in_key;
+ u32 dst_ipv4; /* for evil mode only */
+ u16 cnat_instance;
+ cnat_portmap_t *portmap;
+ u16 *portmap_inuse;
+ cnat_main_db_entry_t *db;
+ cnat_db_create_policy_t policy;
+ port_pair_t pair_of_ports;
+} cnat_db_create_args_t;
+
+extern cnat_main_db_entry_t *cnat_main_db;
+extern cnat_user_db_entry_t *cnat_user_db;
+extern cnat_session_entry_t *cnat_session_db;
+
+#define S_WAO 0
+#define S_WA 1 /* waiting for address pool */
+#define S_WO 2 /* waiting for outside vrf */
+#define S_RUN 3 /* got everything */
+#define S_DEL 4 /* just delete */
+
+#define INVALID_UIDX 0xffff /*invalid svi app uidb index */
+#define INVALID_VRFID 0xffffffff /*invalid vrf id */
+
+typedef struct {
+ u16 status;
+ u16 tcp_mss; //tcp max segment size for this inside vrf
+ u32 delete_time;
+ u16 i_vrf; //inside SVI uidx
+ u16 o_vrf; //outside SVI uidx
+ u32 i_vrf_id; //inside vrf id
+ u32 o_vrf_id; //outside vrf id
+ cnat_portmap_v2_t *portmap_list;
+ u32 nfv9_logging_index;
+ u32 syslog_logging_index;
+ u16 ip_n_to_1;
+#ifndef NO_BULK_LOGGING
+ bulk_alloc_size_t bulk_size;
+#endif /* #ifndef NO_BULK_LOGGING */
+ u32 pcp_server_addr;
+ u32 pcp_server_port;
+
+ u8 nf_logging_policy;
+ u8 syslog_logging_policy;
+ u8 frag_tout;
+ u32 rseed_ip;
+ u16 port_limit;
+ u8 tcp_seq_check_enable;
+ u8 pad;
+ u32 tcp_seq_user_window;
+ u8 filter_policy;
+ u8 ignore_port;
+} cnat_vrfmap_t;
+
+/*
+ * When creating cnat_vrfmap entry, ensure that any already
+ * configured logging info is taken into account
+ */
+#define CNAT_SET_VRFMAP_NFV9_LOGGING_INDEX(logging_index, i_vrf) \
+do { \
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info = 0; \
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info, ({ \
+ if (my_nfv9_logging_info->i_vrf == i_vrf) { \
+ logging_index = my_nfv9_logging_info - cnat_nfv9_logging_info; \
+ break; \
+ } \
+ })); \
+} while (0)
+
+
+typedef struct {
+ /*
+ * spp_ctx_alloc() call failed
+ */
+ u64 nfv9_logging_context_creation_fail_count;
+
+ /*
+ * Cannot send the existing logging pkt, so cannot create
+ * any additional packets for logging purposes
+ */
+ u64 nfv9_logging_context_creation_deferred_count;
+
+ /*
+ * Cannot send the existing logging pkt due to cnat_rewrite_output
+ * superframe being full.
+ */
+ u64 nfv9_downstream_constipation_count;
+
+ /*
+ * buffer for spp_ctx_alloc() call failed
+ */
+ u64 nfv9_logging_context_buffer_allocation_fail_count;
+
+} cnat_global_counters_t;
+
+
+extern cnat_global_counters_t cnat_global_counters;
+
+extern u16 *cnat_portmap_indices_by_vrf;
+extern cnat_vrfmap_t *cnat_portmap_by_vrf;
+extern cnat_portmap_t **cnat_portmaps;
+extern u16 **cnat_portmaps_inuse;
+
+extern cnat_vrfmap_t *cnat_map_by_vrf;
+
+/*
+ * Special define to indicate that the VRF map index entry is empty
+ */
+#define VRF_MAP_ENTRY_EMPTY 0xffff
+extern u16 vrf_map_array[CNAT_MAX_VRFMAP_ENTRIES];
+
+extern cnat_svi_params_entry svi_params_array[CNAT_MAX_VRFMAP_ENTRIES];
+extern cnat_ingress_vrfid_name_entry vrfid_name_map[MAX_VRFID];
+
+extern index_slist_t *cnat_out2in_hash;
+extern index_slist_t *cnat_in2out_hash;
+extern index_slist_t *cnat_user_hash;
+extern index_slist_t *cnat_session_hash;
+
+typedef enum {
+ CNAT_DB_IN2OUT = 0,
+ CNAT_DB_OUT2IN,
+} cnat_db_which_t;
+
+typedef enum {
+ CNAT_NO_ICMP_MSG =0,
+ CNAT_ICMP_MSG,
+} cnat_icmp_msg_t;
+
+typedef struct {
+ cnat_errno_t error;
+ cnat_icmp_msg_t gen_icmp_msg;
+ u32 svi_addr;
+} cnat_gen_icmp_info;
+
+typedef cnat_vrfmap_t nat64_vrfmap_t;
+typedef cnat_portmap_v2_t nat64_portmap_v2_t;
+
+#define CNAT_V4_GET_HASH(key64, hash, mask) \
+ a = key64; \
+ b = c = 0x9e3779b97f4a7c13LL; \
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix64(a, b, c); \
+ hash = c & mask;
+
+#define CNAT_V4_GET_SESSION_HASH(main_index, in_addr, port, vrf, hash, mask) \
+ a = main_index ^ in_addr ^ port ^ vrf; \
+ b = c = 0x9e3779b9; \
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+ hash = c & mask;
+
+#define CNAT_V4_GET_FRAG_HASH(key64, key32, hash, mask) \
+ a = key64; \
+ b = key32; \
+ c = 0x9e3779b97f4a7c13LL; \
+ hash_mix64(a, b, c); \
+ hash = c % mask;
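+
+/*
+ * Editor's illustrative usage sketch (not part of the original
+ * commit): the hash macros above expand to plain statements and
+ * expect the caller to declare the scratch variables a, b and c.
+ * Assuming vppinfra's hash_mix64 is in scope (as the macros
+ * themselves require), a typical main-db bucket computation looks
+ * like this hypothetical helper:
+ */
+static inline u32 cnat_example_main_bucket (u64 key64)
+{
+    u64 a, b, c;
+    u32 bucket;
+    CNAT_V4_GET_HASH(key64, bucket, CNAT_MAIN_HASH_MASK);
+    return bucket;
+}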
+
+#define CNAT_DB_UPDATE_IN2OUT_TIMER \
+ db->entry_expires = cnat_current_time; \
+ db->in2out_pkts++;
+
+#define CNAT_DB_TIMEOUT_RST(db) \
+ if(PREDICT_TRUE(db->entry_expires != 0 )) \
+ db->entry_expires = cnat_current_time;
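+
+/*
+ * Editor's note (not part of the original commit):
+ * CNAT_DB_UPDATE_IN2OUT_TIMER assumes a cnat_main_db_entry_t pointer
+ * named db in the caller's scope, e.g.:
+ *
+ *     cnat_main_db_entry_t *db = cnat_main_db + db_index;
+ *     CNAT_DB_UPDATE_IN2OUT_TIMER
+ */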
+
+#define DEBUG_I2O_DROP(debug_flag) \
+if (debug_i_flag & debug_flag) { \
+ cnat_db_debug_i2o_drop(&ki); \
+}
+
+
+cnat_main_db_entry_t *cnat_main_db_create (cnat_db_create_args_t *a);
+void cnat_main_db_entry_delete(cnat_main_db_entry_t *ep);
+
+void cnat_delete_main_db_entry(cnat_main_db_entry_t *ep);
+void cnat_delete_main_db_entry_v2(cnat_main_db_entry_t *ep);
+
+
+cnat_main_db_entry_t*
+cnat_get_main_db_entry(cnat_db_key_bucket_t *ki,
+ port_pair_t port_type,
+ cnat_errno_t *error,
+ cnat_user_db_entry_t ** user_db_entry);
+
+cnat_main_db_entry_t*
+cnat_get_main_db_entry_v2(cnat_db_key_bucket_t *ki,
+ port_pair_t port_pair_type,
+ port_type_t port_type,
+ cnat_gen_icmp_info *info,
+ cnat_key_t *dest_info);
+
+cnat_main_db_entry_t*
+cnat_create_static_main_db_entry_v2(cnat_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_vrfmap_t *my_vrfmap,
+ cnat_gen_icmp_info *info);
+
+cnat_main_db_entry_t*
+cnat_create_main_db_entry_and_hash(cnat_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_user_db_entry_t *udb);
+
+cnat_user_db_entry_t*
+cnat_user_db_create_entry(cnat_db_key_bucket_t *uki,
+ u32 portmap_index);
+
+cnat_user_db_entry_t*
+cnat_user_db_lookup_entry(cnat_db_key_bucket_t *uki);
+
+cnat_main_db_entry_t*
+cnat_main_db_lookup_entry(cnat_db_key_bucket_t *ki);
+
+cnat_main_db_entry_t*
+cnat_main_db_lookup_entry_out2in (cnat_db_key_bucket_t *ko);
+
+void cnat_main_db_entry_dump (cnat_main_db_entry_t *db);
+void cnat_db_in2out_hash_delete (cnat_main_db_entry_t *ep, cnat_user_db_entry_t *up);
+void cnat_db_out2in_hash_delete (cnat_main_db_entry_t *ep);
+void cnat_user_db_delete (cnat_user_db_entry_t *up);
+void cnat_db_debug_i2o_drop(cnat_db_key_bucket_t *ki);
+
+/*
+ * Function to dump the Hash Table that maps if_num to uidb_index
+ */
+extern void cnat_if_num_hash_table_dump(void);
+
+#define MAIN_DB_TYPE 0
+#define SESSION_DB_TYPE 1
+u16 query_and_update_db_timeout(void *db, u8 db_type);
+
+u16 cnat_timeout_db_create (cnat_timeout_t t_entry);
+void cnat_timeout_db_delete(cnat_key_t t_key);
+
+cnat_session_entry_t *
+cnat_create_session_db_entry(cnat_key_t *ko,
+ cnat_main_db_entry_t *bdb, u8 log);
+
+void cnat_dest_update_main2session(cnat_main_db_entry_t *mdb,
+ cnat_session_entry_t *sdb);
+
+cnat_session_entry_t *cnat_handle_1to2_session(
+ cnat_main_db_entry_t *mdb,
+ cnat_key_t *dest_info);
+
+void cnat_add_dest_n_log(
+ cnat_main_db_entry_t *mdb,
+ cnat_key_t *dest_info);
+
+cnat_session_entry_t *
+ cnat_session_db_lookup_entry(cnat_key_t *ko,u32 main_db_index);
+
+cnat_session_entry_t *
+ cnat_session_db_edm_lookup_entry(cnat_key_t *ko,
+ u32 session_head_index,
+ u32 main_db_index);
+
+
+typedef struct{
+ u32 sessions;
+ u32 active_translations;
+ u32 num_dynamic_translations;
+ u32 num_static_translations;
+ u64 in2out_drops_port_limit_exceeded;
+ u64 in2out_drops_system_limit_reached;
+ u64 in2out_drops_resource_depletion;
+ u64 no_translation_entry_drops;
+ u32 num_subscribers;
+ u32 dummy;
+ u64 drops_sessiondb_limit_exceeded;
+} nat44_dslite_common_stats_t;
+
+typedef struct {
+ u32 translation_delete_count;
+ u32 translation_create_count;
+ u32 out2in_forwarding_count;
+} nat44_dslite_global_stats_t;
+
+typedef struct {
+ u64 v4_to_v6_tcp_seq_mismatch_drop_count;
+ u64 v4_to_v6_tcp_seq_mismatch_count;
+ u64 v4_to_v6_out2in_session_create_count;
+ u64 v4_to_v6_end_point_filter_drop_count;
+} nat44_counters_stats_t;
+
+#define NAT44_STATS 0
+#define DSLITE_STATS 1
+extern nat44_dslite_common_stats_t nat44_dslite_common_stats[255]; /* 0 is for nat44 */
+extern nat44_dslite_global_stats_t nat44_dslite_global_stats[2]; /* 0 for nat44 and 1 for dslite */
+extern nat44_counters_stats_t nat44_counters_stats[CNAT_MAX_VRFMAP_ENTRIES];/*For displaying show cgn <cgn-name> inside-vrf <vrf-name> counters */
+
+#define NAT44_COMMON_STATS nat44_dslite_common_stats[NAT44_RESERVED_INST_ID]
+#define NAT44_GLOBAL_STATS nat44_dslite_global_stats[NAT44_STATS]
+#define DSLITE_GLOBAL_STATS nat44_dslite_global_stats[DSLITE_STATS]
+#define SESSION_LOG_ENABLE 1
+#define ALG_ENABLED_DB(db) \
+ ((db->flags & CNAT_PCP_FLAG) || \
+ (db->flags & CNAT_DB_FLAG_ALG_CTRL_FLOW) || \
+ (db->flags & (CNAT_DB_FLAG_PPTP_TUNNEL_INIT | \
+ CNAT_DB_FLAG_PPTP_TUNNEL_ACTIVE)))
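+
+/*
+ * Editor's note (not part of the original commit): ALG_ENABLED_DB is
+ * typically used as a guard before ALG-specific handling, e.g.
+ *
+ *     if (ALG_ENABLED_DB(db)) {
+ *         ... take the ALG-aware path ...
+ *     }
+ */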
+
+
+#endif /* __CNAT_DB_H__ */
diff --git a/vnet/vnet/vcgn/cnat_db_scanner.c b/vnet/vnet/vcgn/cnat_db_scanner.c
new file mode 100644
index 00000000000..6e536d84c79
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_db_scanner.c
@@ -0,0 +1,493 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_db_scanner.c - cnat_db_scanner dispatch function and initialization
+ *
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+#include <vppinfra/string.h>
+#include <vppinfra/random.h>
+#include <vppinfra/fifo.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/format.h>
+
+
+#include "cnat_db.h"
+#include "cnat_logging.h"
+#include "cnat_global.h"
+#include "cnat_ipv4_udp.h"
+#include "cnat_common_api.h"
+
+u32 translation_create_count, translation_delete_count;
+u32 translation_create_rate, translation_delete_rate;
+
+u32 in2out_forwarding_count, out2in_forwarding_count;
+u32 in2out_forwarding_rate, out2in_forwarding_rate;
+
+u32 nat44_active_translations;
+u32 num_entries;
+uword check_these_pool_indices[2*MAX_DB_ENTRY_SELECTED_PER_SCAN];
+
+#define CNAT_DB_SCANNER_TURN_ON 5 /* just an arbitrary number for easier debugging */
+
+//extern u32 pcp_throttle_count;
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_db_scanner_main_t;
+
+cnat_db_scanner_main_t cnat_db_scanner_main;
+
+
+static inline void check_session_for_expiry(
+ cnat_session_entry_t * sdb, u8 timeout_dirty
+ /*,dslite_table_entry_t *dslite_entry_ptr*/)
+{
+ void cnat_delete_session_db_entry (cnat_session_entry_t *ep, u8 log);
+ /* Tasks -
+ * 1. Check for expiry for this entry
+ * 2. Delete if expired
+ */
+ u32 timeout = 0;
+
+ switch(sdb->v4_dest_key.k.vrf & CNAT_PRO_MASK) {
+ case CNAT_TCP:
+ if (sdb->flags & CNAT_DB_FLAG_TCP_ACTIVE) {
+ timeout = sdb->timeout;
+ if(PREDICT_FALSE(timeout_dirty)) {
+ timeout = query_and_update_db_timeout(
+ (void *)sdb, SESSION_DB_TYPE);
+ }
+ if(PREDICT_TRUE(timeout == 0)) {
+ timeout = tcp_active_timeout;
+ //dslite_entry_ptr->timeout_info.tcp_active_timeout;
+ }
+ } else {
+ timeout = tcp_initial_setup_timeout;
+ //dslite_entry_ptr->timeout_info.tcp_initial_setup_timeout;
+ }
+ break;
+ case CNAT_UDP:
+ if (sdb->flags & CNAT_DB_FLAG_UDP_ACTIVE) {
+ timeout = sdb->timeout;
+ if(PREDICT_FALSE(timeout_dirty)) {
+ timeout = query_and_update_db_timeout(
+ (void *)sdb, SESSION_DB_TYPE);
+ }
+
+ if(PREDICT_TRUE(timeout == 0)) {
+ timeout = udp_act_session_timeout;
+ //dslite_entry_ptr->timeout_info.udp_act_session_timeout;
+ }
+ } else {
+ timeout = udp_init_session_timeout;
+ //dslite_entry_ptr->timeout_info.udp_init_session_timeout;
+ }
+ break;
+ case CNAT_ICMP:
+ timeout = icmp_session_timeout;
+ //dslite_entry_ptr->timeout_info.icmp_session_timeout;
+ break;
+ case CNAT_PPTP:
+ timeout = pptp_cfg.timeout;
+ break;
+ default:
+ return;
+ }
+ /* Changes required for clearing sessions */
+ if (PREDICT_FALSE((sdb->entry_expires == 0) ||
+ (sdb->entry_expires + timeout < cnat_current_time))) {
+ cnat_delete_session_db_entry(sdb, TRUE);
+ }
+}
+
+static u8 handle_db_scan_for_sessions(
+ cnat_main_db_entry_t *db, int *dirty_index, uword db_index
+ /* ,dslite_table_entry_t *dslite_entry_ptr */)
+{
+ /* Tasks -
+ * 1. Traverse through the sessions and check for timeouts
+ * 2. Delete sessions that have expired
+ * 3. Check if the db has only one session remaining; if so,
+ * the details of the session have to be moved to the main db
+ * and the session db entry needs to be freed
+ * 4. If the db does not have any sessions left, the db itself
+ * needs to be deleted.
+ */
+ u32 nsessions, session_index_head, session_index;
+ cnat_session_entry_t *sdb;
+ u8 timeout_dirty = FALSE;
+
+ if(PREDICT_FALSE(*dirty_index == db_index)) {
+ *dirty_index = -1;
+ }
+ if(PREDICT_FALSE(timeout_dirty_flag == 1)) {
+ timeout_dirty_flag = 0;
+ *dirty_index = db_index;
+ timeout_dirty = TRUE;
+ }
+
+ session_index_head = session_index = db->session_head_index;
+ nsessions = db->nsessions;
+
+ do {
+ sdb = cnat_session_db + session_index;
+ if(PREDICT_FALSE(!sdb)) {
+ //TO DO: Debug msg?
+ return FALSE;
+ }
+ session_index = sdb->main_list.next;
+ check_session_for_expiry(sdb, timeout_dirty /*,dslite_entry_ptr*/);
+ nsessions--; /* To ensure that we do not get into an infinite loop */
+ } while(session_index != session_index_head
+ && db->session_head_index != EMPTY &&
+ nsessions);
+
+ /* Note: the code below assumes that while deleting the
+ * sessions, we do not delete the main db entry even if it
+ * has no sessions left
+ */
+ if(PREDICT_FALSE((!db->nsessions) &&
+ (!(db->flags & CNAT_DB_FLAG_STATIC_PORT)))) {
+ cnat_delete_main_db_entry_v2(db);
+ return TRUE; /* to indicate that main db was deleted */
+ }
+ return FALSE;
+}
+
+static void cnat_db_scanner(void)
+{
+ cnat_main_db_entry_t * db;
+ u32 timeout;
+ cnat_vrfmap_t *my_vrfmap __attribute__((unused)) = 0;
+ static int dirty_index = -1;
+ u16 instance __attribute__((unused));
+ //dslite_table_entry_t *dslite_entry_ptr;
+ u32 i;
+ uword db_index;
+ //pcp_throttle_count = 0;
+
+ for (i = 0; i < num_entries; i++) {
+ db_index = check_these_pool_indices[i];
+ db = cnat_main_db + db_index;
+ timeout=0;
+ my_vrfmap = 0;
+
+#if 0
+ if(PREDICT_FALSE(db->flags & CNAT_PCP_FLAG)) {
+
+ if(db->proto_data.seq_pcp.pcp_lifetime < cnat_current_time) {
+ /* mark as implicit */
+ db->flags &= ~CNAT_PCP_FLAG;
+ }
+ continue;
+ }
+
+#endif
+ if(PREDICT_FALSE(db->nsessions > 1)) {
+ if(PREDICT_FALSE(
+ handle_db_scan_for_sessions(db, &dirty_index, db_index /*,dslite_entry_ptr */))) {
+ continue;
+ } else if(PREDICT_TRUE(db->nsessions > 1)) {
+ continue;
+ }
+ /* if there is exactly one dest left, let it fall through
+ * and check if that needs to be deleted as well
+ */
+ }
+
+#if 0
+ if (PREDICT_FALSE(db->flags & CNAT_DB_FLAG_STATIC_PORT)) {
+ if (PREDICT_FALSE(db->flags & CNAT_DB_DSLITE_FLAG)) {
+ if(PREDICT_FALSE(
+ ((dslite_entry_ptr->nf_logging_policy != SESSION_LOG_ENABLE) &&
+ (dslite_entry_ptr->syslog_logging_policy != SESSION_LOG_ENABLE))
+ || (db->nsessions !=1))) {
+ continue;
+ }
+ } else {
+ my_vrfmap = cnat_map_by_vrf + db->vrfmap_index;
+ if(PREDICT_FALSE(
+ ((my_vrfmap->nf_logging_policy != SESSION_LOG_ENABLE) &&
+ (my_vrfmap->syslog_logging_policy != SESSION_LOG_ENABLE)) ||
+ (db->nsessions !=1))) {
+ continue;
+ }
+ }
+ }
+#endif
+
+ switch(db->in2out_key.k.vrf & CNAT_PRO_MASK) {
+ case CNAT_TCP:
+ if (db->flags & CNAT_DB_FLAG_TCP_ACTIVE) {
+ timeout = db->timeout;
+ if(PREDICT_FALSE(dirty_index == db_index)) {
+ dirty_index = -1;
+ }
+ if(PREDICT_FALSE(timeout_dirty_flag == 1)) {
+ timeout_dirty_flag = 0;
+ dirty_index = db_index;
+ }
+ if(PREDICT_FALSE(dirty_index != -1)) {
+ timeout = query_and_update_db_timeout(
+ (void *)db, MAIN_DB_TYPE);
+ }
+ if(PREDICT_TRUE(timeout == 0)) {
+ timeout = tcp_active_timeout;
+ }
+ } else {
+ timeout = tcp_initial_setup_timeout;
+ }
+ break;
+ case CNAT_UDP:
+ if (db->flags & CNAT_DB_FLAG_UDP_ACTIVE) {
+ timeout = db->timeout;
+ if(PREDICT_FALSE(dirty_index == db_index)) {
+ dirty_index = -1;
+ }
+ if(PREDICT_FALSE(timeout_dirty_flag == 1)) {
+ timeout_dirty_flag = 0;
+ dirty_index = db_index;
+ }
+ if(PREDICT_FALSE(dirty_index != -1)) {
+ timeout = query_and_update_db_timeout(
+ (void *)db, MAIN_DB_TYPE);
+ }
+ if(PREDICT_TRUE(timeout == 0)) {
+ timeout = udp_act_session_timeout;
+ }
+ } else {
+ timeout = udp_init_session_timeout;
+ }
+ break;
+ case CNAT_ICMP:
+ timeout = icmp_session_timeout;
+ break;
+ case CNAT_PPTP:
+ timeout = pptp_cfg.timeout;
+ break;
+ default:
+ continue;
+ }
+
+
+ /* Ref: CSCtu97536 */
+ if (PREDICT_FALSE((db->entry_expires == 0) ||
+ (db->entry_expires + timeout < cnat_current_time))) {
+#if 0
+ if (PREDICT_FALSE(db->flags & CNAT_DB_FLAG_STATIC_PORT)) {
+ if (PREDICT_FALSE(db->flags & CNAT_DB_DSLITE_FLAG)) {
+ instance = db->dslite_nat44_inst_id;
+ } else {
+ instance = NAT44_RESERVED_INST_ID;
+ cnat_session_log_nat44_mapping_delete(db, 0, my_vrfmap);
+ }
+
+ /* Reset the session details */
+ db->nsessions = 0;
+ db->dst_ipv4 = 0;
+ db->dst_port = 0;
+ db->flags &= ~(CNAT_DB_FLAG_TCP_ACTIVE | CNAT_DB_FLAG_UDP_ACTIVE
+ | CNAT_DB_FLAG_ALG_ENTRY);
+ db->timeout = 0;
+ db->entry_expires = 0;
+ db->alg.delta = 0;
+ db->proto_data.seq_pcp.tcp_seq_num = 0;
+ continue;
+ }
+#endif
+ //printf("DELETING DB ENTRY FOR 0x%x\n", db->in2out_key.k.ipv4);
+ cnat_delete_main_db_entry_v2(db);
+ }
+ //free(check_these_pool_indices[i]);
+ }
+}
+
+static void walk_the_db (void)
+{
+ pool_header_t *h = pool_header(cnat_main_db);
+ u32 db_uword_len;
+ static u32 base_index = 0, free_bitmap_index = 0;
+ int bits_scanned = 0, i;
+ uword inuse_bitmap;
+
+ num_entries=0;
+
+ /* Across all db entries... */
+ db_uword_len = vec_len(cnat_main_db) / NUM_BITS_IN_UWORD;
+ if (PREDICT_FALSE(vec_len(cnat_main_db) % NUM_BITS_IN_UWORD)) {
+ /*
+ * It should not come here, as in cnat_db_init_v2()
+ * the length is made a multiple of NUM_BITS_IN_UWORD
+ */
+ ASSERT(0);
+ return ;
+ }
+
+ if (PREDICT_FALSE(! db_uword_len))
+ return ;
+
+ while (bits_scanned < MAX_DB_ENTRY_PER_SCAN) {
+
+ if (PREDICT_FALSE(free_bitmap_index < vec_len(h->free_bitmap))) {
+
+ /* free_bitmap exists and it is not all 0 */
+
+ inuse_bitmap = ~(h->free_bitmap[free_bitmap_index]);
+ i = 0;
+ while (inuse_bitmap) {
+
+ /* Check to see if the index is in use */
+ if (PREDICT_FALSE((inuse_bitmap >> i) & 1)) {
+ check_these_pool_indices[num_entries] = base_index + i;
+ inuse_bitmap &= ~((uword) 1 << i);
+ num_entries++;
+ }
+ i++;
+ } // while (inuse_bitmap)
+ } else {
+
+ /*
+ * The 64-bit free-bitmap entry is 0, meaning all 64 entries are
+ * allocated. So, simply add all 64 entries here.
+ * No need to form inuse_bitmap and check/reset bits.
+ */
+ for (i=0; i<NUM_BITS_IN_UWORD; i++) {
+
+ check_these_pool_indices[num_entries] = base_index + i;
+ num_entries++;
+ }
+ } // if (free_bitmap_index < vec_len(h->free_bitmap))
+
+ /* Update free_bitmap_index and base_index for next run */
+ if (PREDICT_FALSE(free_bitmap_index == db_uword_len - 1)) {
+ /* wrap-around for next run */
+ free_bitmap_index = 0;
+ base_index = 0;
+ } else {
+ free_bitmap_index ++;
+ base_index += NUM_BITS_IN_UWORD;
+ }
+
+ /* increment # of bits scanned */
+ bits_scanned += NUM_BITS_IN_UWORD;
+
+ /* Found enough entries to check ? */
+ if (PREDICT_FALSE(num_entries >= MAX_DB_ENTRY_SELECTED_PER_SCAN))
+ {
+ /* This check is introduced to keep the MAX scan entry value fixed. */
+ /* This is very much required when we also scan for NAT64; */
+ /* please check the comments in cnat_db_scanner() &
+ * handler_nat64_db_scanner() */
+ if (num_entries >= MAX_COMBINED_DB_ENTRIES_PER_SCAN) {
+ num_entries = MAX_COMBINED_DB_ENTRIES_PER_SCAN;
+ }
+ break;
+ }
+
+ } // while (bits_scanned < MAX_DB_ENTRY_PER_SCAN)
+
+ if (PREDICT_FALSE(num_entries > 0)) {
+ //printf("%s: num_entries [%d]\n", __func__, num_entries);
+ cnat_db_scanner();
+ }
+ return ;
+}
+
+static uword cnat_db_scanner_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ f64 timeout = 0.01; /* timeout value in sec (10 ms) */
+ static u8 timeout_count = 0;
+
+ uword event_type;
+ uword * event_data = 0;
+ /* Wait until vCGN is configured */
+ while (1) {
+ /* Assigning a huge timeout value, vCGN may or
+ * may not get configured within this timeout */
+ vlib_process_wait_for_event_or_clock (vm, 1e9);
+ event_type = vlib_process_get_events (vm, &event_data);
+
+ /* check whether the process was woken up by the right event;
+ * otherwise continue waiting for the vCGN config */
+ if (event_type == CNAT_DB_SCANNER_TURN_ON) {
+ break;
+ }
+ }
+
+ while(1) {
+ vlib_process_suspend(vm, timeout);
+
+ /* Above suspend API should serve the purpose, no need to invoke wait API */
+ /* vlib_process_wait_for_event_or_clock (vm, timeout); */
+
+ /* Let's make use of this timeout for sending netflow packets */
+ if (timeout_count < 100) { /* 100*10 ms = 1 sec */
+ timeout_count++;
+ } else {
+ if (nfv9_configured) {
+ handle_pending_nfv9_pkts();
+ }
+ timeout_count = 0;
+ }
+ /* Do we need this ? */
+ //event_type = vlib_process_get_events (vm, &event_data);
+ cnat_current_time = (u32)vlib_time_now (vm);
+ if (cnat_db_init_done) {
+ walk_the_db();
+ }
+ }
+
+ return 0;
+}
+
+
+VLIB_REGISTER_NODE (cnat_db_scanner_node) = {
+ .function = cnat_db_scanner_fn,
+ .type = VLIB_NODE_TYPE_PROCESS,
+ .name = "cnat-db-scanner",
+ .process_log2_n_stack_bytes = 18,
+};
+
+clib_error_t *cnat_db_scanner_init (vlib_main_t *vm)
+{
+ cnat_db_scanner_main_t *mp = &cnat_db_scanner_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+void cnat_scanner_db_process_turn_on(vlib_main_t *vm)
+{
+ vlib_process_signal_event (vm, cnat_db_scanner_node.index,
+ CNAT_DB_SCANNER_TURN_ON, 0);
+ return;
+}
+
+VLIB_INIT_FUNCTION (cnat_db_scanner_init);
+
diff --git a/vnet/vnet/vcgn/cnat_db_v2.c b/vnet/vnet/vcgn/cnat_db_v2.c
new file mode 100644
index 00000000000..c09a73ebb15
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_db_v2.c
@@ -0,0 +1,3716 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_db_v2.c - translation database definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+#include <vppinfra/error.h>
+
+#include "cnat_db.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_v4_functions.h"
+#include "cnat_log_api.h"
+#include "cnat_cli.h"
+#include "spp_platform_trace_log.h"
+#include "cnat_bulk_port.h"
+#include "nat64_db.h"
+#include "dslite_db.h"
+#include "cnat_config_api.h"
+
+#define HASH_TABLE_SIZE 8192 // hash table size
+#define THROTTLE_TIME 180 // throttle time value for out of port msg/user
+
+u8 cnat_db_init_done = 0;
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_db_v2_main_t;
+
+cnat_db_v2_main_t cnat_db_v2_main;
+
+#if 1
+/* TOBE_PORTED : Remove the following once fixed */
+#undef PREDICT_TRUE
+#undef PREDICT_FALSE
+#define PREDICT_TRUE(x) (x)
+#define PREDICT_FALSE(x) (x)
+#endif
+
+#define foreach_cnat_db_v2_error \
+_(DROP, "error-drop packets")
+
+typedef enum {
+#define _(sym,str) CNAT_DB_V2_##sym,
+ foreach_cnat_db_v2_error
+#undef _
+ CNAT_DB_V2_N_ERROR,
+} cnat_db_v2_error_t;
+
+static char * cnat_db_v2_error_strings[] __attribute__((unused)) = {
+#define _(sym,string) string,
+ foreach_cnat_db_v2_error
+#undef _
+};
+
+
+void cnat_table_entry_fill_map(u32 start_addr, u32 end_addr,
+ cnat_portmap_v2_t **port_map_holder)
+{
+ u32 this_start_addr, this_end_addr, this_addr, new;
+ u32 loop_count;
+ u32 pm_len, i;
+ cnat_portmap_v2_t *my_pm =0;
+ cnat_portmap_v2_t *pm = 0;
+
+ my_instance_number = 0;
+
+ this_start_addr = start_addr;
+ this_end_addr = end_addr;
+
+ /*
+ * How many new addresses are being added?
+ */
+ /* commented out: right now end - start covers this vCGN instance */
+ //new = ((this_end_addr - this_start_addr) / MAX_CORES_PER_PARTITION) + 1;
+ new = (this_end_addr - this_start_addr) + 1;
+
+ pm = *port_map_holder;
+ pm_len = vec_len(pm);
+#if DEBUG_NOT_COMMENTED
+ printf("this_start_addr = 0x%08X, this_end_addr = 0x%08X, Num Addr = %d\n",
+ this_start_addr, this_end_addr, new);
+ printf("pm_len = %d\n", pm_len);
+#endif
+ /* Check whether the address pool add requested already exists */
+ my_pm = pm;
+ for(i = 0; i< pm_len; i++) {
+ if(my_pm->ipv4_address == this_start_addr) {
+ printf("address pool with addr 0x%08X exists\n", this_start_addr);
+ return;
+ }
+ my_pm++;
+ }
+
+ /*
+ * For now give a warning message only....
+ */
+#if 0
+ if ((total_address_pool_allocated + new) >
+ CNAT_MAX_ADDR_POOL_SIZE_PER_CORE) {
+ printf("address pool size (%d) would cross permissible limit (%u) \n",
+ (total_address_pool_allocated + new),
+ CNAT_MAX_ADDR_POOL_SIZE_PER_CORE);
+ }
+#endif
+
+ total_address_pool_allocated += new;
+ vec_add2(pm, my_pm, new);
+
+#if DEBUG_NOT_COMMENTED
+ printf("total_address_pool_allocated changed from %d to %d (added %d)",
+ (total_address_pool_allocated - new),
+ total_address_pool_allocated, new);
+ printf("vec add is ok\n");
+#endif
+
+ memset(my_pm, 0, new*sizeof(*my_pm));
+ this_addr = this_start_addr;
+ loop_count = 0; /* Sanity counter */
+
+ while (this_addr <= this_end_addr) {
+#if DEBUG_NOT_COMMENTED
+ printf("loop %d: this addr = 0x%08X\n", loop_count+1, this_addr);
+#endif
+ my_pm->ipv4_address = this_addr;
+ /*
+ * Set all bits to "1" indicating all ports are free
+ */
+ memset(my_pm->bm, 0xff,
+ (((BITS_PER_INST + BITS(uword)-1)/BITS(uword))*(sizeof(uword))));
+ //this_addr += MAX_CORES_PER_PARTITION;
+ this_addr += 1;
+ my_pm++;
+ loop_count++;
+ }
+ /*
+ * loop_count should now equal the new value
+ */
+ if (loop_count != new) {
+ printf("Mismatch in loop_count (%d) != new (%d)\n",
+ loop_count, new);
+ }
+
+ *port_map_holder = pm;
+
+#if DEBUG_NOT_COMMENTED
+ printf("revised pm len %d\n", vec_len(*port_map_holder));
+#endif
+
+ return;
+}
+
+
+void cnat_delete_session_db_entry (cnat_session_entry_t *ep, u8 log);
+inline void handle_cnat_port_exceeded_logging(
+ cnat_user_db_entry_t *udb,
+ cnat_key_t * key,
+ cnat_vrfmap_t *vrfmap);
+
+cnat_global_counters_t cnat_global_counters;
+u32 last_log_timestamp = 0;
+u32 last_user_dyn_port_exc_timestamp = 0;
+u32 last_user_stat_port_exc_timestamp = 0;
+
+index_slist_t *cnat_out2in_hash;
+index_slist_t *cnat_in2out_hash;
+index_slist_t *cnat_user_hash;
+index_slist_t *cnat_timeout_hash;
+index_slist_t *cnat_session_hash;
+
+cnat_main_db_entry_t *cnat_main_db;
+cnat_user_db_entry_t *cnat_user_db;
+cnat_session_entry_t *cnat_session_db;
+cnat_timeout_db_entry_t *cnat_timeout_db;
+
+cgse_nat_db_entry_t *cgse_nat_db;
+cgse_nat_user_db_entry_t *cgse_user_db;
+cgse_nat_session_db_entry_t *cgse_session_db;
+
+nat44_dslite_common_stats_t nat44_dslite_common_stats[255]; /* 0 is for nat44 */
+nat44_dslite_global_stats_t nat44_dslite_global_stats[2]; /* 0 for nat44 and 1 for dslite */
+nat44_counters_stats_t nat44_counters_stats[CNAT_MAX_VRFMAP_ENTRIES];
+/*For displaying show cgn <cgn-name> inside-vrf <vrf-name> counters */
+
+/*
+ * This is the pool of vrf map structures used by latest main-db functions
+ */
+cnat_vrfmap_t *cnat_map_by_vrf;
+
+/*
+ * Have a mapping table of vrf_id-->vrf_map_index
+ * This helps in easily getting the vrf_map structure during
+ * main-db create paths
+ */
+u16 vrf_map_array[CNAT_MAX_VRFMAP_ENTRIES];
+cnat_svi_params_entry svi_params_array[CNAT_MAX_VRFMAP_ENTRIES];
+cnat_ingress_vrfid_name_entry vrfid_name_map[MAX_VRFID] = {{0}};
+u64 in2out_drops_port_limit_exceeded;
+u64 in2out_drops_system_limit_reached;
+u64 in2out_drops_resource_depletion;
+u64 no_translation_entry_drops;
+u32 no_sessions;
+
+#define CNAT_SET_ICMP_MSG_INFO \
+if (PREDICT_TRUE((my_vrfmap->i_vrf < CNAT_MAX_VRFMAP_ENTRIES) && \
+ (svi_params_array[my_vrfmap->i_vrf].ipv4_addr))) { \
+ info->gen_icmp_msg = icmp_msg_gen_allowed(); \
+ info->svi_addr = svi_params_array[my_vrfmap->i_vrf].ipv4_addr; \
+}
+
+#define CNAT_DEBUG_INSIDE_ERR(err) \
+if (((protocol == CNAT_UDP) && \
+ (debug_i_flag & CNAT_DEBUG_ERR_UDP)) || \
+ ((protocol == CNAT_TCP) && \
+ (debug_i_flag & CNAT_DEBUG_ERR_TCP)) || \
+ ((protocol == CNAT_ICMP) && \
+ (debug_i_flag & CNAT_DEBUG_ERR_ICMP))) { \
+ cnat_db_debug_error(&u_ki, err); \
+}
+
+#define DSLITE_DEBUG_INSIDE_ERR(err) \
+if (((protocol == CNAT_UDP) && \
+ (debug_i_flag & CNAT_DEBUG_ERR_UDP)) || \
+ ((protocol == CNAT_TCP) && \
+ (debug_i_flag & CNAT_DEBUG_ERR_TCP)) || \
+ ((protocol == CNAT_ICMP) && \
+ (debug_i_flag & CNAT_DEBUG_ERR_ICMP))) { \
+ dslite_db_debug_error(&u_ki, err); \
+}
+
+#define PORT_LIMIT_LOW_THRESHOLD_FOR_SYSLOG 7
+/* If max_limit is less than 10, no meaningful throttling can be done;
+ * in that case, log only once per user and never clear the flag
+ * once the user exceeds the limit.
+ */
+#define CHECK_CLEAR_PORT_LIMIT_EXCEED_FLAG(udb, max_limit) \
+ if(PREDICT_FALSE(udb->flags & CNAT_USER_DB_PORT_LIMIT_EXCEEDED)) { \
+ if(udb->ntranslations < \
+ ((max_limit/10)*PORT_LIMIT_LOW_THRESHOLD_FOR_SYSLOG) && \
+ max_limit >= 10) { \
+ udb->flags = udb->flags & (~CNAT_USER_DB_PORT_LIMIT_EXCEEDED); \
+ } \
+ }
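+
+/*
+ * Worked example (annotation only): with max_limit = 100, the exceeded
+ * flag is cleared once ntranslations drops below
+ * (100/10) * PORT_LIMIT_LOW_THRESHOLD_FOR_SYSLOG = 70, i.e. at 70% of
+ * the limit. With max_limit < 10 the integer division yields 0, so the
+ * flag could never clear, hence the max_limit >= 10 guard above.
+ */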
+
+#ifdef TOBE_PORTED
+/* Commented to remove unused variable warning */
+static char *debug_db_error[] = {
+ "no error", /* CNAT_SUCCESS */
+ "no config", /*CNAT_NO_CONFIG*/
+ "not in run state", /*CNAT_NO_VRF_RUN*/
+ "no pool for any", /*CNAT_NO_POOL_ANY*/
+ "no port for any", /*CNAT_NO_PORT_ANY*/
+ "bad in use for any", /*CNAT_BAD_INUSE_ANY*/
+ "not found for any", /*CNAT_NOT_FOUND_ANY*/
+ "invalid index for direct", /*CNAT_INV_PORT_DIRECT*/
+ "deleted addr for direct", /*CNAT_DEL_PORT_DIRECT*/
+ "bad in use for direct",/*CNAT_BAD_INUSE_DIRECT*/
+ "not found for direct",/*CNAT_NOT_FOUND_DIRECT*/
+ "out of port limit", /*CNAT_OUT_LIMIT*/
+ "main db limit", /*CNAT_MAIN_DB_LIMIT*/
+ "user db limit", /*CNAT_USER_DB_LIMIT*/
+ "not static port", /*CNAT_NOT_STATIC_PORT*/
+ "bad static port request", /*CNAT_BAD_STATIC_PORT_REQ*/
+ "not this core", /*CNAT_NOT_THIS_CORE*/
+ "parser error", /*CNAT_ERR_PARSER*/
+ "invalid msg id", /*CNAT_ERR_INVALID_MSG_ID*/
+ "invalid msg size", /*CNAT_ERR_INVALID_MSG_SIZE*/
+ "invalid payload size", /*CNAT_ERR_INVALID_PAYLOAD_SIZE*/
+ "bad tcp udp port", /*CNAT_ERR_BAD_TCP_UDP_PORT*/
+ "bulk single failure", /*CNAT_ERR_BULK_SINGLE_FAILURE*/
+ "xlat id invalid", /*CNAT_ERR_XLAT_ID_INVALID*/
+ "xlat v6 prefix invalid", /*CNAT_ERR_XLAT_V6_PREFIX_INVALID*/
+ "xlat v4 prefix invalid", /*CNAT_ERR_XLAT_V4_PREFIX_INVALID*/
+ "xlat tcp mss invalid", /*CNAT_ERR_XLAT_TCP_MSS_INVALID*/
+ "6rd id invalid", /*CNAT_ERR_6RD_ID_INVALID*/
+ "6rd v4 tunnel src invalid", /*CNAT_ERR_6RD_V4_TUNNEL_SRC_INVALID*/
+ "6rd v6 prefix invalid", /*CNAT_ERR_6RD_V6_PREFIX_INVALID*/
+ "6rd v6 BR unicast invalid", /*CNAT_ERR_6RD_V6_BR_UNICAST_INVALID*/
+ "6rd v4 prefix masklen invalid", /*CNAT_ERR_6RD_V4_PREFIX_MASK_LEN_INVALID*/
+ "6rd v4 suffix masklen invalid", /*CNAT_ERR_6RD_V4_SUFFIX_MASK_LEN_INVALID*/
+ "6rd v4 combo masklen invalid", /*CNAT_ERR_6RD_V4_COMBO_MASK_LEN_INVALID*/
+ "6rd tunnel mtu invalid", /*CNAT_ERR_6RD_TUNNEL_MTU_INVALID*/
+ "6rd tunnel ttl invalid", /*CNAT_ERR_6RD_TUNNEL_TTL_INVALID*/
+ "6rd tunnel tos invalid", /*CNAT_ERR_6RD_TUNNEL_TOS_INVALID*/
+};
+#endif
+
+f64 port_log_timestamps[HASH_TABLE_SIZE]; /* 32 KB array per core */
+
+void port_exceeded_msg_log (u32 src_addr, u16 i_vrf)
+{
+ u32 hash_value;
+ f64 current_timestamp;
+ vlib_main_t *vlib_main;
+
+ vlib_main = vlib_get_main();
+ current_timestamp = vlib_time_now((vlib_main_t *) vlib_main);
+
+ /* Bound the index by the table size so it can never run past the array */
+ hash_value = ((src_addr >> 16) ^ ((src_addr & 0xffff) ^ i_vrf)) % HASH_TABLE_SIZE;
+
+ if (PREDICT_FALSE((current_timestamp - port_log_timestamps[hash_value]) > THROTTLE_TIME)) {
+ u32 arg[2] = {i_vrf, src_addr};
+ /* update timestamp */
+ port_log_timestamps[hash_value] = current_timestamp;
+ spp_printf(CNAT_USER_OUT_OF_PORTS, 2, arg);
+ }
+
+ return ;
+}
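+
+/*
+ * Usage sketch (hypothetical values, not from the original source):
+ * back-to-back calls for the same (src_addr, i_vrf) pair hash to the
+ * same slot, so only the first emits a log until THROTTLE_TIME elapses:
+ *
+ *   port_exceeded_msg_log(0x0A000001, 100); // logs CNAT_USER_OUT_OF_PORTS
+ *   port_exceeded_msg_log(0x0A000001, 100); // suppressed, same hash slot
+ */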
+
+static void log_port_alloc_error(cnat_errno_t error, cnat_key_t *k)
+{
+ u32 error_code;
+ u32 arr[] = {k->k.vrf, k->k.ipv4, k->k.port};
+ switch (error)
+ {
+ case CNAT_NO_POOL_ANY:
+ error_code = CNAT_NO_POOL_FOR_ANY_ERROR;
+ break;
+ case CNAT_NO_PORT_ANY:
+ error_code = CNAT_NO_PORT_FOR_ANY_ERROR;
+ break;
+ case CNAT_ERR_PARSER:
+ error_code = CNAT_WRONG_PORT_ALLOC_TYPE;
+ break;
+ case CNAT_BAD_INUSE_ANY:
+ error_code = CNAT_BAD_INUSE_ANY_ERROR;
+ break;
+ case CNAT_BAD_INUSE_DIRECT:
+ error_code = CNAT_BAD_INUSE_DIRECT_ERROR;
+ break;
+ case CNAT_NOT_FOUND_ANY:
+ error_code = CNAT_NOT_FOUND_ANY_ERROR;
+ break;
+ case CNAT_NOT_FOUND_DIRECT:
+ error_code = CNAT_NOT_FOUND_DIRECT_ERROR;
+ break;
+ case CNAT_INV_PORT_DIRECT:
+ error_code = CNAT_INV_PORT_FOR_DIRECT_ERROR;
+ break;
+ default:
+ error_code = CNAT_NEW_PORT_ALLOC_ERROR; /* If this code is seen in the log,
+ it means new error codes need to be added here */
+ break;
+ }
+ spp_printf(error_code, 3, arr);
+}
+
+void cnat_db_debug_error(cnat_db_key_bucket_t *u_ki,
+ cnat_errno_t error)
+{
+ if (PREDICT_FALSE((u_ki->k.k.vrf == debug_i_vrf) &&
+ ((u_ki->k.k.ipv4 >= debug_i_addr_start) &&
+ (u_ki->k.k.ipv4 <= debug_i_addr_end)))) {
+#ifdef DEBUG_PRINTF_ENABLED
+ PLATFORM_DEBUG_PRINT("failed to allocate port due to %s "
+ "for i-vrf 0x%x addr 0x%x port 0x%x\n",
+ debug_db_error[error], u_ki->k.k.vrf,
+ u_ki->k.k.ipv4, u_ki->k.k.port);
+#endif
+ {
+ u32 arg[] = {u_ki->k.k.vrf, u_ki->k.k.ipv4, u_ki->k.k.port};
+ spp_printf(error, 3, arg);
+ }
+ }
+}
+
+void dslite_db_debug_error(dslite_db_key_bucket_t *u_ki,
+ cnat_errno_t error)
+{
+ if (PREDICT_FALSE((u_ki->dk.ipv4_key.k.vrf == debug_i_vrf) &&
+ ((u_ki->dk.ipv4_key.k.ipv4 >= debug_i_addr_start) &&
+ (u_ki->dk.ipv4_key.k.ipv4 <= debug_i_addr_end)))) {
+#ifdef DEBUG_PRINTF_ENABLED
+ PLATFORM_DEBUG_PRINT("failed to allocate port due to %s "
+ "for i-vrf 0x%x addr 0x%x port 0x%x\n",
+ debug_db_error[error], u_ki->dk.ipv4_key.k.vrf,
+ u_ki->dk.ipv4_key.k.ipv4, u_ki->dk.ipv4_key.k.port);
+#endif
+ {
+ u32 arg[] = {u_ki->dk.ipv4_key.k.vrf, u_ki->dk.ipv4_key.k.ipv4, u_ki->dk.ipv4_key.k.port};
+ spp_printf(error, 3, arg);
+ }
+ }
+}
+
+void cnat_db_debug_i2o_drop(cnat_db_key_bucket_t *ki)
+{
+ if (PREDICT_FALSE(((ki->k.k.vrf & CNAT_VRF_MASK) == debug_i_vrf) &&
+ ((ki->k.k.ipv4 >= debug_i_addr_start) &&
+ (ki->k.k.ipv4 <= debug_i_addr_end)))) {
+#ifdef DEBUG_PRINTF_ENABLED
+ PLATFORM_DEBUG_PRINT("pakcet[i-vrf 0x%x addr 0x%x port 0x%x] dropped\n",
+ ki->k.k.vrf, ki->k.k.ipv4, ki->k.k.port);
+#endif
+ {
+ u32 arg[] = {ki->k.k.vrf, ki->k.k.ipv4, ki->k.k.port};
+ spp_printf(CNAT_PACKET_DROP_ERROR, 3, arg);
+ }
+ }
+}
+
+void cnat_db_in2out_hash_delete (cnat_main_db_entry_t *ep, cnat_user_db_entry_t *up)
+{
+ u64 a, b, c;
+ u32 index, bucket;
+ cnat_main_db_entry_t *this, *prev;
+
+#ifdef DSLITE_DEF
+ if (PREDICT_FALSE(ep->flags & CNAT_DB_DSLITE_FLAG)) {
+ dslite_key_t dk = {
+ {up->ipv6[0], up->ipv6[1], up->ipv6[2], up->ipv6[3]} ,
+ {ep->in2out_key.k.ipv4, ep->in2out_key.k.port, ep->in2out_key.k.vrf}
+ };
+ DSLITE_V6_GET_HASH((&dk),
+ bucket,
+ CNAT_MAIN_HASH_MASK);
+ DSLITE_PRINTF(1, "Delete1 DSL main hash bucket ..%u\n", bucket);
+ } else {
+ CNAT_V4_GET_HASH(ep->in2out_key.key64,
+ bucket, CNAT_MAIN_HASH_MASK)
+ DSLITE_PRINTF(1, "Delete1 NAT44 main hash bucket ..%u\n", bucket);
+ }
+#else
+ CNAT_V4_GET_HASH(ep->in2out_key.key64,
+ bucket, CNAT_MAIN_HASH_MASK)
+#endif
+
+ index = cnat_in2out_hash[bucket].next;
+
+ ASSERT(index != EMPTY);
+
+ prev = 0;
+ do {
+ this = cnat_main_db + index;
+ if (PREDICT_TRUE(this == ep)) {
+ if (prev == 0) {
+ cnat_in2out_hash[bucket].next = ep->in2out_hash.next;
+ return;
+ } else {
+ prev->in2out_hash.next = ep->in2out_hash.next;
+ return;
+ }
+ }
+ prev = this;
+ index = this->in2out_hash.next;
+ } while (index != EMPTY);
+
+ ASSERT(0);
+}
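+
+/*
+ * Chain unlink sketch (annotation only): buckets hold indices into
+ * cnat_main_db, chained through in2out_hash.next and terminated by
+ * EMPTY. Deleting entry B from a bucket:
+ *
+ *   before: bucket -> A -> B -> C -> EMPTY
+ *   after:  bucket -> A ------> C -> EMPTY   (A.next = B.next)
+ *
+ * When B is at the head of the chain, the bucket's own next index is
+ * rewritten instead: the prev == 0 case above.
+ */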
+
+void cnat_db_out2in_hash_delete (cnat_main_db_entry_t *ep)
+{
+ u64 a, b, c;
+ u32 index, bucket;
+ cnat_main_db_entry_t *this, *prev;
+
+ CNAT_V4_GET_HASH(ep->out2in_key.key64,
+ bucket, CNAT_MAIN_HASH_MASK)
+
+ index = cnat_out2in_hash[bucket].next;
+
+ ASSERT(index != EMPTY);
+
+ prev = 0;
+ do {
+ this = cnat_main_db + index;
+ if (PREDICT_TRUE(this == ep)) {
+ if (prev == 0) {
+ cnat_out2in_hash[bucket].next = ep->out2in_hash.next;
+ return;
+ } else {
+ prev->out2in_hash.next = ep->out2in_hash.next;
+ return;
+ }
+ }
+ prev = this;
+ index = this->out2in_hash.next;
+ } while (index != EMPTY);
+
+ ASSERT(0);
+}
+
+cnat_main_db_entry_t*
+cnat_main_db_lookup_entry(cnat_db_key_bucket_t *ki)
+{
+ u64 a, b, c;
+ u32 index;
+ cnat_main_db_entry_t *db;
+
+ CNAT_V4_GET_HASH(ki->k.key64,
+ ki->bucket,
+ CNAT_MAIN_HASH_MASK);
+
+ index = cnat_in2out_hash[ki->bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ return (NULL);
+ }
+
+ do {
+ db = cnat_main_db + index;
+ if (PREDICT_TRUE(db->in2out_key.key64 == ki->k.key64)) {
+ return db;
+ }
+ index = db->in2out_hash.next;
+ } while (index != EMPTY);
+
+ return (NULL);
+}
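+
+/*
+ * Usage sketch (hypothetical values): the caller fills in the i2o key;
+ * the lookup stores the computed bucket in the key structure as a side
+ * effect, so the same cnat_db_key_bucket_t can later seed entry
+ * creation:
+ *
+ *   cnat_db_key_bucket_t ki;
+ *   ki.k.k.vrf  = i_vrf_with_proto_bits; // assumed already encoded
+ *   ki.k.k.ipv4 = inside_addr;
+ *   ki.k.k.port = inside_port;
+ *   cnat_main_db_entry_t *db = cnat_main_db_lookup_entry(&ki);
+ *   if (!db) { ... miss: allocate via cnat_get_main_db_entry_v2() ... }
+ */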
+
+void cnat_user_db_delete (cnat_user_db_entry_t *up)
+{
+ u64 a, b, c;
+ u32 index, bucket;
+ cnat_user_db_entry_t *this, *prev;
+
+ if (PREDICT_FALSE((up->flags & CNAT_USER_DB_NAT64_FLAG) != 0)) {
+ /* Preventive check - Not a NAT44 entry */
+ return;
+ }
+
+#if 1
+ if(PREDICT_FALSE(up->flags & CNAT_USER_DB_DSLITE_FLAG)) {
+ dslite_key_t dk = {
+ {up->ipv6[0], up->ipv6[1], up->ipv6[2], up->ipv6[3]} ,
+ {{up->key.k.ipv4, up->key.k.port, up->key.k.vrf}}
+ };
+
+ DSLITE_V6_GET_HASH((&dk),
+ bucket,
+ CNAT_USER_HASH_MASK);
+ DSLITE_PRINTF(1, "Delete1 DSL user hash bucket ..%u\n", bucket);
+ } else {
+ CNAT_V4_GET_HASH(up->key.key64,
+ bucket, CNAT_USER_HASH_MASK)
+ DSLITE_PRINTF(1, "Delete1 NAT44 user hash bucket ..%u\n", bucket);
+ }
+#else
+ CNAT_V4_GET_HASH(up->key.key64,
+ bucket, CNAT_USER_HASH_MASK)
+ DSLITE_PRINTF(1, "Delete2 NAT44 user hash bucket ..%u\n", bucket);
+#endif
+
+ index = cnat_user_hash[bucket].next;
+
+ ASSERT(index != EMPTY);
+
+ prev = 0;
+ do {
+ this = cnat_user_db + index;
+ if (PREDICT_TRUE(this == up)) {
+ if (prev == 0) {
+ cnat_user_hash[bucket].next = up->user_hash.next;
+ goto found;
+ } else {
+ prev->user_hash.next = up->user_hash.next;
+ goto found;
+ }
+ }
+ prev = this;
+ index = this->user_hash.next;
+ } while (index != EMPTY);
+
+ ASSERT(0);
+
+ found:
+ pool_put(cnat_user_db, up);
+}
+
+cnat_user_db_entry_t*
+cnat_user_db_lookup_entry(cnat_db_key_bucket_t *uki)
+{
+ u64 a, b, c;
+ u32 index;
+ cnat_user_db_entry_t *udb=NULL;
+
+ CNAT_V4_GET_HASH(uki->k.key64,
+ uki->bucket,
+ CNAT_USER_HASH_MASK)
+
+ /* now: index in user vector */
+ index = cnat_user_hash[uki->bucket].next;
+ if (PREDICT_TRUE(index != EMPTY)) {
+ do {
+ udb = cnat_user_db + index;
+ if (PREDICT_FALSE(udb->key.key64 == uki->k.key64)) {
+ return udb;
+ }
+ index = udb->user_hash.next;
+ } while (index != EMPTY);
+ }
+ return (NULL);
+}
+
+cnat_user_db_entry_t*
+cnat_user_db_create_entry(cnat_db_key_bucket_t *uki,
+ u32 portmap_index)
+{
+ cnat_user_db_entry_t *udb = NULL;
+
+ pool_get(cnat_user_db, udb);
+ memset(udb, 0, sizeof(*udb));
+
+ udb->ntranslations = 1;
+ udb->portmap_index = portmap_index;
+ udb->key.key64 = uki->k.key64;
+ /* Add this user to the head of the bucket chain */
+ udb->user_hash.next =
+ cnat_user_hash[uki->bucket].next;
+ cnat_user_hash[uki->bucket].next = udb - cnat_user_db;
+
+#ifndef NO_BULK_LOGGING
+ INIT_BULK_CACHE(udb)
+#endif /* NO_BULK_LOGGING */
+ return udb;
+}
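+
+/*
+ * Design note (annotation): new users are inserted at the head of the
+ * bucket chain, so creation is O(1) and recently created subscribers
+ * are found first. The chain stores the pool index (udb - cnat_user_db)
+ * rather than a pointer, presumably because the pool's backing vector
+ * may be reallocated as it grows.
+ */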
+
+cnat_main_db_entry_t*
+cnat_create_main_db_entry_and_hash(cnat_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_user_db_entry_t *udb)
+{
+ u64 a, b, c;
+ u32 db_index;
+ cnat_main_db_entry_t *db = NULL;
+
+ pool_get(cnat_main_db, db);
+ memset(db, 0, sizeof(*db));
+
+ db_index = db - cnat_main_db;
+ db->in2out_key.k.ipv4 = ki->k.k.ipv4;
+ db->in2out_key.k.port = ki->k.k.port;
+ db->in2out_key.k.vrf = ki->k.k.vrf;
+ db->out2in_key.k.ipv4 = ko->k.k.ipv4;
+ db->out2in_key.k.port = ko->k.k.port;
+ db->out2in_key.k.vrf = ko->k.k.vrf;
+
+ db->user_ports.next = db_index;
+ db->user_ports.prev = db_index;
+ db->user_index = udb - cnat_user_db;
+ //db->portmap_index = udb->portmap_index;
+ db->flags &= ~(CNAT_DB_DSLITE_FLAG); // Mark that it is not dslite
+ if (PREDICT_FALSE(udb->ntranslations == 1)) {
+ /*
+ * first port for this src vrf/src ip addr
+ */
+ udb->translation_list_head_index = db_index;
+ } else {
+ index_dlist_addtail(udb->translation_list_head_index,
+ (u8 *)cnat_main_db, sizeof(cnat_main_db[0]),
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports),
+ db_index);
+ }
+
+ /*
+ * setup o2i hash key
+ */
+ CNAT_V4_GET_HASH(ko->k.key64,
+ ko->bucket,
+ CNAT_MAIN_HASH_MASK)
+ db->out2in_hash.next = cnat_out2in_hash[ko->bucket].next;
+ cnat_out2in_hash[ko->bucket].next = db_index;
+ /*
+ * setup i2o hash key; the bucket was already calculated during lookup
+ */
+ db->in2out_hash.next = cnat_in2out_hash[ki->bucket].next;
+ cnat_in2out_hash[ki->bucket].next = db_index;
+
+#if DEBUG > 1
+ printf("\nMy_Instance_Number %d: Bucket %d, Db_Index %d",
+ my_instance_number, ki->bucket, db_index);
+ printf("\nInside (VRF 0x%x, IP 0x%x, PORT 0x%x)",
+ db->in2out_key.k.vrf, db->in2out_key.k.ipv4, db->in2out_key.k.port);
+ printf("\nOutside (VRF 0x%x, IP 0x%x, PORT 0x%x)",
+ db->out2in_key.k.vrf, db->out2in_key.k.ipv4, db->out2in_key.k.port);
+ printf("\nUser Index %d, IP 0x%x",
+ db->user_index, udb->key.k.ipv4);
+#endif
+
+ NAT44_COMMON_STATS.active_translations++;
+
+ return db;
+}
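+
+/*
+ * Layout sketch (annotation only): each main db entry ends up on two
+ * hash chains and on its owning user's translation dlist:
+ *
+ *   cnat_in2out_hash[ki->bucket] -> db -> ...   (bucket precomputed)
+ *   cnat_out2in_hash[ko->bucket] -> db -> ...   (bucket computed here)
+ *   udb->translation_list_head_index <-> db->user_ports  (doubly linked)
+ */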
+
+static inline void pptp_clear_all_channels(
+ cnat_main_db_entry_t *db)
+{
+ u32 db_index, current_db_index;
+ cnat_main_db_entry_t *temp_db;
+
+ /* clear all channels */
+
+ db_index = db->proto_data.pptp_list.next;
+ current_db_index = db - cnat_main_db;
+
+ while( db_index != EMPTY) {
+ temp_db = cnat_main_db + db_index;
+ db_index = temp_db->proto_data.pptp_list.next;
+ temp_db->entry_expires = 0;
+ if(PREDICT_FALSE(temp_db->proto_data.pptp_list.prev
+ == current_db_index)) { // Decouple child GREs from parent
+ temp_db->proto_data.pptp_list.prev = EMPTY;
+ }
+ }
+
+ db->proto_data.pptp_list.next = EMPTY;
+}
+
+void pptp_remove_channel_from_tunnel(cnat_main_db_entry_t *db) {
+
+ cnat_main_db_entry_t *prev_db, *next_db;
+
+ prev_db = cnat_main_db + db->proto_data.pptp_list.prev;
+ next_db = cnat_main_db + db->proto_data.pptp_list.next;
+
+ /* remove entry from the tunnel list */
+ if(PREDICT_TRUE(db->proto_data.pptp_list.prev != EMPTY)) {
+ prev_db->proto_data.pptp_list.next =
+ db->proto_data.pptp_list.next ;
+ }
+
+ if(db->proto_data.pptp_list.next != EMPTY) {
+ next_db->proto_data.pptp_list.prev
+ = db->proto_data.pptp_list.prev;
+ }
+
+}
+
+void cnat_delete_main_db_entry_v2 (cnat_main_db_entry_t *ep)
+{
+ u32 main_db_index;
+ u32 vrfmap_len, udb_len;
+ cnat_user_db_entry_t *up =0;
+ cnat_portmap_v2_t *pm =0;
+ cnat_portmap_v2_t *my_pm =0;
+ cnat_vrfmap_t *my_vrfmap =0;
+ u16 static_port_range;
+#ifndef NO_BULK_LOGGING
+ bulk_alloc_size_t bulk_size;
+ int nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+#endif
+ pool_header_t *h = pool_header(cnat_user_db);
+ u16 instance = 0;
+ u32 my_index;
+
+
+ if (PREDICT_FALSE((ep->flags & CNAT_DB_NAT64_FLAG) != 0)) {
+ /* Preventive check - Not a NAT44 entry */
+ return;
+ }
+
+ if(PREDICT_FALSE(ep->flags &
+ CNAT_DB_FLAG_PPTP_TUNNEL_ACTIVE)) {
+ pptp_clear_all_channels(ep);
+ PPTP_DECR(active_tunnels);
+ }
+
+ if(PREDICT_FALSE(ep->flags &
+ CNAT_DB_FLAG_PPTP_GRE_ENTRY)) {
+ pptp_remove_channel_from_tunnel(ep);
+ PPTP_DECR(active_channels);
+ }
+
+ /* This function gets called from various locations
+ * (often from the config handler), so we need
+ * to ensure that multiple sessions, if any, are
+ * released.
+ */
+
+ if(PREDICT_FALSE(ep->nsessions > 1)) {
+ cnat_session_entry_t *sdb;
+ while(ep->nsessions > 1 &&
+ ep->session_head_index != EMPTY) {
+ sdb = cnat_session_db + ep->session_head_index;
+ cnat_delete_session_db_entry(sdb, TRUE);
+ }
+ }
+
+ /* Find the set of portmaps for the outside vrf */
+ vrfmap_len = vec_len(cnat_map_by_vrf);
+ udb_len = vec_len(cnat_user_db);
+
+ /* In case of an invalid user, just return. Deleting only the main db
+ * entry is not a good idea, since some valid user db entry might still
+ * point to it, which would leave the dbs in an inconsistent state.
+ */
+ if (PREDICT_FALSE((ep->user_index >= udb_len) ||
+ (clib_bitmap_get(h->free_bitmap, ep->user_index)))) {
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("invalid/unused user index in db %d\n", ep->user_index);
+#endif
+ spp_printf(CNAT_INV_UNUSED_USR_INDEX, 1, (u32 *) &(ep->user_index));
+ cnat_main_db_entry_dump(ep);
+ return;
+ }
+
+ up = cnat_user_db + ep->user_index;
+
+/* Point to the right portmap list */
+if (PREDICT_FALSE(ep->flags & CNAT_DB_DSLITE_FLAG)) {
+ instance = ep->dslite_nat44_inst_id;
+ pm = dslite_table_db_ptr[instance].portmap_list;
+ if(PREDICT_FALSE((pm == NULL))) {
+ DSLITE_PRINTF(3, "NULL portmap list for dslite_id %u, state %u\n",
+ instance, dslite_table_db_ptr[instance].state);
+ cnat_main_db_entry_dump(ep);
+ goto delete_entry;
+ }
+ static_port_range =
+ STAT_PORT_RANGE_FROM_INST_PTR(&(dslite_table_db_ptr[instance]));
+ /*
+ * Netflow logging API for delete event
+ */
+ bulk_size =
+ BULKSIZE_FROM_VRFMAP(&(dslite_table_db_ptr[instance]));
+} else {
+ if (PREDICT_FALSE(ep->vrfmap_index >= vrfmap_len)) {
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("invalid vrfmap index in db\n");
+#endif
+ spp_printf(CNAT_INVALID_VRFMAP_INDEX, 0, NULL);
+ cnat_main_db_entry_dump(ep);
+ goto delete_entry;
+ }
+ instance = NAT44_RESERVED_INST_ID;
+ my_vrfmap = cnat_map_by_vrf + ep->vrfmap_index;
+ pm = my_vrfmap->portmap_list;
+ static_port_range = cnat_static_port_range;
+ bulk_size = BULKSIZE_FROM_VRFMAP(my_vrfmap);
+}
+
+ if (PREDICT_FALSE(ep->flags & CNAT_DB_FLAG_PORT_PAIR)) {
+ /* Give back the port(s) */
+ cnat_port_free_v2_bulk(pm, up->portmap_index,
+ PORT_PAIR, ep->out2in_key.k.port, up, static_port_range
+#ifndef NO_BULK_LOGGING
+ , bulk_size, &nfv9_log_req
+#endif
+ );
+ } else {
+ /* Give back the port(s) */
+ cnat_port_free_v2_bulk (pm, up->portmap_index,
+ PORT_SINGLE, ep->out2in_key.k.port, up, static_port_range
+#ifndef NO_BULK_LOGGING
+ , bulk_size, &nfv9_log_req
+#endif
+ );
+ }
+
+ if (PREDICT_TRUE(!(ep->flags & CNAT_DB_DSLITE_FLAG))) {
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ if(PREDICT_FALSE(my_vrfmap->nf_logging_policy == SESSION_LOG_ENABLE)) {
+ if(ep->nsessions != 0) {
+ cnat_nfv9_nat44_log_session_delete(ep, NULL, my_vrfmap);
+ }
+ } else {
+ cnat_nfv9_log_mapping_delete(ep, my_vrfmap
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ if(PREDICT_TRUE((my_vrfmap->syslog_logging_policy != SESSION_LOG_ENABLE) ||
+ (ep->nsessions != 0))) {
+ cnat_syslog_nat44_mapping_delete(ep, my_vrfmap, NULL
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ }
+ } else {
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ if(PREDICT_FALSE( dslite_table_db_ptr[instance].nf_logging_policy ==
+ SESSION_LOG_ENABLE)) {
+ cnat_nfv9_ds_lite_log_session_delete(ep,
+ (dslite_table_db_ptr + instance),NULL);
+ } else {
+ cnat_nfv9_ds_lite_mapping_delete(ep,
+ (dslite_table_db_ptr + instance)
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+#ifdef TOBE_PORTED
+ cnat_syslog_ds_lite_mapping_delete(ep,
+ (dslite_table_db_ptr + instance), NULL
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+#endif /* TOBE_PORTED */
+ }
+ }
+
+delete_entry:
+
+ main_db_index = ep - cnat_main_db;
+
+ up->ntranslations--;
+
+ /*
+ * When a user reaches the max allowed port limit we generate an ICMP
+ * msg and increment a counter; once the counter reaches the ICMP msg
+ * rate limit, ICMP msg generation stops. When a user port is freed we
+ * clear the counter so that, the next time the port limit is reached,
+ * new ICMP msgs can be generated again.
+ */
+ up->icmp_msg_count = 0;
+
+ up->translation_list_head_index = index_dlist_remelem (
+ up->translation_list_head_index, (u8 *)cnat_main_db,
+ sizeof (cnat_main_db[0]),
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports),
+ main_db_index);
+
+ cnat_db_in2out_hash_delete(ep, up);
+
+ if (PREDICT_FALSE(up->ntranslations == 0)) {
+ ASSERT(up->translation_list_head_index == EMPTY);
+ nat44_dslite_common_stats[instance].num_subscribers--;
+ my_index = up->portmap_index;
+ my_pm = pm + my_index;
+ if(PREDICT_TRUE(my_pm->private_ip_users_count)) {
+ my_pm->private_ip_users_count--;
+#ifdef DEBUG_PRINTF_IP_N_TO_1_ENABLED
+ PLATFORM_DEBUG_PRINT("\n cnat_delete_main_db_entry_v2 "
+ "private_ip_users_count = %d",
+ my_pm->private_ip_users_count);
+#endif
+
+ }
+ cnat_user_db_delete(up);
+
+ }
+
+ /* Remove from main DB hashes */
+ //cnat_db_in2out_hash_delete(ep);
+ cnat_db_out2in_hash_delete(ep);
+
+ /* Update the stats before returning the entry to the pool;
+ * reading ep->flags after pool_put would touch a freed element.
+ */
+ if(PREDICT_FALSE(ep->flags & CNAT_DB_FLAG_STATIC_PORT)) {
+ nat44_dslite_common_stats[instance].num_static_translations--;
+ } else {
+ nat44_dslite_common_stats[instance].num_dynamic_translations--;
+ }
+ nat44_dslite_common_stats[instance].active_translations--;
+ nat44_dslite_global_stats[!!(instance - 1)].translation_delete_count ++;
+
+ pool_put(cnat_main_db, ep);
+}
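+
+/*
+ * Teardown order sketch (annotation only): extra sessions are released
+ * first, then the port(s) are returned to the portmap, delete events
+ * are logged, the entry is unlinked from the user's translation dlist
+ * and the i2o hash, the user db entry is deleted if this was its last
+ * translation, the o2i hash is unlinked, and finally the stats are
+ * updated and the entry is returned to the pool.
+ */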
+
+cnat_main_db_entry_t*
+cnat_main_db_lookup_entry_out2in (cnat_db_key_bucket_t *ko)
+{
+ u64 a, b, c;
+ u32 index;
+ cnat_main_db_entry_t *db;
+
+ CNAT_V4_GET_HASH(ko->k.key64,
+ ko->bucket,
+ CNAT_MAIN_HASH_MASK);
+
+ index = cnat_out2in_hash[ko->bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ return (NULL);
+ }
+
+ do {
+ db = cnat_main_db + index;
+ if (PREDICT_TRUE(db->out2in_key.key64 == ko->k.key64)) {
+ return db;
+ }
+ index = db->out2in_hash.next;
+ } while (index != EMPTY);
+
+ return (NULL);
+}
+
+/* Creates 2 sessions:
+ * moves the default dest info from the mdb into the first session,
+ * fills the dest_info details into the second session, and
+ * returns the pointer to the second session.
+ */
+cnat_session_entry_t *cnat_handle_1to2_session(
+ cnat_main_db_entry_t *mdb,
+ cnat_key_t *dest_info)
+{
+ cnat_key_t old_dest_info;
+ pool_header_t *h;
+ u32 free_session = 0;
+ u16 instance;
+ cnat_session_entry_t *session_db1 = NULL, *session_db2 = NULL;
+
+ h = pool_header(cnat_session_db);
+ free_session = vec_len(h->free_indices) - 1;
+
+ if (PREDICT_FALSE(free_session < 2)) {
+ if (mdb->flags & CNAT_DB_DSLITE_FLAG) {
+ instance = mdb->dslite_nat44_inst_id;
+ } else {
+ instance = NAT44_RESERVED_INST_ID;
+ }
+
+ /* we need 2 sessions here, return NULL */
+ nat44_dslite_common_stats[instance].drops_sessiondb_limit_exceeded++;
+ return NULL;
+ }
+
+ old_dest_info.k.ipv4 = mdb->dst_ipv4;
+ old_dest_info.k.port = mdb->dst_port;
+ old_dest_info.k.vrf = mdb->in2out_key.k.vrf;
+
+ /* create 2 new sessions */
+ session_db1 = cnat_create_session_db_entry(&old_dest_info,
+ mdb, FALSE);
+
+ if(PREDICT_FALSE(session_db1 == NULL)) {
+ return NULL;
+ }
+
+ /* update pkt info to session 2 */
+ session_db2 = cnat_create_session_db_entry(dest_info,
+ mdb, TRUE);
+
+ if(PREDICT_FALSE(session_db2 == NULL)) {
+ cnat_delete_session_db_entry(session_db1, FALSE);
+ return NULL;
+ }
+ /* update main db info to session 1 */
+ cnat_dest_update_main2session(mdb, session_db1);
+
+ return session_db2;
+}
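+
+/*
+ * Flow sketch (annotation only): promoting a single-session entry to
+ * multi-session. The destination embedded in the mdb moves into
+ * session 1; the new destination lands in session 2:
+ *
+ *   before: mdb { dst = D1 }, no session db entries
+ *   after:  mdb -> session1 { dest = D1 }
+ *               -> session2 { dest = D2 }   (returned to the caller)
+ *
+ * If session 2 cannot be allocated, session 1 is rolled back and the
+ * mdb keeps its original single-destination form.
+ */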
+
+/* The below function should be called only
+ * when a NAT44 STATIC entry receives traffic
+ * for the first time, to ensure
+ * the destination is noted and logged.
+ */
+void cnat_add_dest_n_log(
+ cnat_main_db_entry_t *mdb,
+ cnat_key_t *dest_info)
+{
+
+ if(PREDICT_FALSE(mdb->nsessions != 0)) {
+ return; /* Should not have been called */
+ }
+
+ mdb->dst_ipv4 = dest_info->k.ipv4;
+ mdb->dst_port = dest_info->k.port;
+ mdb->nsessions = 1;
+ mdb->entry_expires = cnat_current_time;
+ u16 instance;
+
+ if (mdb->flags & CNAT_DB_DSLITE_FLAG) {
+ instance = mdb->dslite_nat44_inst_id;
+ cnat_session_log_ds_lite_mapping_create(mdb,
+ (dslite_table_db_ptr + instance),NULL);
+ } else {
+ instance = NAT44_RESERVED_INST_ID;
+ cnat_vrfmap_t *my_vrfmap = cnat_map_by_vrf + mdb->vrfmap_index;
+ cnat_session_log_nat44_mapping_create(mdb, 0, my_vrfmap);
+ }
+}
+
+/*
+ * this function is called by the exception node
+ * when lookup has failed in the i2o node
+ *
+ * if the per-user port limit is reached,
+ * set the user_db_entry pointer and error == CNAT_OUT_LIMIT
+ */
+cnat_main_db_entry_t*
+cnat_get_main_db_entry_v2(cnat_db_key_bucket_t *ki,
+ port_pair_t port_pair_type,
+ port_type_t port_type,
+ cnat_gen_icmp_info *info,
+ cnat_key_t *dest_info)
+{
+ u16 protocol;
+ cnat_errno_t rv;
+ cnat_db_key_bucket_t u_ki, ko;
+ u32 my_index, free_main, free_user;
+ u32 current_timestamp;
+ u16 my_vrfmap_index;
+ u16 my_vrfmap_entry_found = 0;
+ cnat_vrfmap_t *my_vrfmap =0;
+ cnat_portmap_v2_t *pm =0;
+ cnat_user_db_entry_t *udb = 0;
+ cnat_main_db_entry_t *db = 0;
+ pool_header_t *h;
+ u16 port_limit;
+ cnat_portmap_v2_t *my_pm = 0;
+
+#ifndef NO_BULK_LOGGING
+ int nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+#endif
+
+
+ /*
+ * need to try the lookup again because, at high line rate,
+ * a second pkt may arrive here before the entry
+ * triggered by the first pkt has been created.
+ */
+ info->gen_icmp_msg = CNAT_NO_ICMP_MSG;
+ info->error = CNAT_SUCCESS;
+ db = cnat_main_db_lookup_entry(ki);
+ if (PREDICT_TRUE(db)) {
+ /* What if the source is talking to a
+ * new dest now? We have to handle this case:
+ * create a session db entry and log it.
+ */
+ if(PREDICT_FALSE((!dest_info->k.ipv4) && (!dest_info->k.port))) {
+ return db; /* if dest_info is null don't create session */
+ }
+ if(PREDICT_TRUE((db->dst_ipv4 == dest_info->k.ipv4) &&
+ (db->dst_port == dest_info->k.port))) {
+ return db;
+ }
+ dest_info->k.vrf = db->in2out_key.k.vrf;
+ /* Src is indeed talking to a different dest */
+ cnat_session_entry_t *session_db2 = NULL;
+ if(PREDICT_TRUE(db->nsessions == 1)) {
+ session_db2 = cnat_handle_1to2_session(db, dest_info);
+ if(PREDICT_TRUE(session_db2 != NULL)) {
+ CNAT_DB_TIMEOUT_RST(session_db2);
+ return db;
+ } else {
+ info->error = CNAT_ERR_NO_SESSION_DB;
+ return NULL;
+ }
+ } else if(PREDICT_FALSE(db->nsessions == 0)) {
+ /* Should be a static entry; this should never happen otherwise
+ */
+ if(PREDICT_TRUE(dest_info->k.ipv4 != 0)) {
+ cnat_add_dest_n_log(db, dest_info);
+ }
+ return db;
+ } else {
+ /* The src has already created multiple sessions; a very rare case
+ */
+ session_db2 = cnat_create_session_db_entry(dest_info,
+ db, TRUE);
+ if(PREDICT_TRUE(session_db2 != NULL)) {
+ CNAT_DB_TIMEOUT_RST(session_db2);
+ return db;
+ } else {
+ info->error = CNAT_ERR_NO_SESSION_DB;
+ return NULL;
+ }
+ }
+
+ }
+
+ /*
+ * step 1. check whether the outside vrf is configured
+ * and find the set of portmaps for the outside vrf.
+ * The inside vrf is mapped one-to-one to the outside vrf;
+ * the key is vrf and ip only.
+ * ki.k.k.vrf has protocol bits, so mask them out.
+ */
+ protocol = ki->k.k.vrf & CNAT_PRO_MASK;
+ u_ki.k.k.vrf = ki->k.k.vrf & CNAT_VRF_MASK;
+ u_ki.k.k.ipv4 = ki->k.k.ipv4;
+ u_ki.k.k.port = 0;
+
+ my_vrfmap_index = vrf_map_array[u_ki.k.k.vrf];
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+
+ my_vrfmap_entry_found = ((my_vrfmap_index != VRF_MAP_ENTRY_EMPTY) &&
+ (my_vrfmap->status == S_RUN) &&
+ (my_vrfmap->i_vrf == u_ki.k.k.vrf));
+
+ if (PREDICT_FALSE(!my_vrfmap_entry_found)) {
+ u32 arr[] = {ki->k.k.vrf, ki->k.k.ipv4, ki->k.k.port};
+ if ((my_vrfmap_index == VRF_MAP_ENTRY_EMPTY) ||
+ (my_vrfmap->i_vrf == u_ki.k.k.vrf)) {
+ info->error = CNAT_NO_CONFIG;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_NO_CONFIG)
+ spp_printf(CNAT_NO_CONFIG_ERROR, 3, arr);
+ } else {
+ info->error = CNAT_NO_VRF_RUN;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_NO_VRF_RUN)
+ spp_printf(CNAT_NO_VRF_RUN_ERROR, 3, arr);
+ }
+
+ return (NULL);
+ }
+
+ pm = my_vrfmap->portmap_list;
+
+ port_limit = my_vrfmap->port_limit;
+ if(PREDICT_FALSE(!port_limit)) {
+ port_limit = cnat_main_db_max_ports_per_user;
+ }
+ /*
+ * set o2i key with protocol bits
+ */
+ ko.k.k.vrf = my_vrfmap->o_vrf | protocol;
+
+ /*
+ * step 2. check if the src vrf, src ip addr is already
+ * in the user db
+ * if yes, use PORT_ALLOC_DIRECTED
+ * if no, use PORT_ALLOC_ANY since it is the first time
+ */
+ udb = cnat_user_db_lookup_entry(&u_ki);
+ if (PREDICT_TRUE(udb)) {
+ /*
+ * not first time allocate port for this user
+ * check limit
+ */
+ if (PREDICT_FALSE(udb->ntranslations >=
+ port_limit)) {
+ /* Check for the port type here. If we are getting
+ * a STATIC PORT, allow the config.
+ */
+ if (PREDICT_TRUE(port_type != PORT_TYPE_STATIC)) {
+ info->error = CNAT_OUT_LIMIT;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(CNAT_OUT_LIMIT)
+ port_exceeded_msg_log(u_ki.k.k.ipv4, u_ki.k.k.vrf);
+ in2out_drops_port_limit_exceeded ++;
+ u_ki.k.k.port = ki->k.k.port;
+ u_ki.k.k.vrf = ki->k.k.vrf;
+ handle_cnat_port_exceeded_logging(udb, &u_ki.k, my_vrfmap);
+ return (NULL);
+ }
+ }
+ CHECK_CLEAR_PORT_LIMIT_EXCEED_FLAG(udb,
+ port_limit)
+
+ /*
+ * check if the main db has space to accommodate a new entry
+ */
+ h = pool_header(cnat_main_db);
+
+ free_main = vec_len(h->free_indices) - 1;
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ CNAT_SET_ICMP_MSG_INFO
+ in2out_drops_system_limit_reached ++;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+
+ current_timestamp = spp_trace_log_get_unix_time_in_seconds();
+ if (PREDICT_FALSE((current_timestamp - last_log_timestamp) >
+ 1800)) {
+ spp_printf(CNAT_SESSION_THRESH_EXCEEDED, 0, NULL);
+ last_log_timestamp = current_timestamp;
+ }
+
+#ifdef UT_TEST_CODE
+ printf("Limit reached : OLD USER");
+#endif
+ return NULL;
+ }
+
+ /*
+ * allocate port, from existing mapping
+ */
+ my_index = udb->portmap_index;
+
+ if (PREDICT_FALSE(port_type == PORT_TYPE_STATIC)) {
+ rv = cnat_static_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ ki->k.k.ipv4,
+ ki->k.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ ,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , my_vrfmap->ip_n_to_1
+ );
+
+ } else if (PREDICT_TRUE(port_type != PORT_TYPE_RTSP) ) {
+
+ rv = cnat_dynamic_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ ,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , my_vrfmap->ip_n_to_1,
+ &(my_vrfmap->rseed_ip)
+ );
+
+ } else {
+ /*
+ * For RTSP, two translation entries are created, so
+ * check if the main db has space to accommodate two new entries
+ */
+ free_main = free_main - 1;
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ CNAT_SET_ICMP_MSG_INFO
+ in2out_drops_system_limit_reached ++;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+
+ return NULL;
+ } else {
+ rv = cnat_dynamic_port_alloc_rtsp_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ ki->k.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ ,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , &(my_vrfmap->rseed_ip)
+ );
+ }
+ }
+
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(rv)
+ in2out_drops_resource_depletion++;
+ log_port_alloc_error(rv, &(ki->k));
+ return (NULL);
+ }
+ /*
+ * increment port in use for this user
+ */
+ udb->ntranslations += 1;
+
+ } else {
+ /*
+ * first time allocate port for this user
+ */
+
+ /*
+ * Do not create entry if port limit is invalid
+ */
+
+ if (PREDICT_FALSE(!port_limit)) {
+ if (PREDICT_TRUE(port_type != PORT_TYPE_STATIC)) {
+ info->error = CNAT_OUT_LIMIT;
+ in2out_drops_port_limit_exceeded ++;
+ port_exceeded_msg_log(u_ki.k.k.ipv4, u_ki.k.k.vrf);
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(CNAT_OUT_LIMIT)
+ return (NULL);
+ }
+ }
+
+ /*
+ * Check if the main db has space for a new entry.
+ * Allowing a user db entry to be created when the main db is full
+ * would allocate a port to that user only to waste it,
+ * hence the check is done here.
+ */
+ h = pool_header(cnat_main_db);
+ free_main = vec_len(h->free_indices) - 1;
+ h = pool_header(cnat_user_db);
+ free_user = vec_len(h->free_indices) - 1;
+
+ /*
+ * If either main_db or user_db does not have entries
+ * bail out, with appropriate error
+ */
+ if (PREDICT_FALSE(!(free_main && free_user))) {
+ u32 log_error;
+ if(free_main) {
+ info->error = CNAT_USER_DB_LIMIT;
+ log_error = CNAT_USER_DB_LIMIT_ERROR;
+ } else {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ log_error = CNAT_MAIN_DB_LIMIT_ERROR;
+ }
+ in2out_drops_system_limit_reached ++;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(info->error)
+ spp_printf(log_error, 0, 0);
+ return NULL;
+ }
+
+ if (PREDICT_FALSE(port_type == PORT_TYPE_STATIC)) {
+ rv = cnat_static_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_ANY,
+ port_pair_type,
+ ki->k.k.ipv4,
+ ki->k.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ ,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , my_vrfmap->ip_n_to_1
+ );
+
+ } else if (PREDICT_TRUE(port_type != PORT_TYPE_RTSP)) {
+ rv = cnat_dynamic_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_ANY,
+ port_pair_type,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ , NULL, BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , my_vrfmap->ip_n_to_1,
+ &(my_vrfmap->rseed_ip)
+ );
+ } else {
+ /*
+ * For RTSP, two translation entries are created, so
+ * check if the main db has space to accommodate two new entries
+ */
+ free_main = free_main - 1;
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ CNAT_SET_ICMP_MSG_INFO
+ in2out_drops_system_limit_reached ++;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+
+ return NULL;
+ } else {
+
+ rv = cnat_dynamic_port_alloc_rtsp_bulk(pm,
+ PORT_ALLOC_ANY,
+ port_pair_type,
+ ki->k.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ , NULL, BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , &(my_vrfmap->rseed_ip)
+ );
+ /* TODO: Add the port pair flag here */
+ }
+ }
+
+
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ in2out_drops_resource_depletion ++;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(rv)
+ log_port_alloc_error(rv, &(ki->k));
+ return (NULL);
+ }
+ /*
+ * create entry in user db
+ */
+ udb = cnat_user_db_create_entry(&u_ki, my_index);
+ NAT44_COMMON_STATS.num_subscribers++;
+ my_pm = pm + my_index;
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < PORTS_PER_ADDR)) {
+ my_pm->private_ip_users_count++;
+#ifdef DEBUG_PRINTF_IP_N_TO_1_ENABLED
+ PLATFORM_DEBUG_PRINT("\n cnat_get_main_db_entry_v2 "
+ "dynamic alloc private_ip_users_count = %d",
+ my_pm->private_ip_users_count);
+#endif
+ } else {
+ PLATFORM_DEBUG_PRINT("\n ERROR: private_ip_users_count has "
+ "reached MAX PORTS_PER_ADDR");
+ }
+#ifndef NO_BULK_LOGGING
+ if(PREDICT_TRUE(udb && (BULK_ALLOC_NOT_ATTEMPTED != nfv9_log_req))) {
+ cnat_update_bulk_range_cache(udb, ko.k.k.port,
+ BULKSIZE_FROM_VRFMAP(my_vrfmap));
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+
+ }
+
+ /*
+ * step 3:
+ * outside port is allocated for this src vrf/src ip addr
+ * 1)create a new entry in main db
+ * 2)setup cnat_out2in_hash key
+ * 3)setup cnat_in2out_hash key
+ */
+ db = cnat_create_main_db_entry_and_hash(ki, &ko, udb);
+
+ translation_create_count ++;
+#ifdef DSLITE_DEF
+ db->dslite_nat44_inst_id = NAT44_RESERVED_INST_ID;
+#endif
+ db->vrfmap_index = my_vrfmap - cnat_map_by_vrf;
+
+ /*
+ * don't forget logging
+ * logging API is unconditional,
+ * logging configuration check is done inside the inline function
+ */
+
+ db->dst_ipv4 = dest_info->k.ipv4;
+ db->dst_port = dest_info->k.port;
+ if(PREDICT_TRUE(db->dst_ipv4 || db->dst_port)) {
+ db->nsessions++;
+ }
+
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ if(PREDICT_FALSE(my_vrfmap->nf_logging_policy == SESSION_LOG_ENABLE)) {
+ /* do not log for static entries.. we will log when traffic flows */
+ if(PREDICT_TRUE(db->dst_ipv4 || db->dst_port)) {
+ cnat_nfv9_nat44_log_session_create(db, 0, my_vrfmap);
+ }
+ } else {
+ cnat_nfv9_log_mapping_create(db, my_vrfmap
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ if(PREDICT_TRUE((my_vrfmap->syslog_logging_policy != SESSION_LOG_ENABLE) ||
+ (db->dst_ipv4 || db->dst_port))) {
+ cnat_syslog_nat44_mapping_create(db, my_vrfmap, 0
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ }
+ if (PREDICT_FALSE(port_pair_type == PORT_PAIR)) {
+ cnat_main_db_entry_t *db2 = 0;
+ cnat_db_key_bucket_t new_ki = *ki;
+ u64 a, b, c;
+
+ new_ki.k.k.port += 1;
+ ko.k.k.port += 1;
+
+ CNAT_V4_GET_HASH(new_ki.k.key64, new_ki.bucket,
+ CNAT_MAIN_HASH_MASK);
+
+ db2 = cnat_create_main_db_entry_and_hash(&new_ki, &ko, udb);
+
+ translation_create_count ++;
+#ifdef DSLITE_DEF
+ db2->dslite_nat44_inst_id = NAT44_RESERVED_INST_ID;
+#endif
+ db2->vrfmap_index = my_vrfmap - cnat_map_by_vrf;
+ db2->entry_expires = cnat_current_time;
+ db2->flags |= CNAT_DB_FLAG_ALG_ENTRY;
+ udb->ntranslations += 1;
+ db2->dst_ipv4 = dest_info->k.ipv4;
+ db2->dst_port = dest_info->k.port;
+ db2->nsessions = 0; /* For ALG db, set sessions to 0 - CSCuf78420 */
+
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ if(PREDICT_FALSE(my_vrfmap->nf_logging_policy == SESSION_LOG_ENABLE)) {
+ /* do not log for static entries.. we will log when traffic flows */
+ if(PREDICT_TRUE(db2->dst_ipv4 || db2->dst_port)) {
+ cnat_nfv9_nat44_log_session_create(db2, 0, my_vrfmap);
+ }
+ } else {
+ cnat_nfv9_log_mapping_create(db2, my_vrfmap
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ if(PREDICT_TRUE((my_vrfmap->syslog_logging_policy != SESSION_LOG_ENABLE) ||
+ (db2->dst_ipv4 || db2->dst_port))) {
+ cnat_syslog_nat44_mapping_create(db2, my_vrfmap, 0
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ }
+ }
+
+ return db;
+}
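+
+/*
+ * Usage sketch (hypothetical caller, not from the original source): an
+ * exception-path caller would look roughly like this, assuming ki and
+ * dest were parsed from the punted packet; PORT_TYPE_DYNAMIC is an
+ * assumed name here, since only PORT_TYPE_STATIC and PORT_TYPE_RTSP
+ * appear explicitly above:
+ *
+ *   cnat_gen_icmp_info info;
+ *   cnat_main_db_entry_t *db =
+ *       cnat_get_main_db_entry_v2(&ki, PORT_SINGLE, PORT_TYPE_DYNAMIC,
+ *                                 &info, &dest);
+ *   if (db == NULL) {
+ *       // info.error holds the reason (e.g. CNAT_OUT_LIMIT) and
+ *       // info.gen_icmp_msg says whether to send an ICMP error
+ *   }
+ */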
+
+/*
+ * this function is called from config handler only
+ * to allocate a static port based db entry
+ *
+ * the actual mapped address and port are already specified
+ */
+cnat_main_db_entry_t*
+cnat_create_static_main_db_entry_v2 (cnat_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_vrfmap_t *my_vrfmap,
+ cnat_gen_icmp_info *info)
+{
+ u16 protocol;
+ u32 head;
+ cnat_errno_t rv;
+ cnat_db_key_bucket_t u_ki;
+ u32 my_index, free_main, free_user;
+ cnat_portmap_v2_t *pm =0;
+ cnat_portmap_v2_t *my_pm =0;
+ cnat_user_db_entry_t *udb = 0;
+ cnat_main_db_entry_t *db = 0;
+ pool_header_t *h;
+#ifndef NO_BULK_LOGGING
+ int nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+#endif
+
+ /*
+ * need to try the lookup again because, at high line rate,
+ * a second pkt may arrive here before the entry
+ * triggered by the first pkt has been created.
+ */
+ info->gen_icmp_msg = CNAT_NO_ICMP_MSG;
+ info->error = CNAT_SUCCESS;
+ db = cnat_main_db_lookup_entry(ki);
+
+ /*
+ * If we already have an entry with this inside address and port:
+ * a matching static mapping means we are already done; a conflicting
+ * static mapping means something is terribly wrong, so bail out;
+ * a dynamic entry is deleted before we proceed further.
+ */
+ if (PREDICT_FALSE(db)) {
+
+ if (db->flags & CNAT_DB_FLAG_STATIC_PORT) {
+
+ if ((db->out2in_key.k.ipv4 == ko->k.k.ipv4) &&
+ (db->out2in_key.k.port == ko->k.k.port) &&
+ (db->out2in_key.k.vrf == ko->k.k.vrf)) {
+
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("Same Static Port Exists ki 0x%16llx ko 0x%16llx",
+ ki->k, ko->k);
+#endif
+ /*
+ * We have already programmed this, return
+ */
+ return (db);
+ }
+
+ /*
+ * We already have a static port with different mapping
+ * Return an error for this case.
+ */
+ info->error = CNAT_ERR_PARSER;
+
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("Static Port Existing and Diff ki 0x%16llx ko 0x%16llx",
+ ki, db->out2in_key);
+#endif
+ {
+ u32 arr[] = {STAT_PORT_CONFIG_IN_USE, (ki->k.k.vrf & CNAT_VRF_MASK),
+ ki->k.k.ipv4, ki->k.k.port, (ki->k.k.vrf & CNAT_PRO_MASK) };
+ spp_printf(CNAT_CONFIG_ERROR, 5, arr);
+ }
+ return (db);
+ }
+
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("Deleting Dynamic entry ki 0x%16llx ko 0x%16llx",
+ ki, db->out2in_key);
+#endif
+
+ /*
+ * If for some reason we have dynamic entries, just delete them
+ * and proceed.
+ */
+ cnat_delete_main_db_entry_v2(db);
+
+ db = NULL;
+ }
+
+ protocol = ki->k.k.vrf & CNAT_PRO_MASK;
+ u_ki.k.k.vrf = ki->k.k.vrf & CNAT_VRF_MASK;
+ u_ki.k.k.ipv4 = ki->k.k.ipv4;
+ u_ki.k.k.port = 0;
+
+ pm = my_vrfmap->portmap_list;
+
+ /*
+ * check if src vrf, src ip addr is already
+ * in the user db
+ * if yes, use PORT_ALLOC_DIRECTED
+ * if no, use PORT_ALLOC_ANY since it is the first time
+ */
+ udb = cnat_user_db_lookup_entry(&u_ki);
+ if (PREDICT_TRUE(udb)) {
+ /*
+ * check if the main db has space to accommodate a new entry
+ */
+ h = pool_header(cnat_main_db);
+
+ free_main = vec_len(h->free_indices) - 1;
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ CNAT_SET_ICMP_MSG_INFO
+ in2out_drops_system_limit_reached ++;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+#ifdef UT_TEST_CODE
+ printf("Limit reached : OLD USER");
+#endif
+ spp_printf(CNAT_MAIN_DB_LIMIT_ERROR, 0, 0);
+ return NULL;
+ }
+
+ /*
+ * allocate port, from existing mapping
+ */
+ my_index = udb->portmap_index;
+ my_pm = pm + my_index;
+ /* It is quite possible that we hit the scenario of CSCtj17774.
+ * Delete all the main db entries and add the ipv4 address sent by
+ * CGN-MA as Static port alloc any
+ */
+
+ if (PREDICT_FALSE(my_pm->ipv4_address != ko->k.k.ipv4)) {
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ printf("Delete Main db entry and check for"
+ " ipv4 address sanity pm add = 0x%x ip add = 0x%x\n",
+ my_pm->ipv4_address, ko->k.k.ipv4);
+ }
+ do {
+ /* udb is not NULL when we begin with for sure */
+ head = udb->translation_list_head_index;
+ db = cnat_main_db + head;
+ cnat_delete_main_db_entry_v2(db);
+ } while (!pool_is_free(cnat_user_db, udb));
+
+ rv = cnat_mapped_static_port_alloc_v2_bulk (pm,
+ PORT_ALLOC_ANY, &my_index, ko->k.k.ipv4, ko->k.k.port,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap), &nfv9_log_req,
+ my_vrfmap->ip_n_to_1);
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ in2out_drops_resource_depletion ++;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(rv)
+ return (NULL);
+ }
+ /*
+ * create entry in user db
+ */
+ udb = cnat_user_db_create_entry(&u_ki, my_index);
+ my_pm = pm + my_index;
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < PORTS_PER_ADDR)) {
+ my_pm->private_ip_users_count++;
+#ifdef DEBUG_PRINTF_IP_N_TO_1_ENABLED
+ PLATFORM_DEBUG_PRINT("\n cnat_create_static_main_db_entry_v2 "
+ "static del n alloc private_ip_users_count = "
+ "%d",my_pm->private_ip_users_count);
+#endif
+ } else {
+ PLATFORM_DEBUG_PRINT("\n ERROR: private_ip_users_count has "
+ "reached MAX PORTS_PER_ADDR");
+ }
+ NAT44_COMMON_STATS.num_subscribers++;
+#ifndef NO_BULK_LOGGING
+ cnat_update_bulk_range_cache(udb, ko->k.k.port,
+ BULKSIZE_FROM_VRFMAP(my_vrfmap));
+#endif /* #ifndef NO_BULK_LOGGING */
+ } else {
+
+ rv = cnat_mapped_static_port_alloc_v2_bulk (pm,
+ PORT_ALLOC_DIRECTED, &my_index, ko->k.k.ipv4, ko->k.k.port,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap), &nfv9_log_req,
+ my_vrfmap->ip_n_to_1);
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(rv)
+ log_port_alloc_error(rv, &(ki->k));
+ return (NULL);
+ }
+
+ /*
+ * increment port in use for this user
+ */
+ udb->ntranslations += 1;
+ }
+ } else {
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ printf ("Static port alloc any\n");
+ }
+ /*
+ * first time allocate port for this user
+ */
+
+ /*
+ * Check if the main db has space for a new entry.
+ * Allowing a user db entry to be created when the main db is full
+ * would allocate a port to that user only to waste it,
+ * hence the check is done here.
+ */
+ h = pool_header(cnat_main_db);
+ free_main = vec_len(h->free_indices) - 1;
+ h = pool_header(cnat_user_db);
+ free_user = vec_len(h->free_indices) - 1;
+
+ /*
+ * If either main_db or user_db does not have entries
+ * bail out, with appropriate error
+ */
+ if (PREDICT_FALSE(!(free_main && free_user))) {
+ u32 log_error;
+ if(free_main) {
+ info->error = CNAT_USER_DB_LIMIT;
+ log_error = CNAT_USER_DB_LIMIT_ERROR;
+ } else {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ log_error = CNAT_MAIN_DB_LIMIT_ERROR;
+ }
+ in2out_drops_system_limit_reached ++;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(info->error)
+ spp_printf(log_error, 0, 0);
+ return NULL;
+ }
+
+ rv = cnat_mapped_static_port_alloc_v2_bulk (pm,
+ PORT_ALLOC_ANY, &my_index, ko->k.k.ipv4, ko->k.k.port,
+ udb, BULKSIZE_FROM_VRFMAP(my_vrfmap), &nfv9_log_req,
+ my_vrfmap->ip_n_to_1);
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ in2out_drops_resource_depletion ++;
+ CNAT_SET_ICMP_MSG_INFO
+ CNAT_DEBUG_INSIDE_ERR(rv)
+ log_port_alloc_error(rv, &(ki->k));
+ return (NULL);
+ }
+ /*
+ * create entry in user db
+ */
+ udb = cnat_user_db_create_entry(&u_ki, my_index);
+ my_pm = pm + my_index;
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < PORTS_PER_ADDR)) {
+ my_pm->private_ip_users_count++;
+#ifdef DEBUG_PRINTF_IP_N_TO_1_ENABLED
+ PLATFORM_DEBUG_PRINT("\n cnat_create_static_main_db_entry_v2 "
+ "static alloc private_ip_users_count = %d",
+ my_pm->private_ip_users_count);
+#endif
+ } else {
+ PLATFORM_DEBUG_PRINT("\n ERROR: private_ip_users_count has "
+ "reached MAX PORTS_PER_ADDR");
+ }
+ NAT44_COMMON_STATS.num_subscribers++;
+#ifndef NO_BULK_LOGGING
+ cnat_update_bulk_range_cache(udb, ko->k.k.port,
+ BULKSIZE_FROM_VRFMAP(my_vrfmap));
+#endif /* #ifndef NO_BULK_LOGGING */
+ }
+
+ /*
+ * step 3:
+ * outside port is allocated for this src vrf/src ip addr
+ * 1)create a new entry in main db
+ * 2)setup cnat_out2in_hash key
+ * 3)setup cnat_in2out_hash key
+ */
+ db = cnat_create_main_db_entry_and_hash(ki, ko, udb);
+
+ translation_create_count ++;
+ db->vrfmap_index = my_vrfmap - cnat_map_by_vrf;
+
+ /*
+ * don't forget logging
+ * logging API is unconditional,
+ * logging configuration check is done inside the inline function
+ */
+
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ /* if session logging is enabled .. do not log as there is no
+ * traffic yet
+ */
+ if(PREDICT_FALSE(my_vrfmap->nf_logging_policy != SESSION_LOG_ENABLE)) {
+ cnat_nfv9_log_mapping_create(db, my_vrfmap
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ if(PREDICT_FALSE(my_vrfmap->syslog_logging_policy != SESSION_LOG_ENABLE)) {
+ cnat_syslog_nat44_mapping_create(db, my_vrfmap, 0
+#ifndef NO_BULK_LOGGING
+ , nfv9_log_req
+#endif
+ );
+ }
+ }
+
+ return db;
+}
+
+
+cnat_main_db_entry_t*
+dslite_main_db_lookup_entry(dslite_db_key_bucket_t *ki);
+
+cnat_user_db_entry_t*
+dslite_user_db_lookup_entry(dslite_db_key_bucket_t *uki);
+
+cnat_user_db_entry_t*
+dslite_user_db_create_entry(dslite_db_key_bucket_t *uki, u32 portmap_index);
+
+cnat_main_db_entry_t*
+dslite_create_main_db_entry_and_hash(dslite_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_user_db_entry_t *udb);
+
+#ifdef TOBE_PORTED
+/*
+ * this function is called from config handler only
+ * to allocate a static port based db entry
+ *
+ * the actual mapped address and port are already specified
+ */
+cnat_main_db_entry_t*
+dslite_create_static_main_db_entry_v2 (dslite_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ dslite_table_entry_t *dslite_entry_ptr,
+ cnat_gen_icmp_info *info)
+{
+ u16 protocol;
+ u32 head;
+ cnat_errno_t rv;
+ dslite_db_key_bucket_t u_ki;
+ u32 my_index, free_main, free_user;
+ cnat_portmap_v2_t *pm =0;
+ cnat_portmap_v2_t *my_pm =0;
+ cnat_user_db_entry_t *udb = 0;
+ cnat_main_db_entry_t *db = 0;
+ pool_header_t *h;
+ u16 dslite_id = dslite_entry_ptr->dslite_id;
+#ifndef NO_BULK_LOGGING
+ int nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+#endif
+ cnat_vrfmap_t *my_vrfmap =0;
+ u16 my_vrfmap_index;
+
+ /*
+ * need to try the lookup again because, at high line rate,
+ * a second pkt may arrive here before the entry
+ * triggered by the first pkt has been created.
+ */
+ info->gen_icmp_msg = CNAT_NO_ICMP_MSG;
+ info->error = CNAT_SUCCESS;
+ db = dslite_main_db_lookup_entry(ki);
+
+ /*
+ * If we already have an entry with this inside address and port:
+ * a matching static mapping means we are already done; a conflicting
+ * static mapping means something is terribly wrong, so bail out;
+ * a dynamic entry is deleted before we proceed further.
+ */
+ if (PREDICT_FALSE(db)) {
+
+ if (db->flags & CNAT_DB_FLAG_STATIC_PORT) {
+
+ if ((db->out2in_key.k.ipv4 == ko->k.k.ipv4) &&
+ (db->out2in_key.k.port == ko->k.k.port) &&
+ (db->out2in_key.k.vrf == ko->k.k.vrf)) {
+
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("Same Static Port Exists ki 0x%16llx ko 0x%16llx",
+ ki->k, ko->k);
+#endif
+ /*
+ * We have already programmed this, return
+ */
+ return (db);
+ }
+
+ /*
+ * We already have a static port with different mapping
+ * Return an error for this case.
+ */
+ info->error = CNAT_ERR_PARSER;
+
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("Static Port Existing and Diff ki 0x%16llx ko 0x%16llx",
+ ki, db->out2in_key);
+#endif
+ {
+ u32 arr[] = {STAT_PORT_CONFIG_IN_USE, (ki->dk.ipv4_key.k.vrf & CNAT_VRF_MASK),
+ ki->dk.ipv4_key.k.ipv4, ki->dk.ipv4_key.k.port, (ki->dk.ipv4_key.k.vrf & CNAT_PRO_MASK) };
+ spp_printf(CNAT_CONFIG_ERROR, 5, arr);
+ }
+ return (db);
+ }
+
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("Deleting Dynamic entry ki 0x%16llx ko 0x%16llx",
+ ki, db->out2in_key);
+#endif
+
+ /*
+ * If for some reason we have dynamic entries, just delete them
+ * and proceed.
+ */
+ cnat_delete_main_db_entry_v2(db);
+
+ db = NULL;
+ }
+
+
+ protocol = ki->dk.ipv4_key.k.vrf & CNAT_PRO_MASK;
+ u_ki.dk.ipv4_key.k.vrf = ki->dk.ipv4_key.k.vrf & CNAT_VRF_MASK;
+ u_ki.dk.ipv4_key.k.ipv4 = ki->dk.ipv4_key.k.ipv4;
+ u_ki.dk.ipv4_key.k.port = 0;
+ u_ki.dk.ipv6[0] = ki->dk.ipv6[0];
+ u_ki.dk.ipv6[1] = ki->dk.ipv6[1];
+ u_ki.dk.ipv6[2] = ki->dk.ipv6[2];
+ u_ki.dk.ipv6[3] = ki->dk.ipv6[3];
+
+ my_vrfmap_index = vrf_map_array[u_ki.dk.ipv4_key.k.vrf];
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+
+ pm = dslite_entry_ptr->portmap_list;
+
+ /*
+ * check if src vrf, src ip addr is already
+ * in the user db
+ * if yes, use PORT_ALLOC_DIRECTED
+ * if no, use PORT_ALLOC_ANY since it is the first time
+ */
+ udb = dslite_user_db_lookup_entry(&u_ki);
+ if (PREDICT_TRUE(udb)) {
+ /*
+ * check if the main db has space to accommodate a new entry
+ */
+ h = pool_header(cnat_main_db);
+
+ free_main = vec_len(h->free_indices) - 1;
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_port_limit_exceeded ++;
+ DSLITE_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+#ifdef UT_TEST_CODE
+ printf("Limit reached : OLD USER");
+#endif
+ spp_printf(CNAT_MAIN_DB_LIMIT_ERROR, 0, 0);
+ return NULL;
+ }
+
+ /*
+ * allocate port, from existing mapping
+ */
+ my_index = udb->portmap_index;
+ my_pm = pm + my_index;
+ /* It is quite possible that we hit the scenario of CSCtj17774.
+ * Delete all the main db entries and add the ipv4 address sent by
+ * CGN-MA as Static port alloc any
+ */
+
+ if (PREDICT_FALSE(my_pm->ipv4_address != ko->k.k.ipv4)) {
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ printf("Delete Main db entry and check for"
+ " ipv4 address sanity pm add = 0x%x ip add = 0x%x\n",
+ my_pm->ipv4_address, ko->k.k.ipv4);
+ }
+ do {
+ /* udb is not NULL when we begin with for sure */
+ head = udb->translation_list_head_index;
+ db = cnat_main_db + head;
+ cnat_delete_main_db_entry_v2(db);
+ } while (!pool_is_free(cnat_user_db, udb));
+
+ rv = cnat_mapped_static_port_alloc_v2_bulk (pm,
+ PORT_ALLOC_ANY, &my_index, ko->k.k.ipv4, ko->k.k.port,
+ udb, BULKSIZE_FROM_VRFMAP(dslite_entry_ptr), &nfv9_log_req,
+ my_vrfmap->ip_n_to_1);
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_port_limit_exceeded ++;
+ DSLITE_DEBUG_INSIDE_ERR(rv)
+ return (NULL);
+ }
+ /*
+ * create entry in user db
+ */
+ udb = dslite_user_db_create_entry(&u_ki, my_index);
+ nat44_dslite_common_stats[dslite_id].num_subscribers++;
+#ifndef NO_BULK_LOGGING
+ if(PREDICT_FALSE(udb && (BULK_ALLOC_NOT_ATTEMPTED != nfv9_log_req))) {
+ cnat_update_bulk_range_cache(udb, ko->k.k.port,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr));
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ } else {
+
+ rv = cnat_mapped_static_port_alloc_v2_bulk (pm,
+ PORT_ALLOC_DIRECTED, &my_index, ko->k.k.ipv4, ko->k.k.port,
+ udb, BULKSIZE_FROM_VRFMAP(dslite_entry_ptr), &nfv9_log_req,
+ my_vrfmap->ip_n_to_1);
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ DSLITE_DEBUG_INSIDE_ERR(rv)
+ log_port_alloc_error(rv, &(ki->dk.ipv4_key));
+ return (NULL);
+ }
+
+ /*
+ * increment port in use for this user
+ */
+ udb->ntranslations += 1;
+ }
+ } else {
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ printf ("Static port alloc any\n");
+ }
+ /*
+ * first time allocate port for this user
+ */
+
+ /*
+ * Check if the main db has space for a new entry.
+ * Allowing a user db entry to be created when the main db is full
+ * would allocate a port to that user only to waste it,
+ * hence the check is done here.
+ */
+ h = pool_header(cnat_main_db);
+ free_main = vec_len(h->free_indices) - 1;
+ h = pool_header(cnat_user_db);
+ free_user = vec_len(h->free_indices) - 1;
+
+ /*
+ * If either main_db or user_db does not have entries
+ * bail out, with appropriate error
+ */
+ if (PREDICT_FALSE(!(free_main && free_user))) {
+ u32 log_error;
+ if(free_main) {
+ info->error = CNAT_USER_DB_LIMIT;
+ log_error = CNAT_USER_DB_LIMIT_ERROR;
+ } else {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ log_error = CNAT_MAIN_DB_LIMIT_ERROR;
+ }
+ nat44_dslite_common_stats[dslite_id].in2out_drops_port_limit_exceeded ++;
+ DSLITE_DEBUG_INSIDE_ERR(info->error)
+ spp_printf(log_error, 0, 0);
+ return NULL;
+ }
+
+ rv = cnat_mapped_static_port_alloc_v2_bulk (pm,
+ PORT_ALLOC_ANY, &my_index, ko->k.k.ipv4, ko->k.k.port,
+ udb, BULKSIZE_FROM_VRFMAP(dslite_entry_ptr), &nfv9_log_req,
+ my_vrfmap->ip_n_to_1);
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ info->error = rv;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_port_limit_exceeded ++;
+ DSLITE_DEBUG_INSIDE_ERR(rv)
+ log_port_alloc_error(rv, &(ki->dk.ipv4_key));
+ return (NULL);
+ }
+ /*
+ * create entry in user db
+ */
+ udb = dslite_user_db_create_entry(&u_ki, my_index);
+ nat44_dslite_common_stats[dslite_id].num_subscribers++;
+#ifndef NO_BULK_LOGGING
+ if(PREDICT_FALSE(udb && (BULK_ALLOC_NOT_ATTEMPTED != nfv9_log_req))) {
+ cnat_update_bulk_range_cache(udb, ko->k.k.port,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr));
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ }
+
+ /*
+ * step 3:
+ * outside port is allocated for this src vrf/src ip addr
+ * 1)create a new entry in main db
+ * 2)setup cnat_out2in_hash key
+ * 3)setup cnat_in2out_hash key
+ */
+ db = dslite_create_main_db_entry_and_hash(ki, ko, udb);
+ db->dslite_nat44_inst_id = dslite_id;
+ nat44_dslite_common_stats[dslite_id].active_translations++;
+ dslite_translation_create_count++;
+
+ /*
+ * don't forget logging
+ * logging API is unconditional,
+ * logging configuration check is done inside the inline function
+ */
+#if 0 /* TBD - NEED TO DECIDE ON LOGGING */
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ /* if session logging is enabled .. do not log as there is no
+ * traffic yet
+ */
+#endif /* #if 0 - this has to be removed later */
+
+ return db;
+}
+#endif /* TOBE_PORTED */
+
+
+/* Per port/ip timeout related routines */
+static
+u32 cnat_timeout_db_hash_lookup (cnat_key_t t_key)
+{
+ cnat_key_t key;
+ u64 a, b, c;
+ u32 index;
+ cnat_timeout_db_entry_t *db;
+
+ key.k.ipv4 = t_key.k.ipv4;
+ key.k.port = t_key.k.port;
+ key.k.vrf = t_key.k.vrf;
+
+ CNAT_V4_GET_HASH(key.key64,
+ index, CNAT_TIMEOUT_HASH_MASK)
+
+
+ index = cnat_timeout_hash[index].next;
+
+ if (PREDICT_FALSE(index == EMPTY))
+ return EMPTY;
+
+ do {
+ db = cnat_timeout_db + index;
+ if (PREDICT_TRUE((db->t_key.timeout_key.key64 & CNAT_TIMEOUT_FULL_MASK)
+ == (key.key64 & CNAT_TIMEOUT_FULL_MASK)))
+ break;
+ index = db->t_hash.next;
+ } while (index != EMPTY);
+
+ return index;
+}
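+
+/*
+ * Note (annotation): both sides of the compare above are masked with
+ * CNAT_TIMEOUT_FULL_MASK, presumably to ignore the protocol bits
+ * carried in the vrf field, so one timeout entry can match a
+ * destination regardless of protocol. Wildcarding by address or port
+ * is done by the caller zeroing those key fields, not by the mask.
+ */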
+
+/* Pass db_type as MAIN_DB_TYPE if you are passing a
+ * cnat_main_db_entry_t * cast to void * for db;
+ * otherwise pass db_type as SESSION_DB_TYPE.
+ */
+u16
+query_and_update_db_timeout(void *db, u8 db_type)
+{
+ cnat_key_t t_search_key;
+ u32 index;
+ cnat_timeout_db_entry_t *timeout_db_entry;
+ pool_header_t *h;
+ u32 free;
+
+ cnat_main_db_entry_t *mdb = NULL;
+ cnat_session_entry_t *sdb = NULL;
+
+ if(PREDICT_TRUE(db_type == MAIN_DB_TYPE)) {
+ mdb = (cnat_main_db_entry_t *)db;
+ } else if(db_type == SESSION_DB_TYPE) {
+ sdb = (cnat_session_entry_t *)db;
+ } else {
+ return 0;
+ }
+
+ h = pool_header(cnat_timeout_db);
+ free = vec_len(h->free_indices) - 1;
+
+ if(free == CNAT_TIMEOUT_HASH_SIZE) {
+ /* No timeout db configured */
+ return 0;
+ }
+
+ /* First search for ip/port pair */
+ if(PREDICT_TRUE(db_type == MAIN_DB_TYPE)) {
+ t_search_key.k.ipv4 = mdb->dst_ipv4;
+ t_search_key.k.port = mdb->dst_port;
+ t_search_key.k.vrf = mdb->in2out_key.k.vrf;
+ } else {
+ t_search_key.k.ipv4 = sdb->v4_dest_key.k.ipv4;
+ t_search_key.k.port = sdb->v4_dest_key.k.port;
+ t_search_key.k.vrf = sdb->v4_dest_key.k.vrf;
+ }
+
+ index = cnat_timeout_db_hash_lookup(t_search_key);
+
+ if(index == EMPTY) {
+ /* Search for port map */
+ t_search_key.k.ipv4 = 0;
+
+ index = cnat_timeout_db_hash_lookup(t_search_key);
+
+ if(index == EMPTY) {
+ /* Search for ip only map */
+ if(PREDICT_TRUE(db_type == MAIN_DB_TYPE)) {
+ t_search_key.k.ipv4 = mdb->dst_ipv4;
+ } else {
+ t_search_key.k.ipv4 = sdb->v4_dest_key.k.ipv4;
+ }
+ t_search_key.k.port = 0;
+
+ index = cnat_timeout_db_hash_lookup(t_search_key);
+ if(index != EMPTY) {
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("%s: ip only map sucess\n","query_and_update_db_timeout");
+#endif
+ }
+ } else {
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("%s: port only map sucess\n", "query_and_update_db_timeout");
+#endif
+ }
+
+ } else {
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("%s: ip port map sucess\n","query_and_update_db_timeout");
+#endif
+
+ }
+
+ if(index == EMPTY) {
+ /* No match found, clear timeout */
+ if(PREDICT_TRUE(db_type == MAIN_DB_TYPE)) {
+ mdb->timeout = 0;
+ } else {
+ sdb->timeout = 0;
+ }
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("%s: No match\n","query_and_update_db_timeout");
+#endif
+ } else {
+ /* Match found, update timeout */
+ timeout_db_entry = cnat_timeout_db + index;
+ if(PREDICT_TRUE(db_type == MAIN_DB_TYPE)) {
+ mdb->timeout = timeout_db_entry->t_key.timeout_value;
+ } else {
+ sdb->timeout = timeout_db_entry->t_key.timeout_value;
+ }
+ return timeout_db_entry->t_key.timeout_value;
+ }
+ return 0;
+}
+
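+/*
+ * Usage sketch (hypothetical caller, for illustration only): after
+ * creating a main db entry, refresh its timeout from any configured
+ * destination-based timeout policy:
+ *
+ * u16 t = query_and_update_db_timeout((void *)mdb, MAIN_DB_TYPE);
+ * if (t == 0) { ... no matching policy; mdb->timeout was cleared ... }
+ */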
+
+
+static
+void cnat_timeout_db_hash_add (cnat_timeout_db_entry_t *t_entry)
+{
+ cnat_key_t key;
+ u64 a, b, c;
+ u32 index, bucket;
+ cnat_key_t t_key = t_entry->t_key.timeout_key;
+
+ key.k.ipv4 = t_key.k.ipv4;
+ key.k.port = t_key.k.port;
+ key.k.vrf = t_key.k.vrf;
+
+ CNAT_V4_GET_HASH(key.key64,
+ bucket, CNAT_TIMEOUT_HASH_MASK)
+
+
+ index = cnat_timeout_hash[bucket].next;
+
+ /* Add this db entry to the head of the bucket chain */
+ t_entry->t_hash.next = index;
+ cnat_timeout_hash[bucket].next = t_entry - cnat_timeout_db;
+}
+
+
+
+u16
+cnat_timeout_db_create (cnat_timeout_t t_entry)
+{
+ cnat_timeout_db_entry_t *db;
+ cnat_key_t t_key = t_entry.timeout_key;
+ u32 db_index;
+
+ pool_header_t *h;
+ u32 free;
+
+ db_index = cnat_timeout_db_hash_lookup(t_key);
+
+ if(db_index != EMPTY) {
+ /* Entry already exists; refresh the timeout value (replay or update) */
+ db = cnat_timeout_db + db_index;
+ db->t_key.timeout_value = t_entry.timeout_value;
+ return CNAT_SUCCESS;
+ }
+
+ h = pool_header(cnat_timeout_db);
+ free = vec_len(h->free_indices) - 1;
+
+ if(free == 0) {
+ return CNAT_OUT_LIMIT;
+ }
+
+
+ pool_get(cnat_timeout_db, db);
+ ASSERT(db);
+
+ memset(db, 0, sizeof(*db));
+
+ db_index = db - cnat_timeout_db;
+
+ db->t_key.timeout_key.k.ipv4 = t_key.k.ipv4;
+ db->t_key.timeout_key.k.port = t_key.k.port;
+ db->t_key.timeout_key.k.vrf = t_key.k.vrf;
+ db->t_key.timeout_value = t_entry.timeout_value;
+
+
+ cnat_timeout_db_hash_add(db);
+ return CNAT_SUCCESS;
+}
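+
+/*
+ * Note: creation is idempotent per key: an existing entry only has its
+ * timeout_value refreshed, while a new entry is head-inserted into its
+ * hash bucket; CNAT_OUT_LIMIT is returned when the timeout pool is full.
+ */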
+
+void cnat_timeout_db_delete(cnat_key_t t_key)
+{
+ cnat_key_t key;
+ u64 a, b, c;
+ u32 index, bucket;
+ cnat_timeout_db_entry_t *this, *prev;
+
+ key.k.ipv4 = t_key.k.ipv4;
+ key.k.port = t_key.k.port;
+ key.k.vrf = t_key.k.vrf;
+
+
+ CNAT_V4_GET_HASH(key.key64,
+ bucket, CNAT_TIMEOUT_HASH_MASK)
+
+
+ index = cnat_timeout_hash[bucket].next;
+
+ if(index == EMPTY) return;
+
+ prev = 0;
+ do {
+ this = cnat_timeout_db + index;
+ if (PREDICT_TRUE(
+ (this->t_key.timeout_key.key64 & CNAT_TIMEOUT_FULL_MASK) ==
+ (key.key64 & CNAT_TIMEOUT_FULL_MASK))) {
+ if (prev == 0) {
+ cnat_timeout_hash[bucket].next = this->t_hash.next;
+ goto found;
+ } else {
+ prev->t_hash.next = this->t_hash.next;
+ goto found;
+ }
+ }
+
+ prev = this;
+ index = this->t_hash.next;
+ } while (index != EMPTY);
+
+ if(index == EMPTY) return;
+
+ found:
+ pool_put(cnat_timeout_db, this);
+
+}
+
+void cnat_session_db_hash_delete (cnat_session_entry_t *ep)
+{
+ u32 a, b, c;
+ u32 index, bucket;
+ cnat_session_entry_t *this, *prev;
+
+ CNAT_V4_GET_SESSION_HASH(ep->main_db_index, ep->v4_dest_key.k.ipv4,
+ ep->v4_dest_key.k.port, ep->v4_dest_key.k.vrf, bucket,
+ CNAT_SESSION_HASH_MASK)
+
+
+ index = cnat_session_hash[bucket].next;
+
+ ASSERT(index != EMPTY);
+
+ prev = 0;
+ do {
+ this = cnat_session_db + index;
+ if (PREDICT_TRUE(this == ep)) {
+ if (prev == 0) {
+ cnat_session_hash[bucket].next =
+ ep->cnat_session_hash.next;
+ return;
+ } else {
+ prev->cnat_session_hash.next =
+ ep->cnat_session_hash.next;
+ return;
+ }
+ }
+ prev = this;
+ index = this->cnat_session_hash.next;
+ } while (index != EMPTY);
+
+ ASSERT(0);
+
+}
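+
+/*
+ * Note: the session hash chains are singly linked, so deletion walks the
+ * bucket while tracking the previous element; the ASSERT(0) above fires
+ * only if the entry is not found on its own chain, which would indicate
+ * a corrupted hash.
+ */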
+
+cnat_session_entry_t *
+cnat_session_db_edm_lookup_entry(cnat_key_t *ko,u32 session_head_index,
+ u32 main_db_index)
+{
+ u32 index;
+ cnat_session_entry_t *db;
+
+
+ index = session_head_index;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ return (NULL);
+ }
+
+ do {
+ db = cnat_session_db + index;
+ if(PREDICT_TRUE((db->main_db_index == main_db_index) &&
+ (db->v4_dest_key.k.vrf == ko->k.vrf) &&
+ (db->v4_dest_key.k.ipv4 == ko->k.ipv4))) {
+
+ return db;
+ }
+ index = db->cnat_session_hash.next;
+ } while (index != EMPTY);
+
+ return (NULL);
+}
+
+
+
+cnat_session_entry_t *
+cnat_session_db_lookup_entry(cnat_key_t *ko,u32 main_db_index)
+{
+ u32 a, b, c;
+ u32 index, bucket;
+ cnat_session_entry_t *db;
+
+ CNAT_V4_GET_SESSION_HASH(main_db_index, ko->k.ipv4, ko->k.port,
+ ko->k.vrf, bucket, CNAT_SESSION_HASH_MASK)
+
+
+ index = cnat_session_hash[bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ return (NULL);
+ }
+
+ do {
+ db = cnat_session_db + index;
+ if(PREDICT_TRUE((db->main_db_index == main_db_index) &&
+ (db->v4_dest_key.k.vrf == ko->k.vrf) &&
+ (db->v4_dest_key.k.port == ko->k.port) &&
+ (db->v4_dest_key.k.ipv4 == ko->k.ipv4))) {
+
+ return db;
+ }
+ index = db->cnat_session_hash.next;
+ } while (index != EMPTY);
+
+ return (NULL);
+}
+
+cnat_session_entry_t *
+cnat_create_session_db_entry(cnat_key_t *ko,
+ cnat_main_db_entry_t *bdb, u8 log)
+{
+ u32 a, b, c;
+ u32 db_index, bucket_out;
+ cnat_session_entry_t *db = NULL;
+ pool_header_t *h;
+ u32 free_session;
+ u16 instance;
+
+ db = cnat_session_db_lookup_entry(ko, bdb - cnat_main_db);
+ if (PREDICT_FALSE(db != NULL)) {
+ /*printf("Create Session - Entry already Exists\n");*/
+ return db;
+ }
+
+ h = pool_header(cnat_session_db);
+ free_session = vec_len(h->free_indices) - 1;
+
+ if (bdb->flags & CNAT_DB_DSLITE_FLAG) {
+ instance = bdb->dslite_nat44_inst_id;
+ } else {
+ instance = NAT44_RESERVED_INST_ID;
+ }
+
+ if (PREDICT_FALSE(!free_session)) {
+ nat44_dslite_common_stats[instance].drops_sessiondb_limit_exceeded++;
+ return NULL;
+ }
+
+ if( PREDICT_FALSE(bdb->nsessions == CNAT_MAX_SESSIONS_PER_BIB)) {
+ /* printf("Create Session - Max sessions per BIB reached\n"); */
+ return NULL;
+ }
+
+ pool_get(cnat_session_db, db);
+ memset(db, 0, sizeof(*db));
+
+ db_index = db - cnat_session_db;
+ db->v4_dest_key.k.port = ko->k.port;
+ db->v4_dest_key.k.ipv4 = ko->k.ipv4;
+ db->v4_dest_key.k.vrf = ko->k.vrf;
+
+ db->main_list.next = db_index;
+ db->main_list.prev = db_index;
+ db->main_db_index = bdb - cnat_main_db;
+
+ db->tcp_seq_num = 0;
+ db->ack_no = 0;
+ db->window = 0;
+
+ if(PREDICT_FALSE(log)) {
+ bdb->nsessions++;
+ query_and_update_db_timeout(db, SESSION_DB_TYPE);
+ }
+
+ if (PREDICT_FALSE(bdb->nsessions == 1)) {
+ /*
+ * first session for this main db entry
+ */
+ bdb->session_head_index = db_index;
+ } else {
+ index_dlist_addtail(bdb->session_head_index,
+ (u8 *)cnat_session_db, sizeof(cnat_session_db[0]),
+ STRUCT_OFFSET_OF(cnat_session_entry_t, main_list),
+ db_index);
+ }
+
+ /*
+ * setup o2i hash key
+ */
+ CNAT_V4_GET_SESSION_HASH(db->main_db_index, ko->k.ipv4, ko->k.port,
+ ko->k.vrf, bucket_out, CNAT_SESSION_HASH_MASK)
+
+
+ db->cnat_session_hash.next =
+ cnat_session_hash[bucket_out].next;
+ cnat_session_hash[bucket_out].next = db_index;
+
+
+ if(PREDICT_FALSE(log)) {
+ if (bdb->flags & CNAT_DB_DSLITE_FLAG) {
+ cnat_session_log_ds_lite_mapping_create(bdb,
+ (dslite_table_db_ptr + instance),db);
+ } else {
+ cnat_vrfmap_t *my_vrfmap = cnat_map_by_vrf + bdb->vrfmap_index;
+ cnat_session_log_nat44_mapping_create(bdb, db, my_vrfmap);
+ }
+ }
+
+ /* Need to set entry_expires here, as we need to override 0 check for
+ newly established sessions */
+ db->entry_expires = cnat_current_time;
+ nat44_dslite_common_stats[instance].sessions++;
+ return db;
+}
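+
+/*
+ * Note: a session entry is reachable two ways: from its owning main db
+ * entry via the main_list index dlist, and via cnat_session_hash, keyed
+ * on (main_db_index, dest ip, dest port, vrf).
+ */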
+
+void
+cnat_dest_update_main2session(cnat_main_db_entry_t *mdb,
+ cnat_session_entry_t *sdb)
+{
+
+ sdb->flags = mdb->flags;
+ sdb->timeout = mdb->timeout;
+ sdb->entry_expires = mdb->entry_expires;
+ sdb->alg.delta = mdb->alg.delta;
+ sdb->tcp_seq_num = mdb->proto_data.seq_pcp.tcp_seq_num;
+
+ /* Reset Main db values to 0 */
+ /* Reset only session specific flags */
+ mdb->flags &= ~(CNAT_DB_FLAG_TCP_ACTIVE | CNAT_DB_FLAG_UDP_ACTIVE
+ | CNAT_DB_FLAG_ALG_ENTRY | CNAT_DB_FLAG_ALG_CTRL_FLOW);
+ mdb->timeout = 0;
+ mdb->entry_expires = 0;
+ mdb->alg.delta = 0;
+ if(PREDICT_FALSE(!((mdb->flags & CNAT_DB_FLAG_PPTP_TUNNEL_ACTIVE) ||
+ (mdb->flags & CNAT_DB_FLAG_PPTP_TUNNEL_INIT)))) {
+ mdb->proto_data.seq_pcp.tcp_seq_num = 0;
+ }
+
+ mdb->dst_ipv4 = 0;
+ mdb->dst_port = 0;
+}
+
+
+void
+cnat_dest_update_session2main(cnat_main_db_entry_t *mdb,
+ cnat_session_entry_t *sdb)
+{
+
+ u16 flags = sdb->flags & (CNAT_DB_FLAG_TCP_ACTIVE |
+ CNAT_DB_FLAG_UDP_ACTIVE | CNAT_DB_FLAG_ALG_ENTRY |
+ CNAT_DB_FLAG_ALG_CTRL_FLOW);
+ mdb->flags |= flags;
+ mdb->timeout = sdb->timeout;
+ mdb->entry_expires = sdb->entry_expires;
+ mdb->alg.delta = sdb->alg.delta;
+ if(PREDICT_FALSE(!((mdb->flags & CNAT_DB_FLAG_PPTP_TUNNEL_ACTIVE) ||
+ (mdb->flags & CNAT_DB_FLAG_PPTP_TUNNEL_INIT)))) {
+ mdb->proto_data.seq_pcp.tcp_seq_num = sdb->tcp_seq_num;
+ }
+ mdb->dst_ipv4 = sdb->v4_dest_key.k.ipv4;
+ mdb->dst_port = sdb->v4_dest_key.k.port;
+}
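+
+/*
+ * The two helpers above move per-destination state between a main db
+ * entry and a session entry: main2session when a subscriber starts
+ * talking to a second destination (cf. cnat_handle_1to2_session), and
+ * session2main when a single session remains and its state is folded
+ * back into the main entry (see cnat_delete_session_db_entry below).
+ */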
+
+void cnat_delete_session_db_entry (cnat_session_entry_t *ep, u8 log)
+{
+ u32 session_db_index;
+ u32 bdb_len;
+ cnat_main_db_entry_t *be =0;
+ cnat_session_entry_t *sdb_last = NULL;
+ u16 instance;
+
+ if (PREDICT_FALSE((ep->flags & CNAT_DB_NAT64_FLAG) != 0)) {
+ /* Preventive check - Not a NAT44 entry */
+ return;
+ }
+
+ pool_header_t *h = pool_header(cnat_main_db);
+
+ /* Validate .. just in case we are trying to delete a non-existent entry */
+ bdb_len = vec_len(cnat_main_db);
+
+ /* In case of an invalid user just return; deleting only the main db
+ * is not a good idea, since some valid user db entry might still point
+ * to that main db entry and leave the dbs in an inconsistent state
+ */
+ if (PREDICT_FALSE((ep->main_db_index >= bdb_len) ||
+ (clib_bitmap_get(h->free_bitmap, ep->main_db_index)))) {
+#ifdef DEBUG_PRINTF_ENABLED
+ printf("invalid/unused user index in db %d\n", ep->main_db_index);
+#endif
+ spp_printf(CNAT_INV_UNUSED_USR_INDEX, 1, (u32 *) &(ep->main_db_index));
+ return;
+ }
+
+ be = cnat_main_db + ep->main_db_index;
+
+ session_db_index = ep - cnat_session_db;
+
+ be->session_head_index = index_dlist_remelem (
+ be->session_head_index, (u8 *)cnat_session_db,
+ sizeof (cnat_session_db[0]),
+ STRUCT_OFFSET_OF(cnat_session_entry_t, main_list),
+ session_db_index);
+
+ if (be->flags & CNAT_DB_DSLITE_FLAG) {
+ instance = be->dslite_nat44_inst_id;
+ } else {
+ instance = NAT44_RESERVED_INST_ID;
+ }
+
+ if(PREDICT_TRUE(log)) {
+ if (be->flags & CNAT_DB_DSLITE_FLAG) {
+ cnat_session_log_ds_lite_mapping_delete(be,
+ (dslite_table_db_ptr + instance),ep);
+ } else {
+ cnat_vrfmap_t *my_vrfmap = cnat_map_by_vrf + be->vrfmap_index;
+ cnat_session_log_nat44_mapping_delete(be, ep, my_vrfmap);
+ }
+ be->nsessions--;
+ }
+
+ if (PREDICT_FALSE(be->nsessions == 1 && log)) {
+ /* There is only 1 session left
+ * Copy the info back to main db and release the last
+ * existing session
+ */
+
+ sdb_last = cnat_session_db + be->session_head_index;
+ ASSERT(sdb_last != NULL);
+
+ cnat_dest_update_session2main(be, sdb_last);
+ cnat_delete_session_db_entry(sdb_last, FALSE);
+ }
+
+ /* Remove from session DB hashes */
+ cnat_session_db_hash_delete(ep);
+ nat44_dslite_common_stats[instance].sessions--;
+
+ pool_put(cnat_session_db, ep);
+}
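+
+/*
+ * Deletion order above: unlink the session from the owning main db
+ * entry's dlist, log the delete, fold the last remaining session back
+ * into the main entry if applicable, then remove the entry from the
+ * session hash and return it to the pool.
+ */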
+
+cnat_main_db_entry_t*
+dslite_main_db_lookup_entry(dslite_db_key_bucket_t *ki)
+{
+ u64 a, b, c;
+ u32 index;
+ cnat_main_db_entry_t *db;
+ cnat_user_db_entry_t *userdb;
+
+ DSLITE_V6_GET_HASH((&(ki->dk)),
+ ki->bucket,
+ CNAT_MAIN_HASH_MASK);
+
+ DSLITE_PRINTF(1,"MDBLU hash..%u\n", ki->bucket);
+
+ index = cnat_in2out_hash[ki->bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ DSLITE_PRINTF(1,"MDBLU index MT..\n");
+ return (NULL);
+ }
+
+ do {
+/* We can add a flag here to indicate if the db entry is for nat44 or
+ * dslite. If the db entry is for nat44 then we can simply move on to
+ * the next one.
+ */
+ db = cnat_main_db + index;
+ userdb = cnat_user_db + db->user_index;
+ if (PREDICT_TRUE(db->in2out_key.key64 == ki->dk.ipv4_key.key64)
+ && userdb->ipv6[0] == ki->dk.ipv6[0]
+ && userdb->ipv6[1] == ki->dk.ipv6[1]
+ && userdb->ipv6[2] == ki->dk.ipv6[2]
+ && userdb->ipv6[3] == ki->dk.ipv6[3]) {
+ DSLITE_PRINTF(1,"MDBLU success..%u\n", index);
+ return db;
+ }
+ index = db->in2out_hash.next;
+ } while (index != EMPTY);
+
+ DSLITE_PRINTF(1,"MDBLU Entry does not exist..\n");
+ return (NULL);
+}
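+
+/*
+ * Note: the bucket is derived from the full DS-Lite key (B4 IPv6 address
+ * plus inner IPv4 key), but main db entries store only the IPv4 key; the
+ * IPv6 half of the match is verified through the owning user db entry.
+ */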
+
+cnat_user_db_entry_t*
+dslite_user_db_lookup_entry(dslite_db_key_bucket_t *uki)
+{
+ u64 a, b, c;
+ u32 index;
+ cnat_user_db_entry_t *udb=NULL;
+
+ DSLITE_V6_GET_HASH((&(uki->dk)),
+ uki->bucket,
+ CNAT_USER_HASH_MASK)
+
+ DSLITE_PRINTF(1,"UDBLU hash..%u\n", uki->bucket);
+
+ /* now: index in user vector */
+ index = cnat_user_hash[uki->bucket].next;
+ if (PREDICT_TRUE(index != EMPTY)) {
+ DSLITE_PRINTF(1,"UDBLU hash table entry not MT..\n");
+ do {
+ udb = cnat_user_db + index;
+ if (PREDICT_FALSE(udb->key.key64 == uki->dk.ipv4_key.key64)
+ && udb->ipv6[0] == uki->dk.ipv6[0]
+ && udb->ipv6[1] == uki->dk.ipv6[1]
+ && udb->ipv6[2] == uki->dk.ipv6[2]
+ && udb->ipv6[3] == uki->dk.ipv6[3]) {
+ DSLITE_PRINTF(1,"UDBLU success..%u\n", index);
+ return udb;
+ }
+ index = udb->user_hash.next;
+ } while (index != EMPTY);
+ }
+ DSLITE_PRINTF(1,"UDBLU Entry doesnt exist..\n");
+ return (NULL);
+}
+
+cnat_user_db_entry_t*
+dslite_user_db_create_entry(dslite_db_key_bucket_t *uki,
+ u32 portmap_index)
+{
+ cnat_user_db_entry_t *udb = NULL;
+
+ pool_get(cnat_user_db, udb);
+ memset(udb, 0, sizeof(*udb));
+
+ udb->ntranslations = 1;
+ udb->portmap_index = portmap_index;
+// udb->key.key64 = uki->k.key64;
+
+ udb->key.key64 = uki->dk.ipv4_key.key64;
+ udb->ipv6[0] = uki->dk.ipv6[0];
+ udb->ipv6[1] = uki->dk.ipv6[1];
+ udb->ipv6[2] = uki->dk.ipv6[2];
+ udb->ipv6[3] = uki->dk.ipv6[3];
+
+ udb->flags |= CNAT_USER_DB_DSLITE_FLAG;
+ /* Add this user to the head of the bucket chain */
+ udb->user_hash.next =
+ cnat_user_hash[uki->bucket].next;
+ cnat_user_hash[uki->bucket].next = udb - cnat_user_db;
+
+#ifndef NO_BULK_LOGGING
+ INIT_BULK_CACHE(udb)
+#endif /* NO_BULK_LOGGING */
+
+ return udb;
+}
+
+#ifndef TOBE_PORTED
+cnat_main_db_entry_t*
+dslite_create_main_db_entry_and_hash(dslite_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_user_db_entry_t *udb)
+{
+ return 0;
+}
+#else
+cnat_main_db_entry_t*
+dslite_create_main_db_entry_and_hash(dslite_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_user_db_entry_t *udb)
+{
+ u64 a, b, c;
+ u32 db_index;
+ cnat_main_db_entry_t *db = NULL;
+
+ pool_get(cnat_main_db, db);
+ memset(db, 0, sizeof(*db));
+
+ db_index = db - cnat_main_db;
+ db->in2out_key.k.ipv4 = ki->dk.ipv4_key.k.ipv4;
+ db->in2out_key.k.port = ki->dk.ipv4_key.k.port;
+ db->in2out_key.k.vrf = ki->dk.ipv4_key.k.vrf;
+ db->out2in_key.k.ipv4 = ko->k.k.ipv4;
+ db->out2in_key.k.port = ko->k.k.port;
+ db->out2in_key.k.vrf = ko->k.k.vrf;
+
+ db->user_ports.next = db_index;
+ db->user_ports.prev = db_index;
+ db->user_index = udb - cnat_user_db;
+ //db->portmap_index = udb->portmap_index;
+ db->flags |= CNAT_DB_DSLITE_FLAG;
+
+ if (PREDICT_FALSE(udb->ntranslations == 1)) {
+ /*
+ * first port for this src vrf/src ip addr
+ */
+ udb->translation_list_head_index = db_index;
+ DSLITE_PRINTF(1,"First translation of this user..\n");
+ } else {
+ index_dlist_addtail(udb->translation_list_head_index,
+ (u8 *)cnat_main_db, sizeof(cnat_main_db[0]),
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports),
+ db_index);
+ }
+
+ /*
+ * setup o2i hash key
+ */
+ CNAT_V4_GET_HASH(ko->k.key64,
+ ko->bucket,
+ CNAT_MAIN_HASH_MASK)
+ db->out2in_hash.next = cnat_out2in_hash[ko->bucket].next;
+ cnat_out2in_hash[ko->bucket].next = db_index;
+ /*
+ * setup i2o hash key, bucket is already calculated
+ */
+ db->in2out_hash.next = cnat_in2out_hash[ki->bucket].next;
+ cnat_in2out_hash[ki->bucket].next = db_index;
+
+ DSLITE_PRINTF(1,"Create main db and hash..%u %u %u %u %x\n",
+ ki->bucket, ko->bucket,
+ db_index, db->user_index, ko->k.key64);
+
+#if DEBUG > 1
+ printf("\nMy_Instance_Number %d: Bucket %d, Db_Index %d",
+ my_instance_number, ki->bucket, db_index);
+ printf("\nInside (VRF 0x%x, IP 0x%x, PORT 0x%x)",
+ db->in2out_key.k.vrf, db->in2out_key.k.ipv4, db->in2out_key.k.port);
+ printf("\nOutside (VRF 0x%x, IP 0x%x, PORT 0x%x)",
+ db->out2in_key.k.vrf, db->out2in_key.k.ipv4, db->out2in_key.k.port);
+ printf("\nUser Index %d, IP 0x%x",
+ db->user_index, udb->key.k.ipv4);
+#endif
+
+ //nat44_dslite_common_stats[DSLITE_COMMON_STATS].active_translations++;
+
+ return db;
+}
+
+static inline void handle_dslite_port_exceeded_logging(
+ cnat_user_db_entry_t *udb,
+ dslite_key_t * key,
+ dslite_table_entry_t *dslite_entry_ptr)
+{
+
+ if(PREDICT_TRUE(udb->flags & CNAT_USER_DB_PORT_LIMIT_EXCEEDED)) {
+ /* Already logged ..*/
+ return;
+ }
+
+ /* else, set the flag and call the log API */
+ udb->flags = udb->flags | CNAT_USER_DB_PORT_LIMIT_EXCEEDED;
+ cnat_log_ds_lite_port_limit_exceeded(key, dslite_entry_ptr);
+ return;
+}
+#endif
+
+inline void handle_cnat_port_exceeded_logging(
+ cnat_user_db_entry_t *udb,
+ cnat_key_t * key,
+ cnat_vrfmap_t *vrfmap)
+{
+
+ if(PREDICT_TRUE(udb->flags & CNAT_USER_DB_PORT_LIMIT_EXCEEDED)) {
+ /* Already logged ..*/
+ return;
+ }
+
+ /* else, set the flag and call the log API */
+ udb->flags = udb->flags | CNAT_USER_DB_PORT_LIMIT_EXCEEDED;
+ cnat_log_nat44_port_limit_exceeded(key,vrfmap);
+ return;
+}
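+
+/*
+ * Note: in both helpers above, the CNAT_USER_DB_PORT_LIMIT_EXCEEDED flag
+ * latches the first exceed event so the condition is logged only once
+ * per user; CHECK_CLEAR_PORT_LIMIT_EXCEED_FLAG clears it again once the
+ * user's usage drops below the limit.
+ */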
+
+#ifndef TOBE_PORTED
+cnat_main_db_entry_t*
+dslite_get_main_db_entry_v2(dslite_db_key_bucket_t *ki,
+ port_pair_t port_pair_type,
+ port_type_t port_type,
+ cnat_gen_icmp_info *info,
+ dslite_table_entry_t *dslite_entry_ptr,
+ cnat_key_t *dest_info)
+{
+ return 0;
+}
+#else
+/*
+ * This function is called by the exception node
+ * when a lookup fails in the i2o node.
+ *
+ * If the per-user port limit is reached, the
+ * user_db_entry pointer is set and error == CNAT_OUT_LIMIT.
+ */
+cnat_main_db_entry_t*
+dslite_get_main_db_entry_v2(dslite_db_key_bucket_t *ki,
+ port_pair_t port_pair_type,
+ port_type_t port_type,
+ cnat_gen_icmp_info *info,
+ dslite_table_entry_t *dslite_entry_ptr,
+ cnat_key_t *dest_info)
+{
+ u16 protocol;
+ cnat_errno_t rv;
+ dslite_db_key_bucket_t u_ki;
+ cnat_db_key_bucket_t ko;
+ u32 my_index, free_main, free_user;
+ u32 current_timestamp;
+ cnat_vrfmap_t *my_vrfmap =0;
+ u16 my_vrfmap_index;
+ cnat_portmap_v2_t *pm =0;
+ cnat_user_db_entry_t *udb = 0;
+ cnat_main_db_entry_t *db = 0;
+ pool_header_t *h;
+ u16 dslite_id = dslite_entry_ptr->dslite_id;
+
+#ifndef NO_BULK_LOGGING
+ int nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+#endif
+ /*
+ * Need to try the lookup again: at high line rate a
+ * second pkt may arrive here before the first pkt
+ * has finished creating the entry.
+ */
+ info->gen_icmp_msg = CNAT_NO_ICMP_MSG;
+ info->error = CNAT_SUCCESS;
+ db = dslite_main_db_lookup_entry(ki);
+ if (PREDICT_TRUE(db)) {
+ /* What if the source is talking to a
+ * new dest now? We have to handle this case:
+ * create a session db entry and log it.
+ */
+ if(PREDICT_FALSE((!dest_info->k.ipv4) && (!dest_info->k.port))) {
+ return db; /* if dest_info is zero, don't create a session */
+ }
+
+ if(PREDICT_TRUE((db->dst_ipv4 == dest_info->k.ipv4) &&
+ (db->dst_port == dest_info->k.port))) {
+ return db;
+ }
+ dest_info->k.vrf = db->in2out_key.k.vrf;
+ /* Src is indeed talking to a different dest */
+ cnat_session_entry_t *session_db2 = NULL;
+ if(PREDICT_TRUE(db->nsessions == 1)) {
+ session_db2 = cnat_handle_1to2_session(db, dest_info);
+ if(PREDICT_TRUE(session_db2 != NULL)) {
+ CNAT_DB_TIMEOUT_RST(session_db2);
+ return db;
+ } else {
+ info->error = CNAT_ERR_NO_SESSION_DB;
+ return NULL;
+ }
+ } else if(PREDICT_FALSE(db->nsessions == 0)) {
+ /* Should be a static entry; nsessions == 0 should never happen otherwise
+ */
+ if(PREDICT_TRUE(dest_info->k.ipv4 != 0)) {
+ cnat_add_dest_n_log(db, dest_info);
+ }
+ return db;
+ } else {
+ /* The src has already created multiple sessions.. very rare
+ */
+ session_db2 = cnat_create_session_db_entry(dest_info,
+ db, TRUE);
+ if(PREDICT_TRUE(session_db2 != NULL)) {
+ CNAT_DB_TIMEOUT_RST(session_db2);
+ return db;
+ } else {
+ info->error = CNAT_ERR_NO_SESSION_DB;
+ return NULL;
+ }
+ }
+
+ }
+
+ /*
+ * step 1. check whether the outside vrf is configured,
+ * and find the set of portmaps for the outside vrf.
+ * The inside vrf is one-to-one mapped to the outside vrf;
+ * the key is vrf and ip only.
+ * ki.k.k.vrf has protocol bits, mask them out.
+ */
+ protocol = ki->dk.ipv4_key.k.vrf & CNAT_PRO_MASK;
+ u_ki.dk.ipv4_key.k.vrf = ki->dk.ipv4_key.k.vrf & CNAT_VRF_MASK;
+#ifdef DSLITE_USER_IPV4
+ u_ki.dk.ipv4_key.k.ipv4 = ki->dk.ipv4_key.k.ipv4;
+#else
+ /*
+ * The inside ipv4 address should be masked if the port limit
+ * needs to be enforced at the B4 element level.
+ */
+ u_ki.dk.ipv4_key.k.ipv4 = 0;
+#endif
+ u_ki.dk.ipv4_key.k.port = 0;
+
+ u_ki.dk.ipv6[0] = ki->dk.ipv6[0];
+ u_ki.dk.ipv6[1] = ki->dk.ipv6[1];
+ u_ki.dk.ipv6[2] = ki->dk.ipv6[2];
+ u_ki.dk.ipv6[3] = ki->dk.ipv6[3];
+
+ my_vrfmap_index = vrf_map_array[u_ki.dk.ipv4_key.k.vrf];
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+/* Checking if the inst entry is active or not is done much earlier
+ */
+#if 0
+ my_vrfmap_index = vrf_map_array[u_ki.k.k.vrf];
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+ my_vrfmap_entry_found = ((my_vrfmap_index != VRF_MAP_ENTRY_EMPTY) &&
+ (my_vrfmap->status == S_RUN) &&
+ (my_vrfmap->i_vrf == u_ki.k.k.vrf));
+
+ if (PREDICT_FALSE(!my_vrfmap_entry_found)) {
+ u32 arr[] = {ki->k.k.vrf, ki->k.k.ipv4, ki->k.k.port};
+ if ((my_vrfmap_index == VRF_MAP_ENTRY_EMPTY) ||
+ (my_vrfmap->i_vrf == u_ki.k.k.vrf)) {
+ info->error = CNAT_NO_CONFIG;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_NO_CONFIG)
+ spp_printf(CNAT_NO_CONFIG_ERROR, 3, arr);
+ } else {
+ info->error = CNAT_NO_VRF_RUN;
+ CNAT_DEBUG_INSIDE_ERR(CNAT_NO_VRF_RUN)
+ spp_printf(CNAT_NO_VRF_RUN_ERROR, 3, arr);
+ }
+
+ return (NULL);
+ }
+#endif
+/*
+ dslite_inst_ptr = dslite_nat44_config_table[dslite_inst_id];
+*/
+ pm = dslite_entry_ptr->portmap_list;
+ //pm = my_vrfmap->portmap_list;
+
+ /*
+ * set o2i key with protocl bits
+ */
+ ko.k.k.vrf = dslite_entry_ptr->o_vrf | protocol;
+ //ko.k.k.vrf = my_vrfmap->o_vrf | protocol;
+
+ /*
+ * step 2. check if src vrf, src ip addr is already
+ * in the user db
+ * if yes, use PORT_ALLOC_DIRECTED
+ * if no, use PORT_ALLOC_ANY since it is first time
+ */
+ udb = dslite_user_db_lookup_entry(&u_ki);
+ if (PREDICT_TRUE(udb)) {
+ /*
+ * not the first port allocation for this user,
+ * so check the limit
+ */
+ if (PREDICT_FALSE(udb->ntranslations >=
+ dslite_entry_ptr->cnat_main_db_max_ports_per_user)) {
+ //cnat_main_db_max_ports_per_user))
+
+ /* Check for the port type here. If we are getting
+ * a STATIC PORT, allow the config.
+ */
+ if (PREDICT_TRUE(port_type != PORT_TYPE_STATIC)) {
+ info->error = CNAT_OUT_LIMIT;
+ DSLITE_DEBUG_INSIDE_ERR(CNAT_OUT_LIMIT)
+ port_exceeded_msg_log(u_ki.dk.ipv4_key.k.ipv4, u_ki.dk.ipv4_key.k.vrf);
+ nat44_dslite_common_stats[dslite_id].in2out_drops_port_limit_exceeded ++;
+ u_ki.dk.ipv4_key.k.vrf = ki->dk.ipv4_key.k.vrf;
+ u_ki.dk.ipv4_key.k.port = ki->dk.ipv4_key.k.port;
+ handle_dslite_port_exceeded_logging(udb, &u_ki.dk, dslite_entry_ptr);
+ return (NULL);
+ }
+ }
+
+ CHECK_CLEAR_PORT_LIMIT_EXCEED_FLAG(udb,
+ dslite_entry_ptr->cnat_main_db_max_ports_per_user)
+
+ /*
+ * check if main db has space to accommodate a new entry
+ */
+ h = pool_header(cnat_main_db);
+
+ free_main = vec_len(h->free_indices) - 1;
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_system_limit_reached ++;
+ DSLITE_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+
+ current_timestamp = spp_trace_log_get_unix_time_in_seconds();
+ if (PREDICT_FALSE((current_timestamp - last_log_timestamp) >
+ 1800)) {
+ spp_printf(CNAT_SESSION_THRESH_EXCEEDED, 0, NULL);
+ last_log_timestamp = current_timestamp;
+ }
+
+#ifdef UT_TEST_CODE
+ printf("Limit reached : OLD USER");
+#endif
+ return NULL;
+ }
+
+ /*
+ * allocate port, from existing mapping
+ */
+ my_index = udb->portmap_index;
+
+ if (PREDICT_FALSE(port_type == PORT_TYPE_STATIC)) {
+ rv = cnat_static_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ ki->dk.ipv4_key.k.ipv4,
+ ki->dk.ipv4_key.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ STAT_PORT_RANGE_FROM_INST_PTR(dslite_entry_ptr)
+#ifndef NO_BULK_LOGGING
+ , udb,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr),
+ &nfv9_log_req
+#endif
+ , my_vrfmap->ip_n_to_1
+ );
+ } else if (PREDICT_TRUE(port_type != PORT_TYPE_RTSP) ) {
+
+ rv = cnat_dynamic_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ STAT_PORT_RANGE_FROM_INST_PTR(dslite_entry_ptr)
+#ifndef NO_BULK_LOGGING
+ , udb,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr),
+ &nfv9_log_req
+#endif
+ , 0,
+ &(dslite_entry_ptr->rseed_ip)
+ );
+ DSLITE_PRINTF(1,"D_PORT_ALLOC %x %u\n", ko.k.k.ipv4, ko.k.k.port);
+ } else {
+ /*
+ * For RTSP, two translation entries are created;
+ * check if main db has space to accommodate two new entries
+ */
+ free_main = free_main - 1;
+
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_system_limit_reached ++;
+ DSLITE_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+
+ return NULL;
+ } else {
+
+ rv = cnat_dynamic_port_alloc_rtsp_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ ki->dk.ipv4_key.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ STAT_PORT_RANGE_FROM_INST_PTR(dslite_entry_ptr)
+#ifndef NO_BULK_LOGGING
+ , udb,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr),
+ &nfv9_log_req
+#endif
+ , &(dslite_entry_ptr->rseed_ip)
+ );
+ }
+ }
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ DSLITE_PRINTF(1,"D_PORT_ALLOC port alloc error\n");
+ info->error = rv;
+ DSLITE_DEBUG_INSIDE_ERR(rv)
+ nat44_dslite_common_stats[dslite_id].in2out_drops_resource_depletion ++;
+ log_port_alloc_error(rv, &(ki->dk.ipv4_key));
+ return (NULL);
+ }
+ /*
+ * increment port in use for this user
+ */
+ udb->ntranslations += 1;
+ } else {
+ /*
+ * first-time port allocation for this user
+ */
+
+ /*
+ * Do not create entry if port limit is invalid
+ */
+ if (PREDICT_FALSE(!(dslite_entry_ptr->cnat_main_db_max_ports_per_user))) {
+ if (PREDICT_TRUE(port_type != PORT_TYPE_STATIC)) {
+ info->error = CNAT_OUT_LIMIT;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_port_limit_exceeded ++;
+ port_exceeded_msg_log(u_ki.dk.ipv4_key.k.ipv4, u_ki.dk.ipv4_key.k.vrf);
+ DSLITE_DEBUG_INSIDE_ERR(CNAT_OUT_LIMIT)
+ return (NULL);
+ }
+ }
+
+ /*
+ * Check if main db has space for a new entry.
+ * Allowing a user db entry to be created while the main db has no
+ * free entries would allocate a port to that user and then waste
+ * that port, hence the check is done here.
+ */
+ h = pool_header(cnat_main_db);
+ free_main = vec_len(h->free_indices) - 1;
+
+ h = pool_header(cnat_user_db);
+ free_user = vec_len(h->free_indices) - 1;
+
+ /*
+ * If either main_db or user_db has no free entries,
+ * bail out with an appropriate error
+ */
+ if (PREDICT_FALSE(!(free_main && free_user))) {
+ u32 log_error;
+ if(free_main) {
+ info->error = CNAT_USER_DB_LIMIT;
+ log_error = CNAT_USER_DB_LIMIT_ERROR;
+ } else {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ log_error = CNAT_MAIN_DB_LIMIT_ERROR;
+ }
+ nat44_dslite_common_stats[dslite_id].in2out_drops_system_limit_reached ++;
+ DSLITE_DEBUG_INSIDE_ERR(info->error)
+ spp_printf(log_error, 0, 0);
+ return NULL;
+ }
+
+ if (PREDICT_FALSE(port_type == PORT_TYPE_STATIC)) {
+ rv = cnat_static_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_ANY,
+ port_pair_type,
+ ki->dk.ipv4_key.k.ipv4,
+ ki->dk.ipv4_key.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ STAT_PORT_RANGE_FROM_INST_PTR(dslite_entry_ptr)
+#ifndef NO_BULK_LOGGING
+ , NULL,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr),
+ &nfv9_log_req
+#endif
+ , my_vrfmap->ip_n_to_1
+
+ );
+ } else if (PREDICT_TRUE(port_type != PORT_TYPE_RTSP)) {
+ rv = cnat_dynamic_port_alloc_v2_bulk(pm,
+ PORT_ALLOC_ANY,
+ port_pair_type,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ STAT_PORT_RANGE_FROM_INST_PTR(dslite_entry_ptr)
+#ifndef NO_BULK_LOGGING
+ , NULL,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr),
+ &nfv9_log_req
+#endif
+ , 0,
+ &(dslite_entry_ptr->rseed_ip)
+ );
+ DSLITE_PRINTF(1,"NU:D PORT ALLOC..%x %u\n", ko.k.k.ipv4,
+ ko.k.k.port);
+
+ } else {
+ /*
+ * For RTSP, two translation entries are created;
+ * check if main db has space to accommodate two new entries
+ */
+ free_main = free_main - 1;
+
+ if (PREDICT_FALSE(!free_main)) {
+ info->error = CNAT_MAIN_DB_LIMIT;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_system_limit_reached ++;
+ DSLITE_DEBUG_INSIDE_ERR(CNAT_MAIN_DB_LIMIT)
+
+ return NULL;
+ } else {
+
+ rv = cnat_dynamic_port_alloc_rtsp_bulk(pm,
+ PORT_ALLOC_DIRECTED,
+ port_pair_type,
+ ki->dk.ipv4_key.k.port,
+ &my_index,
+ &(ko.k.k.ipv4),
+ &(ko.k.k.port),
+ STAT_PORT_RANGE_FROM_INST_PTR(dslite_entry_ptr)
+#ifndef NO_BULK_LOGGING
+ , NULL,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr),
+ &nfv9_log_req
+#endif
+ , &(dslite_entry_ptr->rseed_ip)
+ );
+ /* TODO: Add the port pair flag here */
+ }
+ }
+
+
+
+ if (PREDICT_FALSE(rv != CNAT_SUCCESS)) {
+ DSLITE_PRINTF(1,"NU:D_PORT_ALLOC port alloc error\n");
+ info->error = rv;
+ nat44_dslite_common_stats[dslite_id].in2out_drops_resource_depletion ++;
+ DSLITE_DEBUG_INSIDE_ERR(rv)
+ log_port_alloc_error(rv, &(ki->dk.ipv4_key));
+ return (NULL);
+ }
+ /*
+ * create entry in user db
+ */
+ udb = dslite_user_db_create_entry(&u_ki, my_index);
+ nat44_dslite_common_stats[dslite_id].num_subscribers++;
+ DSLITE_PRINTF(1,"UDB crete entry done..\n");
+#ifndef NO_BULK_LOGGING
+ if(PREDICT_TRUE(udb && (BULK_ALLOC_NOT_ATTEMPTED != nfv9_log_req))) {
+ cnat_update_bulk_range_cache(udb, ko.k.k.port,
+ BULKSIZE_FROM_VRFMAP(dslite_entry_ptr));
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ }
+
+ /*
+ * step 3:
+ * outside port is allocated for this src vrf/src ip addr
+ * 1)create a new entry in main db
+ * 2)setup cnat_out2in_hash key
+ * 3)setup cnat_in2out_hash key
+ */
+ db = dslite_create_main_db_entry_and_hash(ki, &ko, udb);
+ DSLITE_PRINTF(1,"dslite_create_main_db_entry_and_hash done..\n");
+ //db->vrfmap_index = my_vrfmap - cnat_map_by_vrf;
+ db->dslite_nat44_inst_id = dslite_id;
+ nat44_dslite_common_stats[dslite_id].active_translations++;
+ if (PREDICT_FALSE(port_type == PORT_TYPE_STATIC)) {
+ nat44_dslite_common_stats[dslite_id].num_static_translations++;
+ } else {
+ nat44_dslite_common_stats[dslite_id].num_dynamic_translations++;
+ }
+
+ dslite_translation_create_count++;
+
+ db->dst_ipv4 = dest_info->k.ipv4;
+ db->dst_port = dest_info->k.port;
+ if(PREDICT_TRUE(db->dst_ipv4 || db->dst_port)) {
+ /* for static fwding, let the nsessions remain zero */
+ db->nsessions++;
+ }
+
+ /*
+ * don't forget logging:
+ * the logging API is called unconditionally;
+ * the logging configuration check is done inside the inline function
+ */
+ if(PREDICT_FALSE(nfv9_log_req != CACHE_ALLOC_NO_LOG_REQUIRED)) {
+ if(PREDICT_FALSE( dslite_entry_ptr->nf_logging_policy ==
+ SESSION_LOG_ENABLE)) {
+ if(PREDICT_TRUE(db->dst_ipv4 || db->dst_port)) {
+ cnat_nfv9_ds_lite_log_session_create(db,
+ dslite_entry_ptr,NULL);
+ }
+ } else {
+ cnat_nfv9_ds_lite_mapping_create(db,dslite_entry_ptr
+#ifndef NO_BULK_LOGGING
+ ,nfv9_log_req
+#endif
+ );
+ }
+ if(PREDICT_TRUE((dslite_entry_ptr->syslog_logging_policy != SESSION_LOG_ENABLE) ||
+ (db->dst_ipv4 || db->dst_port))) {
+ cnat_syslog_ds_lite_mapping_create(db,dslite_entry_ptr,NULL
+#ifndef NO_BULK_LOGGING
+ ,nfv9_log_req
+#endif
+ );
+ }
+ }
+
+#if 0
+ if (PREDICT_FALSE(port_pair_type == PORT_PAIR)) {
+ cnat_main_db_entry_t *db2 = 0;
+ dslite_db_key_bucket_t new_ki = *ki;
+ u64 a, b, c;
+
+ new_ki.k.k.port += 1;
+ ko.k.k.port += 1;
+
+ CNAT_V4_GET_HASH(new_ki.k.key64, new_ki.bucket,
+ CNAT_MAIN_HASH_MASK);
+
+ db2 = cnat_create_main_db_entry_and_hash(&new_ki, &ko, udb);
+
+ translation_create_count ++;
+ db2->dslite_nat44_inst_id = dslite_id;
+ db2->entry_expires = cnat_current_time;
+ db2->flags |= CNAT_DB_FLAG_ALG_ENTRY;
+ udb->ntranslations += 1;
+#ifndef NO_BULK_LOGGING
+ if(PREDICT_FALSE(nfv9_log_req == BULK_ALLOC_NOT_ATTEMPTED))
+ cnat_nfv9_log_mapping_create(db2, my_vrfmap, nfv9_log_req);
+#else
+ cnat_nfv9_log_mapping_create(db2, my_vrfmap);
+#endif
+ }
+#endif
+ return db;
+}
+#endif /* TOBE_PORTED */
+
+#if 0
+/* TOBE_PORTED */
+uword
+cnat_db_v2_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return 0;
+}
+VLIB_REGISTER_NODE (cnat_db_v2_node) = {
+ .function = cnat_db_v2_node_fn,
+ .name = "vcgn-db-v2",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_db_v2_error_strings),
+ .error_strings = cnat_db_v2_error_strings,
+
+ .n_next_nodes = CNAT_DB_V2_DROP,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_DB_V2_DROP] = "error-drop",
+ },
+};
+#endif
+void cnat_db_v2_init (void)
+{
+
+ u32 i, n;
+ cnat_timeout_db_entry_t * tdb __attribute__((unused));
+
+ cgse_nat_db_entry_t *comb_db __attribute__((unused));
+ cgse_nat_user_db_entry_t *comb_user __attribute__((unused));
+ cgse_nat_session_db_entry_t *comb_session __attribute__((unused));
+
+ n = CNAT_DB_SIZE*1.15; /* add 15% LB margin */
+
+ /*
+ * We also make it multiple of NUM_BITS_IN_UWORD for better
+ * DB scanning algorithm
+ */
+ if (n % NUM_BITS_IN_UWORD)
+ n += (NUM_BITS_IN_UWORD - (n % NUM_BITS_IN_UWORD));
+
+ pool_alloc(cgse_nat_db,n);
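+ /*
+ * Prime the pool: get and then put back every element so that all n
+ * entries are allocated up front and end up on the free list, which
+ * avoids element allocation in the data path. The same pattern is
+ * repeated for the session, user and timeout pools below.
+ */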
+ for(i=0; i< n; i++) {
+ pool_get(cgse_nat_db, comb_db);
+ }
+
+ for(i=0; i< n; i++) {
+ pool_put(cgse_nat_db, cgse_nat_db + i);
+ }
+
+ cnat_main_db = &cgse_nat_db->nat44_main_db;
+
+ /* For Sessions */
+ if(PLATFORM_DBL_SUPPORT) {
+ /* create session table for NAT44 and NAT64 itself */
+ printf("DBL Support exist %d\n", PLATFORM_DBL_SUPPORT);
+ n = CNAT_SESSION_DB_SIZE * 1.15; /* add 15% LB margin */
+ } else {
+ /* Create session table for NAT64 only */
+ printf("DBL Support Not exist\n");
+ n = NAT64_MAIN_DB_SIZE * 1.15; /* add 15% LB margin */
+ }
+
+ /*
+ * We also make it multiple of NUM_BITS_IN_UWORD for better
+ * DB scanning algorithm
+ */
+ if (n % NUM_BITS_IN_UWORD)
+ n += (NUM_BITS_IN_UWORD - (n % NUM_BITS_IN_UWORD));
+
+ pool_alloc(cgse_session_db,n);
+ for(i=0; i< n; i++) {
+ pool_get(cgse_session_db, comb_session);
+ }
+
+ for(i=0; i< n; i++) {
+ pool_put(cgse_session_db, cgse_session_db + i);
+ }
+
+ cnat_session_db = &cgse_session_db->nat44_session_db;
+
+ vec_validate(cnat_out2in_hash, CNAT_MAIN_HASH_MASK);
+ memset(cnat_out2in_hash, 0xff, CNAT_MAIN_HASH_SIZE*sizeof(index_slist_t));
+
+ vec_validate(cnat_in2out_hash, CNAT_MAIN_HASH_MASK);
+ memset(cnat_in2out_hash, 0xff, CNAT_MAIN_HASH_SIZE*sizeof(index_slist_t));
+
+ vec_validate(cnat_session_hash, CNAT_SESSION_HASH_MASK);
+ memset(cnat_session_hash, 0xff, CNAT_SESSION_HASH_SIZE*sizeof(index_slist_t));
+
+ n = CNAT_USER_DB_SIZE * 1.15; /* add 15% LB margin */
+ if (n % NUM_BITS_IN_UWORD)
+ n += (NUM_BITS_IN_UWORD - (n % NUM_BITS_IN_UWORD));
+
+ pool_alloc(cgse_user_db,n);
+ for(i=0; i< n; i++) {
+ pool_get(cgse_user_db, comb_user);
+ }
+
+ for(i=0; i< n; i++) {
+ pool_put(cgse_user_db, cgse_user_db + i);
+ }
+
+ cnat_user_db = &cgse_user_db->nat44_user_db;
+
+ vec_validate(cnat_user_hash, CNAT_USER_HASH_MASK);
+ memset(cnat_user_hash, 0xff, CNAT_USER_HASH_SIZE*sizeof(index_slist_t));
+
+ n = CNAT_TIMEOUT_HASH_SIZE; /* use hash size as db size for LB margin */
+ for(i=0; i< n; i++) {
+ pool_get(cnat_timeout_db, tdb);
+ }
+
+ for(i=0; i< n; i++) {
+ pool_put(cnat_timeout_db, cnat_timeout_db + i);
+ }
+
+ vec_validate(cnat_timeout_hash, CNAT_TIMEOUT_HASH_MASK);
+ memset(cnat_timeout_hash, 0xff, CNAT_TIMEOUT_HASH_SIZE*sizeof(index_slist_t));
+
+#ifdef TOBE_PORTED
+ for (i=0;i<CNAT_MAX_VRFMAP_ENTRIES; i++) {
+ svi_params_array[i].svi_type = CGSE_SVI_TYPE_INFRA;
+ }
+#endif
+ cnat_db_init_done = 1;
+ printf("CNAT DB init is successful\n");
+ return;
+}
diff --git a/vnet/vnet/vcgn/cnat_debug_msg_handler.c b/vnet/vnet/vcgn/cnat_debug_msg_handler.c
new file mode 100644
index 00000000000..9c84ac1e856
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_debug_msg_handler.c
@@ -0,0 +1,1780 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_debug_msg_handler.c - debug command
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+#include "cnat_cli.h"
+
+u32 global_debug_flag = CNAT_DEBUG_NONE;
+u16 debug_i_vrf = CNAT_DEBUG_NONE;
+u32 debug_i_flag = CNAT_DEBUG_NONE;
+u32 debug_i_addr_start = CNAT_DEBUG_NONE;
+u32 debug_i_addr_end = CNAT_DEBUG_NONE;
+
+u16 debug_o_vrf = CNAT_DEBUG_NONE;
+u32 debug_o_flag = CNAT_DEBUG_NONE;
+u32 debug_o_addr_start = CNAT_DEBUG_NONE;
+u32 debug_o_addr_end = CNAT_DEBUG_NONE;
+
+u32 udp_inside_checksum_disable = 0;
+u32 udp_outside_checksum_disable = 0;
+u32 udp_inside_packet_dump_enable = 0;
+u32 udp_outside_packet_dump_enable = 0;
+
+u32 tcp_logging_enable_flag = 0;
+
+u32 icmp_debug_flag = 0;
+u32 frag_debug_flag = 0;
+
+u32 nfv9_logging_debug_flag = 0;
+u32 syslog_debug_flag = 0;
+
+u32 summary_stats_debug_flag = 0;
+
+/*
+ * By default we set the config debug level to 1
+ */
+u32 config_debug_level = 1;
+
+#ifdef TOBE_PORTED
+extern void show_bulk_port_stats();
+extern void clear_bulk_port_stats();
+extern void show_bulk_port_allocation(u16 in_vrfid, u32 inside_ip);
+extern void set_bulk_size_to_all_vrfs(int bulk_size);
+
+u32 *cnat_debug_addr_list;
+
+extern int global_pd_dbg_lvl;
+extern int global_pi_dbg_lvl;
+extern int global_l2_dbg_lvl;
+extern u32 cnat_pptp_debug_flag;
+extern u32 cnat_pcp_debug_flag;
+
+void spp_api_cnat_get_cgn_db_summary
+(spp_api_cnat_generic_command_request_t *);
+
+void spp_api_cnat_v4_debug_dummy_t_handler
+(spp_api_cnat_v4_debug_dummy_t *mp)
+{
+ u32 arr[] = { DEBUG_DUMMY };
+ spp_printf(CNAT_DUMMY_HANDLER_HIT, 1, arr);
+ if(global_pd_dbg_lvl) {
+ PLATFORM_DEBUG_PRINT("\n invalid debug command received: message id is 0\n");
+ }
+ mp->rc = CNAT_ERR_INVALID_MSG_ID;
+
+}
+
+void spp_api_cnat_v4_debug_dummy_max_t_handler
+(spp_api_cnat_v4_debug_dummy_max_t *mp)
+{
+ u32 arr[] = { DEBUG_DUMMY_MAX };
+ spp_printf(CNAT_DUMMY_HANDLER_HIT, 1, arr);
+ if(global_pd_dbg_lvl) {
+ PLATFORM_DEBUG_PRINT("\n invalid debug command received: message id is out of range\n");
+ }
+ mp->rc = CNAT_ERR_INVALID_MSG_ID;
+
+}
+
+
+void spp_api_cnat_v4_debug_global_t_handler
+(spp_api_cnat_v4_debug_global_t *mp)
+{
+ if ((mp->debug_flag == CNAT_DEBUG_GLOBAL_ERR) ||
+ (mp->debug_flag == CNAT_DEBUG_GLOBAL_ALL) ||
+ (mp->debug_flag == CNAT_DEBUG_NONE)) {
+ mp->rc = CNAT_SUCCESS;
+ global_debug_flag = mp->debug_flag;
+ return;
+ }
+
+ mp->rc = CNAT_ERR_PARSER;
+ if(global_pd_dbg_lvl) {
+ PLATFORM_DEBUG_PRINT("invalid global debug flag %x\n",
+ mp->debug_flag);
+ }
+ return;
+}
+
+void spp_node_print_cnat_counters()
+{
+ if (cnat_global_counters.nfv9_downstream_constipation_count) {
+ PLATFORM_DEBUG_PRINT("\nNF downstream constipation count: %llu\n",
+ cnat_global_counters.nfv9_downstream_constipation_count);
+ }
+
+ if (xlat_global_counters.v4_to_v6_frag_invalid_uidb_drop_count ||
+ xlat_global_counters.v6_to_v4_frag_invalid_uidb_drop_count ||
+ xlat_global_counters.v4_to_v6_icmp_invalid_uidb_drop_count ||
+ xlat_global_counters.v6_to_v4_icmp_invalid_uidb_drop_count ||
+ xlat_global_counters.v4_to_v6_tcp_invalid_uidb_drop_count ||
+ xlat_global_counters.v6_to_v4_tcp_invalid_uidb_drop_count ||
+ xlat_global_counters.v4_to_v6_udp_invalid_uidb_drop_count ||
+ xlat_global_counters.v6_to_v4_udp_invalid_uidb_drop_count ||
+ xlat_global_counters.v4_to_v6_udp_crc_zero_invalid_uidb_drop_count) {
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v4_to_v6 frag invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v4_to_v6_frag_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v6_to_v4 frag invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v6_to_v4_frag_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v4_to_v6 icmp invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v4_to_v6_icmp_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v6_to_v4 icmp invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v6_to_v4_icmp_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v4_to_v6 tcp invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v4_to_v6_tcp_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v6_to_v4 tcp invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v6_to_v4_tcp_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v4_to_v6 udp invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v4_to_v6_udp_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v6_to_v4 udp invalid uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v6_to_v4_udp_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\nMy_instance %d: v4_to_v6 udp crc0 invld uidb drop count %lld",
+ my_instance_number,
+ xlat_global_counters.v4_to_v6_udp_crc_zero_invalid_uidb_drop_count);
+
+ PLATFORM_DEBUG_PRINT("\n");
+ }
+
+
+}
+
+void spp_log_p2mp_req(spp_api_cnat_p2mp_debug_request_t *mp)
+{
+ u8 i = 0;
+ u32 num_rec = spp_net_to_host_byte_order_32(&mp->param[i++]);
+ u32 err_c_num_args;
+
+ while (num_rec--) {
+ u8 j = 0;
+ u16 err_c;
+ u16 num_args;
+ u16 total_args;
+ u32 argv[32];
+
+ err_c_num_args = spp_net_to_host_byte_order_32(&mp->param[i++]);
+ err_c = (err_c_num_args >> 16) & 0xFFFF;
+ total_args = err_c_num_args & 0xFFFF;
+
+ /* Buffer at most 32 arguments */
+ num_args = (total_args <= 32) ? total_args : 32;
+ while (j < num_args) {
+ argv[j++] = spp_net_to_host_byte_order_32(&mp->param[i++]);
+ }
+
+ /* Skip any arguments beyond the 32 we buffered, so the next
+ * record is parsed from the correct offset
+ */
+ i += (total_args > 32) ? (total_args - 32) : 0;
+ spp_printf(err_c, num_args, argv);
+ }
+}
+
+void nat64_debug_addr_pool_add_del()
+{
+ cnat_portmap_v2_t *my_pm = NULL;
+ cnat_portmap_v2_t *pm = NULL;
+ u32 len, i, pm_len;
+
+ PLATFORM_DEBUG_PRINT("\n sizeof port_map =%d\n", sizeof( cnat_portmap_v2_t));
+ len = 10;
+ PLATFORM_DEBUG_PRINT("\n adding 10 entries in vector 1-10\n ");
+ vec_add2(pm, my_pm, len);
+ pm = my_pm;
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ for(i=0;i<len;i++){
+ my_pm->ipv4_address = i+1;
+ my_pm++;
+ }
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+
+ pm_len = vec_len(pm);
+ PLATFORM_DEBUG_PRINT("\n printing vector contents : vec_len = %d \n", pm_len);
+ my_pm = pm;
+ for(i=0;i<pm_len ; i++)
+ {
+ PLATFORM_DEBUG_PRINT(" %d ,",my_pm->ipv4_address);
+ my_pm++;
+ }
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+
+ PLATFORM_DEBUG_PRINT("\n adding 5 entries in vector 11-15\n ");
+ len = 5;
+ vec_add2(pm, my_pm, len);
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ for(i=0;i<len;i++) {
+ my_pm->ipv4_address = 11+i;
+ my_pm++;
+ }
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ pm_len = vec_len(pm);
+ PLATFORM_DEBUG_PRINT("\n printing vector contents : vec_len = %d \n", pm_len);
+ my_pm = pm;
+ for(i=0;i<pm_len ; i++)
+ {
+ PLATFORM_DEBUG_PRINT(" %d ,",my_pm->ipv4_address);
+ my_pm++;
+ }
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+
+ PLATFORM_DEBUG_PRINT("\n adding 6 entries in vector 16-21\n ");
+ len = 6;
+ vec_add2(pm, my_pm, len);
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ for(i=0;i<len;i++) {
+ my_pm->ipv4_address = 16+i;
+ my_pm++;
+ }
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ pm_len = vec_len(pm);
+ PLATFORM_DEBUG_PRINT("\n printing vector contents : vec_len = %d \n", pm_len);
+ my_pm = pm;
+ for(i=0;i<pm_len ; i++)
+ {
+ PLATFORM_DEBUG_PRINT(" %d ,",my_pm->ipv4_address);
+ my_pm++;
+ }
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ PLATFORM_DEBUG_PRINT("\nDeleting 7 entries starting from entry value=8\n");
+ pm_len = vec_len(pm);
+ my_pm = pm;
+ PLATFORM_DEBUG_PRINT(" pm_len =%d\n", pm_len);
+ for(i=0;i<pm_len;i++)
+ {
+ if(my_pm->ipv4_address == 8){
+ PLATFORM_DEBUG_PRINT("\n match found brraeaking..\n");
+ break;
+ }
+ my_pm++;
+ }
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p i= %d\n", pm, my_pm, i);
+// vec_delete(pm, 7, my_pm);
+ vec_delete(pm, 7, i);
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+
+ PLATFORM_DEBUG_PRINT(" printing entries aftr deletion from 8-14\n");
+ pm_len = vec_len(pm);
+ PLATFORM_DEBUG_PRINT("\n printing vector contents : vec_len = %d \n", pm_len);
+ my_pm = pm;
+ for(i=0;i<pm_len ; i++)
+ {
+ PLATFORM_DEBUG_PRINT(" %d ,",my_pm->ipv4_address);
+ my_pm++;
+ }
+
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+
+ PLATFORM_DEBUG_PRINT("\nadding deleted items again 8-14\n");
+ len =7;
+ vec_add2(pm, my_pm, len);
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ for(i=0;i<len;i++) {
+ my_pm->ipv4_address = 8+i;
+ my_pm++;
+ }
+
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ pm_len = vec_len(pm);
+ PLATFORM_DEBUG_PRINT("\n printing vector contents : vec_len = %d \n", pm_len);
+ my_pm = pm;
+ for(i=0;i<pm_len ; i++)
+ {
+ PLATFORM_DEBUG_PRINT(" %d ,",my_pm->ipv4_address);
+ my_pm++;
+ }
+ PLATFORM_DEBUG_PRINT(" pm =%p , my_pm = %p\n", pm, my_pm);
+ PLATFORM_DEBUG_PRINT("\n");
+}
+
+
+void uidb_mapping_dump_timeout() {
+
+ u32 i;
+
+ PLATFORM_DEBUG_PRINT("\nCGSE uidb mapping table \n");
+ for(i = 0;i < 30;i++) {
+ PLATFORM_DEBUG_PRINT("%d ",*(cgse_uidb_index_cgse_id_mapping_ptr + i));
+ }
+
+}
+
+void nat64_debug_dump_info(u32 debug_value)
+{
+
+ switch(debug_value) {
+
+ case 1 :
+ bib_add_v6_entry1();
+ break;
+
+ case 2 :
+ bib_add_v6_entry2();
+ break;
+
+ case 3 :
+ bib_add_v6_entry1_new();
+ break;
+
+ case 4 :
+ bib_add_v6_entry1_new_static();
+ break;
+
+ case 5 :
+ bib_add_v6_entry3();
+ break;
+
+ case 6 :
+ bib_add_v6_entry_new2();
+ break;
+
+ case 7 :
+ nat64_fill_table_entry();
+ break;
+
+ case 10 :
+ nat64_db_dump_main();
+ break;
+
+ case 11 :
+ nat64_db_dump_user();
+ break;
+
+ case 12 :
+ nat64_db_dump_session();
+ break;
+
+ case 13 :
+ nat64_dump_table();
+ break;
+
+ case 14 :
+ bib_del_v6_entry1_static();
+ break;
+
+ case 15 :
+ nat64_debug_addr_pool_add_del();
+ break;
+
+ case 16 :
+ nat64_db_dump_timeout(0);
+ break;
+
+ case 17 :
+ uidb_mapping_dump_timeout();
+ break;
+
+ default : break;
+ }
+}
+
+
+void cnat_debug_flags_set (spp_api_cnat_p2mp_debug_request_t *mp)
+{
+ u32 debug_variable = spp_net_to_host_byte_order_32(&mp->param[0]);
+ u32 debug_value = spp_net_to_host_byte_order_32(&mp->param[1]);
+
+ cnat_key_t t_key;
+
+ switch (debug_variable) {
+
+ case CNAT_DEBUG_FLAG_UDP_INSIDE_CHECKSUM_DISABLE:
+ udp_inside_checksum_disable = debug_value;
+ PLATFORM_DEBUG_PRINT("\nudp_inside_checksum_disable set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_UDP_OUTSIDE_CHECKSUM_DISABLE:
+ udp_outside_checksum_disable = debug_value;
+ PLATFORM_DEBUG_PRINT("\nudp_outside_checksum_disable set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_UDP_OUTSIDE_PKT_DUMP_ENABLE:
+ udp_outside_packet_dump_enable = debug_value;
+ PLATFORM_DEBUG_PRINT("\nudp_outside_packet_dump_enable set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_UDP_INSIDE_PKT_DUMP_ENABLE:
+ udp_inside_packet_dump_enable = debug_value;
+ PLATFORM_DEBUG_PRINT("\nudp_inside_packet_dump_enable set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_ICMP_PKT_DUMP_ENABLE:
+ icmp_debug_flag = debug_value;
+ PLATFORM_DEBUG_PRINT("\nicmp_debug_flag set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_FRAG_PKT_DUMP_ENABLE:
+ frag_debug_flag = debug_value;
+ PLATFORM_DEBUG_PRINT("\nfrag_debug_flag set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_XLAT_CONFIG_DEBUG_ENABLE:
+ xlat_config_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nxlat_config_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_NAT64_CONFIG_DEBUG_ENABLE:
+ nat64_config_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nnat64_config_debug_level set to %d\n", debug_value);
+ nat64_debug_dump_info(debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_NAT64_DATA_PATH_DEBUG_ENABLE:
+ nat64_data_path_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nnat64_data_path_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_DSLITE_CONFIG_DEBUG_ENABLE:
+ ds_lite_config_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nds_lite_config_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_XLAT_DATA_PATH_DEBUG_ENABLE:
+ xlat_data_path_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nxlat_data_path_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_CONFIG_DEBUG_ENABLE:
+ config_debug_level = debug_value;
+
+ PLATFORM_DEBUG_PRINT("\nconfig_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_CONFIG_PPTP_ENABLE:
+ cnat_pptp_debug_flag = debug_value;
+
+ if(debug_value == 0) {
+ pptp_dump_counters();
+ }
+
+ PLATFORM_DEBUG_PRINT("\ncnat_pptp_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_CONFIG_PCP_ENABLE:
+ cnat_pcp_debug_flag = debug_value;
+
+ if(debug_value == 0) {
+ pcp_dump_counters();
+ }
+ PLATFORM_DEBUG_PRINT("\ncnat_pcp_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_GLOBAL_DEBUG_ALL_ENABLE:
+ global_debug_flag = debug_value;
+ PLATFORM_DEBUG_PRINT("\nglobal_debug_flag set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_SUMMARY_STATS_DEBUG_ENABLE:
+ summary_stats_debug_flag = debug_value;
+ PLATFORM_DEBUG_PRINT("\nsummary_stats_debug_flag set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_SHOW_DEBUG_ENABLE:
+ show_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nshow_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_TCP_LOGGING_ENABLE:
+ tcp_debug_logging_enable_disable(debug_value);
+ break;
+ case CNAT_DEBUG_FLAG_V6RD_DATA_PATH_DEBUG_ENABLE:
+ v6rd_data_path_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nv6rd_data_path_debug_level set to %d\n", debug_value);
+ break;
+ case CNAT_DEBUG_FLAG_V6RD_CONFIG_DEBUG_ENABLE:
+ v6rd_config_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nv6rd_config_debug_level set to %d\n", debug_value);
+ break;
+ case CNAT_DEBUG_FLAG_V6RD_DEFRAG_DEBUG_ENABLE:
+ /* set debug to at least 1, so that critical errors are always
+ * enabled
+ */
+ v6rd_defrag_debug_level = debug_value ? debug_value : 1;
+ PLATFORM_DEBUG_PRINT("\nv6rd_defrag_debug_level set to %d\n", debug_value);
+ break;
+
+
+ case CNAT_DEBUG_SET_STATIC_PORT_RANGE:
+ PLATFORM_DEBUG_PRINT("\nChange Static Port Range from %d --> %d\n",
+ cnat_static_port_range, debug_value);
+ cnat_static_port_range = debug_value;
+ break;
+
+ case CNAT_DEBUG_FLAG_DSLITE_DP_ENABLE:
+ PLATFORM_DEBUG_PRINT("\n Changing dslite debug flag from %d --> %d\n",
+ dslite_debug_level, debug_value);
+ dslite_debug_level = debug_value;
+ break;
+
+ case CNAT_DEBUG_FLAG_NFV9_LOGGING_DUMP_ENABLE:
+ nfv9_logging_debug_flag = debug_value;
+ PLATFORM_DEBUG_PRINT("\nnfv9_logging_debug_flag set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_SYSLOG_LOGGING_DUMP_ENABLE:
+ syslog_debug_flag = debug_value;
+ PLATFORM_DEBUG_PRINT("\nsyslog_debug_flag set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_MAPE_CONFIG_DEBUG_ENABLE:
+ mape_config_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nmape_config_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAG_MAPE_DATA_PATH_DEBUG_ENABLE:
+ mape_data_path_debug_level = debug_value;
+ PLATFORM_DEBUG_PRINT("\nmape_data_path_debug_level set to %d\n", debug_value);
+ break;
+
+ case CNAT_DEBUG_FLAGS_DUMP:
+ default:
+ {
+ PLATFORM_DEBUG_PRINT("\nCurrent values of Debug Variables\n");
+ PLATFORM_DEBUG_PRINT("\nTo modify an item chose its index and provide the value\n");
+ PLATFORM_DEBUG_PRINT("\n%d: udp_inside_checksum_disable %d\n",
+ CNAT_DEBUG_FLAG_UDP_INSIDE_CHECKSUM_DISABLE,
+ udp_inside_checksum_disable);
+ PLATFORM_DEBUG_PRINT("%d: udp_outside_checksum_disable %d\n",
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_CHECKSUM_DISABLE,
+ udp_outside_checksum_disable);
+ PLATFORM_DEBUG_PRINT("%d: udp_inside_packet_dump_enable %d\n",
+ CNAT_DEBUG_FLAG_UDP_INSIDE_PKT_DUMP_ENABLE,
+ udp_inside_packet_dump_enable);
+ PLATFORM_DEBUG_PRINT("%d: udp_outside_packet_dump_enable %d\n",
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_PKT_DUMP_ENABLE,
+ udp_outside_packet_dump_enable);
+ PLATFORM_DEBUG_PRINT("%d: icmp_debug_flag %d\n",
+ CNAT_DEBUG_FLAG_ICMP_PKT_DUMP_ENABLE,
+ icmp_debug_flag);
+ PLATFORM_DEBUG_PRINT("%d: frag_debug_flag %d\n",
+ CNAT_DEBUG_FLAG_FRAG_PKT_DUMP_ENABLE,
+ frag_debug_flag);
+ PLATFORM_DEBUG_PRINT("%d: config_debug_level %d\n",
+ CNAT_DEBUG_FLAG_CONFIG_DEBUG_ENABLE,
+ config_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: global_debug_flag %d\n",
+ CNAT_DEBUG_FLAG_GLOBAL_DEBUG_ALL_ENABLE,
+ global_debug_flag);
+ PLATFORM_DEBUG_PRINT("%d: summary_stats_debug_flag %d\n",
+ CNAT_DEBUG_FLAG_SUMMARY_STATS_DEBUG_ENABLE,
+ summary_stats_debug_flag);
+ PLATFORM_DEBUG_PRINT("%d: show_debug_level %d\n",
+ CNAT_DEBUG_FLAG_SHOW_DEBUG_ENABLE,
+ show_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: xlat_config_debug_level %d\n",
+ CNAT_DEBUG_FLAG_XLAT_CONFIG_DEBUG_ENABLE,
+ xlat_config_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: xlat_data_path_debug_level %d\n",
+ CNAT_DEBUG_FLAG_XLAT_DATA_PATH_DEBUG_ENABLE,
+ xlat_data_path_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: tcp_logging_enable_flag %d\n",
+ CNAT_DEBUG_FLAG_TCP_LOGGING_ENABLE,
+ tcp_logging_enable_flag);
+ PLATFORM_DEBUG_PRINT(" tcp_logging_enable_options DISABLE %d, ENABLE %d, PKT_DUMP %d, SUMMARY_DUMP %d\n",
+ TCP_LOGGING_DISABLE, TCP_LOGGING_ENABLE,
+ TCP_LOGGING_PACKET_DUMP, TCP_LOGGING_SUMMARY_DUMP);
+ PLATFORM_DEBUG_PRINT("%d: nfv9_logging_debug_flag %d\n",
+ CNAT_DEBUG_FLAG_NFV9_LOGGING_DUMP_ENABLE,
+ nfv9_logging_debug_flag);
+ PLATFORM_DEBUG_PRINT("%d: syslog_debug_flag %d\n",
+ CNAT_DEBUG_FLAG_SYSLOG_LOGGING_DUMP_ENABLE,
+ syslog_debug_flag);
+ PLATFORM_DEBUG_PRINT("%d: cnat_static_port_range %d\n",
+ CNAT_DEBUG_SET_STATIC_PORT_RANGE,
+ cnat_static_port_range);
+ PLATFORM_DEBUG_PRINT("%d: v6rd_data_path_debug_level %d\n",
+ CNAT_DEBUG_FLAG_V6RD_DATA_PATH_DEBUG_ENABLE,
+ v6rd_data_path_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: v6rd_config_debug_level %d\n",
+ CNAT_DEBUG_FLAG_V6RD_CONFIG_DEBUG_ENABLE,
+ v6rd_config_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: v6rd_defrag_debug_level %d\n",
+ CNAT_DEBUG_FLAG_V6RD_DEFRAG_DEBUG_ENABLE,
+ v6rd_defrag_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: nat64_stful_debug %d\n",
+ CNAT_DEBUG_FLAG_NAT64_CONFIG_DEBUG_ENABLE,
+ nat64_config_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: nat64_data_path_debug_level %d\n",
+ CNAT_DEBUG_FLAG_NAT64_DATA_PATH_DEBUG_ENABLE,
+ nat64_data_path_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: dslite_debug_level %d\n",
+ CNAT_DEBUG_FLAG_DSLITE_DP_ENABLE,
+ dslite_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: ds_lite_config_debug_level %d\n",
+ CNAT_DEBUG_FLAG_DSLITE_CONFIG_DEBUG_ENABLE,
+ ds_lite_config_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: mape_config_debug_level %d\n",
+ CNAT_DEBUG_FLAG_MAPE_CONFIG_DEBUG_ENABLE,
+ mape_config_debug_level);
+ PLATFORM_DEBUG_PRINT("%d: mape_data_path_debug_level %d\n",
+ CNAT_DEBUG_FLAG_MAPE_DATA_PATH_DEBUG_ENABLE,
+ mape_data_path_debug_level);
+ }
+ break;
+ }
+}
+
+extern void dump_cnat_frag_stats(void);
+
+void spp_api_cnat_p2mp_debug_request_t_handler
+(spp_api_cnat_p2mp_debug_request_t *mp)
+{
+ u16 command_type;
+
+/*
+ if (mp->core_num != my_instance_number) {
+ mp->rc = CNAT_NOT_THIS_CORE;
+ return;
+ }
+*/
+
+ command_type = spp_net_to_host_byte_order_16(&mp->dump_type);
+ PLATFORM_DEBUG_PRINT("-->> Core%d: Received debug msg ... cmd type: %d\n",
+ my_instance_number, command_type);
+
+ switch (command_type) {
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_POLICY:
+ PLATFORM_DEBUG_PRINT("Core%d: policy\n", my_instance_number);
+ cnat_db_dump_policy();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_MAIN_DB:
+ PLATFORM_DEBUG_PRINT("Core%d: Main db\n", my_instance_number);
+ cnat_db_dump_main();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_MAIN_DB_SUMMARY:
+ PLATFORM_DEBUG_PRINT("Core%d: Main db Summary\n", my_instance_number);
+ cnat_db_dump_main_summary();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_USER_DB:
+ PLATFORM_DEBUG_PRINT("Core%d: User db\n", my_instance_number);
+ cnat_db_dump_user();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_USER_DB_SUMMARY:
+ PLATFORM_DEBUG_PRINT("Core%d: User db Summary\n", my_instance_number);
+ cnat_db_dump_user_summary();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_HASHES_DB:
+ PLATFORM_DEBUG_PRINT("Core%d: Hashes db\n", my_instance_number);
+ cnat_db_dump_hashes();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_VRF_MAP:
+ PLATFORM_DEBUG_PRINT("Core%d: Vrf map \n", my_instance_number);
+ cnat_db_dump_portmaps();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_SUMMARY_DB:
+ PLATFORM_DEBUG_PRINT("Core%d: dump summary DB \n", my_instance_number);
+ cnat_db_summary();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_STATS:
+ PLATFORM_DEBUG_PRINT("Core%d: dump stats \n", my_instance_number);
+ spp_node_print_stats(1, NULL);
+ break;
+
+ /* Currently does same as clear node ctr, may change */
+ case CNAT_DEBUG_GENERIC_COMMAND_CLEAR_STATS:
+ PLATFORM_DEBUG_PRINT("Core%d: clear stats \n", my_instance_number);
+ spp_node_clear_stats();
+ break;
+
+ case CNAT_DEBUG_SPP_LOG:
+ PLATFORM_DEBUG_PRINT("Core%d: SPP LOG \n", my_instance_number);
+ spp_log_p2mp_req(mp);
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_NODE_COUNTER:
+ PLATFORM_DEBUG_PRINT("Core%d: NODE Counter dump \n", my_instance_number);
+ spp_node_print_counters();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_CLEAR_NODE_COUNTER:
+ PLATFORM_DEBUG_PRINT("Core%d: clear node counter \n", my_instance_number);
+ spp_node_clear_stats();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_CNAT_COUNTER:
+ PLATFORM_DEBUG_PRINT("Core%d: CNAT Counter dump \n", my_instance_number);
+ spp_node_print_cnat_counters();
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_VA:
+ PLATFORM_DEBUG_PRINT("Core%d: VA dump \n", my_instance_number);
+ {
+ int argc = 1;
+ u32 arg[2] = {spp_net_to_host_byte_order_32(&mp->param[0]), 0};
+
+ cnat_va_dump(argc, arg);
+ }
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_SHOW_CONFIG:
+ PLATFORM_DEBUG_PRINT("Core%d: Show config dump \n", my_instance_number);
+ {
+ int argc = 0;
+ unsigned long arg[3];
+
+          /* assignments intended: collect optional non-zero params */
+          if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[0]))) {
+              if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[1]))) {
+                  ;
+              } else {
+                  argc--;
+              }
+          }
+
+ cnat_show_cdb_command_v2(argc, arg);
+/*
+ xlat_show_config();
+ cnat_alg_show();
+*/
+ v6rd_show_config();
+ dslite_show_config();
+ nat64_dump_table();
+ mape_show_config();
+ }
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_SHOW_NFV9:
+ PLATFORM_DEBUG_PRINT("Core%d: NFv9 dump \n", my_instance_number);
+ #if 0 /* Currently not calling this */
+ cnat_nfv9_show_cmd();
+ #endif
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_SHOW_IVRF:
+ PLATFORM_DEBUG_PRINT("Core%d: IVRF dump \n", my_instance_number);
+ {
+ int argc = 0;
+ unsigned long arg[3];
+
+          /* assignments intended: collect optional non-zero params */
+          if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[0]))) {
+              if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[1]))) {
+                  if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[2]))) {
+                      ;
+                  } else {
+                      argc--;
+                  }
+              } else {
+                  argc--;
+              }
+          }
+
+
+ PLATFORM_DEBUG_PRINT("VRF: %d \n", spp_net_to_host_byte_order_32(&mp->param[0]));
+ PLATFORM_DEBUG_PRINT("2nd arg: %d \n",
+ spp_net_to_host_byte_order_32(&mp->param[1]));
+
+ cnat_show_ivrf_command_v2(argc, arg);
+ }
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_SHOW_OVRF:
+ PLATFORM_DEBUG_PRINT("Core%d: OVRF dump \n", my_instance_number);
+ {
+ int argc = 0;
+ unsigned long arg[3];
+          /* assignments intended: collect optional non-zero params */
+          if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[0]))) {
+              if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[1]))) {
+                  if ((arg[argc++] = spp_net_to_host_byte_order_32(&mp->param[2]))) {
+                      ;
+                  } else {
+                      argc--;
+                  }
+              } else {
+                  argc--;
+              }
+          }
+
+ PLATFORM_DEBUG_PRINT("VRF: %d \n", spp_net_to_host_byte_order_32(&mp->param[0]));
+ PLATFORM_DEBUG_PRINT("2nd arg: %d \n",
+ spp_net_to_host_byte_order_32(&mp->param[1]));
+
+ cnat_show_ovrf_command_v2(argc, arg);
+ }
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DEBUG_OPTIONS:
+ PLATFORM_DEBUG_PRINT("Core%d: Debug option dump \n", my_instance_number);
+ {
+ global_pd_dbg_lvl = 0;
+ global_pi_dbg_lvl = 0;
+ global_l2_dbg_lvl = 0;
+
+ global_pd_dbg_lvl =
+ spp_net_to_host_byte_order_32(&mp->param[0]);
+ global_pi_dbg_lvl =
+ spp_net_to_host_byte_order_32(&mp->param[1]);
+ global_l2_dbg_lvl =
+ spp_net_to_host_byte_order_32(&mp->param[2]);
+
+ PLATFORM_DEBUG_PRINT("global_pd_dbg_lvl: %d, global_pi_dbg_lvl: %d, global_l2_dbg_lvl: %d\n",
+ global_pd_dbg_lvl, global_pi_dbg_lvl, global_l2_dbg_lvl);
+ }
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DUMP_DEBUG_LEVELS:
+ PLATFORM_DEBUG_PRINT("Core%d: PD Debug level: %d \n", my_instance_number, global_pd_dbg_lvl);
+ PLATFORM_DEBUG_PRINT("Core%d: PI Debug level: %d \n", my_instance_number, global_pi_dbg_lvl);
+ PLATFORM_DEBUG_PRINT("Core%d: L2 Debug level: %d \n", my_instance_number, global_l2_dbg_lvl);
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DEBUG_FLAGS:
+ PLATFORM_DEBUG_PRINT("Core%d: Debug flags \n", my_instance_number);
+ cnat_debug_flags_set(mp);
+ break;
+
+ case CNAT_READ_TEMP_SENSORS:
+ PLATFORM_INIT_TEMP_SENSORS();
+ PLATFORM_READ_CPU_SENSORS(TEMPERATURE_SENSOR_TEST_MODE);
+ break;
+
+ case CNAT_BLOCK_OCTEON_SENSOR_READ:
+
+ PLATFORM_SET_TEMP_READ_BLOCK(temperature_read_blocked , mp);
+#ifdef TARGET_RODDICK
+ temperature_read_blocked =
+ spp_net_to_host_byte_order_32(&mp->param[0]);
+#endif
+ break;
+
+ case CNAT_DEBUG_TIMEOUT_DB_SUMMARY:
+ cnat_db_dump_timeout();
+ break;
+
+ /* This option has to be removed later */
+ case CNAT_DEBUG_SET_BULK_SIZE:
+ PLATFORM_DEBUG_PRINT("\nSetting bulk size to %d\n",
+ spp_net_to_host_byte_order_32(&mp->param[0]));
+ set_bulk_size_to_all_vrfs(
+ spp_net_to_host_byte_order_32(&mp->param[0]));
+ break;
+
+ case CNAT_DEBUG_SHOW_BULK_STAT:
+ show_bulk_port_stats();
+ break;
+
+ case CNAT_DEBUG_CLEAR_BULK_STAT:
+ clear_bulk_port_stats();
+ break;
+
+ case CNAT_DEBUG_SHOW_BULK_ALLOC:
+ {
+          u16 in_vrfid = (u16) spp_net_to_host_byte_order_32(&mp->param[0]); /* VRF ids fit in 16 bits; explicit truncation */
+ u32 inside_ip = spp_net_to_host_byte_order_32(&mp->param[1]);
+ show_bulk_port_allocation(in_vrfid, inside_ip);
+ }
+ break;
+
+ case CNAT_DEBUG_NAT44_IN2OUT_FRAG_STATS:
+ dump_cnat_frag_stats();
+ break;
+
+    default:
+        mp->rc = CNAT_ERR_INVALID_MSG_ID;
+        /* return here so the unconditional success code below
+           cannot clobber the error */
+        return;
+    }
+
+    mp->rc = CNAT_SUCCESS;
+    return;
+}
+
+
+void spp_api_cnat_v4_debug_in2out_private_addr_t_handler
+(spp_api_cnat_v4_debug_in2out_private_addr_t *mp)
+{
+ u16 i_vrf;
+ u32 debug_flag;
+ u32 start_addr, end_addr;
+
+
+ start_addr =
+ spp_net_to_host_byte_order_32(&mp->start_addr);
+ end_addr =
+ spp_net_to_host_byte_order_32(&mp->end_addr);
+ i_vrf =
+ spp_net_to_host_byte_order_16(&mp->i_vrf);
+ debug_flag =
+ spp_net_to_host_byte_order_32(&mp->debug_flag);
+
+ if ((i_vrf > MAX_UIDX) || (start_addr > end_addr) ||
+ ((debug_flag != CNAT_DEBUG_NONE) &&
+ ((debug_flag & CNAT_DEBUG_ALL) == CNAT_DEBUG_NONE))) {
+ mp->rc = CNAT_ERR_PARSER;
+ PLATFORM_DEBUG_PRINT("invalid debug ivrf 0x%x flag 0x%x "
+ "start addr 0x%x end addr 0x%x\n",
+ i_vrf, debug_flag,
+ start_addr, end_addr);
+ return;
+ }
+
+ PLATFORM_DEBUG_PRINT("debug ivrf 0x%x flag 0x%x "
+ "start addr 0x%x end addr 0x%x\n",
+ i_vrf, debug_flag,
+ start_addr, end_addr);
+
+ mp->rc = CNAT_SUCCESS;
+ debug_i_vrf = i_vrf;
+ debug_i_flag = debug_flag;
+ debug_i_addr_start = start_addr;
+ debug_i_addr_end = end_addr;
+
+}
+
+void spp_api_cnat_v4_debug_out2in_public_addr_t_handler
+(spp_api_cnat_v4_debug_out2in_public_addr_t *mp)
+{
+ u16 o_vrf;
+ u32 debug_flag;
+ u32 start_addr, end_addr;
+
+ start_addr =
+ spp_net_to_host_byte_order_32(&mp->start_addr);
+ end_addr =
+ spp_net_to_host_byte_order_32(&mp->end_addr);
+ o_vrf =
+ spp_net_to_host_byte_order_16(&mp->o_vrf);
+ debug_flag =
+ spp_net_to_host_byte_order_32(&mp->debug_flag);
+
+ if ((o_vrf > MAX_UIDX) || (start_addr > end_addr) ||
+ ((debug_flag != CNAT_DEBUG_NONE) &&
+ ((debug_flag & CNAT_DEBUG_ALL) == CNAT_DEBUG_NONE))) {
+ mp->rc = CNAT_ERR_PARSER;
+ PLATFORM_DEBUG_PRINT("invalid debug ovrf 0x%x flag 0x%x "
+ "start addr 0x%x end addr 0x%x\n",
+ o_vrf, debug_flag,
+ start_addr, end_addr);
+ return;
+ }
+
+ mp->rc = CNAT_SUCCESS;
+ debug_o_vrf = o_vrf;
+ debug_o_flag = debug_flag;
+ debug_o_addr_start = start_addr;
+ debug_o_addr_end = end_addr;
+
+ PLATFORM_DEBUG_PRINT(" o2i debug currently is not supported\n");
+}
+
+void nat64_reset_session_expiry(nat64_bib_entry_t *db)
+{
+ NAT64_STFUL_DEBUG_PRINT(3, " invoking nat64_clean_bib_db_entry\n " );
+ nat64_clean_bib_db_entry(db);
+ NAT64_STFUL_DEBUG_PRINT(3, "done with clean_bib_db_entry\n " );
+}
+
+void spp_api_nat64_clear_db_request_t_handler
+(spp_api_nat64_clear_db_request_t *mp)
+{
+ u16 port, proto, flag;
+ u32 index;
+ u32 i;
+ nat64_bib_entry_t* db;
+ nat64_v6_key_t ki;
+ nat64_table_entry_t *my_nat64_table_db_ptr;
+ u16 nat64_id;
+
+ NAT64_STFUL_FUNC_ENTER;
+ NAT64_STFUL_DEBUG_DUMP_MSG(mp);
+
+ nat64_id = spp_net_to_host_byte_order_16(&mp->nat64_id);
+ my_nat64_table_db_ptr = nat64_table_ptr + nat64_id;
+
+ port = spp_net_to_host_byte_order_16(&mp->port_num);
+ proto = mp->protocol;
+
+ ki.vrf = nat64_id;
+ ki.vrf |= ((u16)proto << CNAT_PRO_SHIFT);
+
+    for (i = 0; i < 4; i++)
+ ki.ipv6[i] = spp_net_to_host_byte_order_32(&mp->ip_addr[i]);
+
+ ki.port = port;
+
+ flag = mp->flags;
+
+ mp->rc = CNAT_SUCCESS;
+
+ NAT64_STFUL_DEBUG_PRINT(3, "\n Nat64_id = %d, port =%d, \
+ proto =%d, flags=0x%08X",\
+ nat64_id, port, proto, flag);
+
+ NAT64_STFUL_DEBUG_PRINT(3, "\n IPv6 Addr = %08X : %08X: %08X: %08X",\
+ ki.ipv6[0], ki.ipv6[1], ki.ipv6[2], ki.ipv6[3]);
+
+ if (flag == CNAT_DB_CLEAR_SPECIFIC) {
+ NAT64_STFUL_DEBUG_PRINT(3, "\n clear specific \n");
+
+ db = nat64_bib_db_lookup_entry(&ki);
+ if (db == NULL) {
+ NAT64_STFUL_DEBUG_PRINT(3, "\n clear specific - not present\n");
+ mp->rc = CNAT_NOT_FOUND_ANY;
+ return;
+ }
+
+ if( !(db->flags & CNAT_DB_NAT64_FLAG) ||
+ (db->nat64_inst_id != nat64_id))
+ return;
+
+
+ nat64_reset_session_expiry(db);
+ return;
+ }
+
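+    /*
+     * Walk every slot of the bib pool; the pool header's free bitmap
+     * marks free indices, so a clear bit means the slot holds a live
+     * entry.
+     */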
+ pool_header_t *p = pool_header(nat64_bib_db);
+
+ for(index = 0; index < vec_len(nat64_bib_db); index++) {
+
+        /* If this is a NAT44 entry, skip it here; it is handled in the NAT44 path */
+
+ if (PREDICT_FALSE(!clib_bitmap_get(p->free_bitmap, index))) {
+ db = nat64_bib_db + index;
+
+ if( !(db->flags & CNAT_DB_NAT64_FLAG) ||
+ (db->nat64_inst_id != nat64_id))
+ continue;
+
+ if (flag == CNAT_DB_CLEAR_ALL) {
+ nat64_reset_session_expiry(db);
+ continue;
+ }
+
+ if (flag & CNAT_DB_CLEAR_ADDR) {
+ if ((db->v6_in_key.ipv6[0] != ki.ipv6[0]) ||
+ (db->v6_in_key.ipv6[1] != ki.ipv6[1]) ||
+ (db->v6_in_key.ipv6[2] != ki.ipv6[2]) ||
+ (db->v6_in_key.ipv6[3] != ki.ipv6[3])){
+ NAT64_STFUL_DEBUG_PRINT(3, "\n%s:%d\n", __FUNCTION__, \
+ __LINE__ );
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_PROTO) {
+ if (((db->v6_in_key.vrf & CNAT_PRO_MASK) >> CNAT_PRO_SHIFT)
+ != proto) {
+ NAT64_STFUL_DEBUG_PRINT(3, "\n%s:%d\n", __FUNCTION__, \
+ __LINE__ );
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_PORT) {
+ if (db->v6_in_key.port != port) {
+ NAT64_STFUL_DEBUG_PRINT(3, "\n%s:%d\n", __FUNCTION__, \
+ __LINE__ );
+ continue;
+ }
+ }
+
+ NAT64_STFUL_DEBUG_PRINT(3, "\n%s:%d\n", __FUNCTION__, \
+ __LINE__ );
+            /*
+             * Entry matched all requested filters; reset its expiry
+             * so it gets cleaned up.
+             */
+ nat64_reset_session_expiry(db);
+ }
+ }
+}
+
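+/*
+ * Walk the circular per-entry session list and zero each session's
+ * entry_expires so the periodic DB scan ages the sessions out; only
+ * entries with more than one session carry such a list.
+ */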
+inline void cnat_clear_session_db(cnat_main_db_entry_t *db)
+{
+ if(PREDICT_FALSE(db->nsessions > 1)) {
+ u32 session_index = db->session_head_index;
+ cnat_session_entry_t *sdb;
+ do {
+ sdb = cnat_session_db + session_index;
+ if(PREDICT_FALSE(!sdb)) {
+ //TO DO: Debug msg?
+ break;
+ }
+ sdb->entry_expires = 0;
+ session_index = sdb->main_list.next;
+ } while(session_index != db->session_head_index
+ && db->session_head_index != EMPTY);
+ }
+ return;
+}
+
+#ifdef CGSE_DS_LITE
+extern dslite_table_entry_t dslite_table_array[];
+
+void spp_api_ds_lite_clear_db_request_t_handler
+(spp_api_ds_lite_clear_db_request_t *mp)
+{
+ u16 port, proto, flag;
+ u32 index;
+ u32 i;
+ cnat_main_db_entry_t *db;
+ cnat_user_db_entry_t *udb;
+ dslite_key_t ki;
+ dslite_table_entry_t *my_table_db_ptr;
+ u16 id;
+ u16 i_vrf;
+
+
+ id = spp_net_to_host_byte_order_16(&mp->ds_lite_id);
+ id = DS_LITE_CONFIG_TO_ARRAY_ID(id);
+
+ my_table_db_ptr = &dslite_table_array[id];
+ i_vrf = my_table_db_ptr->i_vrf;
+
+ port = spp_net_to_host_byte_order_16(&mp->port_num);
+ proto = mp->protocol;
+
+ ki.ipv4_key.k.vrf = i_vrf;
+ ki.ipv4_key.k.vrf |= ((u16)proto << CNAT_PRO_SHIFT);
+
+    for (i = 0; i < 4; i++)
+ ki.ipv6[i] = spp_net_to_host_byte_order_32(&mp->ip_addr[i]);
+
+ ki.ipv4_key.k.port = port;
+
+ flag = mp->flags;
+
+ mp->rc = CNAT_SUCCESS;
+
+ DSLITE_PRINTF(3, "\n dslite id = %d, port =%d"
+ "proto =%d, flags=0x%08X",\
+ id, port, proto, flag);
+
+ DSLITE_PRINTF(3, "\n IPv6 Addr = %08X : %08X: %08X: %08X",\
+ ki.ipv6[0], ki.ipv6[1], ki.ipv6[2], ki.ipv6[3]);
+
+ if (flag == CNAT_DB_CLEAR_SPECIFIC) {
+ DSLITE_PRINTF(3, "\n Clear specific NOT supported for DS Lite \n");
+ return;
+ }
+
+ pool_header_t *p = pool_header(cnat_main_db);
+
+ for(index = 0; index < vec_len(cnat_main_db); index++) {
+
+        /* Skip entries that are not DS-Lite */
+
+ if (PREDICT_FALSE(!clib_bitmap_get(p->free_bitmap, index))) {
+ db = cnat_main_db + index;
+
+ if( !(db->flags & CNAT_DB_DSLITE_FLAG) ||
+ ((db->in2out_key.k.vrf & CNAT_VRF_MASK) != i_vrf) ||
+ (db->flags & CNAT_DB_FLAG_STATIC_PORT)) {
+ continue;
+ }
+
+ if (flag == CNAT_DB_CLEAR_ALL) {
+
+ /*
+ * Make the entry time as very old (0), and wait
+ * for a timeout to auto-expire the entry.
+ */
+ db->entry_expires = 0;
+ /* Handle sessions as well.. */
+ cnat_clear_session_db(db);
+ continue;
+ }
+
+ if (flag & CNAT_DB_CLEAR_ADDR) {
+ udb = cnat_user_db + db->user_index;
+ if(PREDICT_FALSE(!udb)) {
+ continue;
+ }
+ if ((udb->ipv6[0] != ki.ipv6[0]) ||
+ (udb->ipv6[1] != ki.ipv6[1]) ||
+ (udb->ipv6[2] != ki.ipv6[2]) ||
+ (udb->ipv6[3] != ki.ipv6[3])) {
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_PROTO) {
+ if (((db->in2out_key.k.vrf & CNAT_PRO_MASK) >> CNAT_PRO_SHIFT)
+ != proto) {
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_PORT) {
+ if (db->in2out_key.k.port != port) {
+ continue;
+ }
+ }
+
+ /*
+ * Mark for expiry in the next round of DB scan
+ */
+ db->entry_expires = 0;
+ /* Handle sessions as well.. */
+ cnat_clear_session_db(db);
+ }
+ }
+}
+#endif /* #ifdef CGSE_DS_LITE */
+
+void spp_api_cnat_clear_db_request_t_handler
+(spp_api_cnat_clear_db_request_t *mp)
+{
+ u16 i_vrf, port, proto, flag;
+ u32 ip_addr, index;
+ u64 a,b,c;
+ cnat_main_db_entry_t * db;
+ cnat_db_key_bucket_t ki;
+
+#if defined(TARGET_LINUX_UDVR) || defined(CNAT_PG)
+ i_vrf = mp->inside_vrf;
+ ip_addr = mp->ip_addr;
+ port = mp->port_num;
+ proto = mp->protocol;
+#else
+ i_vrf = spp_net_to_host_byte_order_16(&mp->inside_vrf);
+ ip_addr = spp_net_to_host_byte_order_32(&mp->ip_addr);
+ port = spp_net_to_host_byte_order_16(&mp->port_num);
+ proto = spp_net_to_host_byte_order_16(&mp->protocol);
+#endif
+
+ ki.k.k.vrf = i_vrf;
+ ki.k.k.vrf |= ((u16)proto << CNAT_PRO_SHIFT);
+ ki.k.k.ipv4 = ip_addr;
+ ki.k.k.port = port;
+
+ flag = mp->wildcard;
+
+ mp->rc = CNAT_SUCCESS;
+
+ if (flag == CNAT_DB_CLEAR_SPECIFIC) {
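+        /* Hash the 64-bit flow key to a bucket, then walk the in2out
+           collision chain for an exact key match. */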
+ CNAT_V4_GET_HASH(ki.k.key64,
+ ki.bucket,
+ CNAT_MAIN_HASH_MASK);
+ index = cnat_in2out_hash[ki.bucket].next;
+ if (PREDICT_TRUE(index == EMPTY)) {
+ mp->rc = CNAT_NOT_FOUND_ANY;
+ return;
+ }
+
+ do {
+ db = cnat_main_db + index;
+
+ /*
+ * Delete if the db entry matches and it is not a
+ * STATIC port entry
+ */
+ if ((db->in2out_key.key64 == ki.k.key64) &&
+ !(db->flags & CNAT_DB_FLAG_STATIC_PORT) &&
+ !(db->flags & CNAT_DB_NAT64_FLAG) &&
+ !(db->flags & CNAT_DB_DSLITE_FLAG)) {
+
+ /*
+ * Make the entry time as very old (0), and wait
+ * for a timeout to auto-expire the entry.
+ */
+ db->entry_expires = 0;
+ /* Handle sessions as well.. */
+ cnat_clear_session_db(db);
+ return;
+ }
+ index = db->in2out_hash.next;
+ } while (index != EMPTY);
+
+ mp->rc = CNAT_NOT_FOUND_ANY;
+ return;
+ }
+
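+    /* equivalent to pool_header(cnat_main_db) used elsewhere; gives
+       access to the pool's free-entry bitmap */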
+ pool_header_t *p = vec_header(cnat_main_db, sizeof(pool_header_t));
+
+ for(index = 0; index < vec_len(cnat_main_db); index++) {
+
+ if (PREDICT_TRUE(!clib_bitmap_get(p->free_bitmap, index))) {
+ db = cnat_main_db + index;
+
+ if(PREDICT_FALSE(db->flags & CNAT_DB_NAT64_FLAG)) {
+ continue;
+ }
+
+ if(PREDICT_FALSE(db->flags & CNAT_DB_DSLITE_FLAG)) {
+ continue;
+ }
+
+ if (flag == CNAT_DB_CLEAR_ALL) {
+ if (!(db->flags & CNAT_DB_FLAG_STATIC_PORT)) {
+ db->entry_expires = 0;
+ /* Handle sessions as well.. */
+ cnat_clear_session_db(db);
+ }
+ continue;
+ }
+
+ if (flag & CNAT_DB_CLEAR_VRF) {
+ if (((db->in2out_key.k.vrf & CNAT_VRF_MASK) != i_vrf)) {
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_ADDR) {
+ if ((db->in2out_key.k.ipv4 != ip_addr)) {
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_PROTO) {
+ if (((db->in2out_key.k.vrf & CNAT_PRO_MASK) >> CNAT_PRO_SHIFT)
+ != proto) {
+ continue;
+ }
+ }
+
+ if (flag & CNAT_DB_CLEAR_PORT) {
+ if (db->in2out_key.k.port != port) {
+ continue;
+ }
+ }
+
+ /*
+ * Delete if the db entry matches and it is not a
+ * STATIC port entry
+ */
+ if (!(db->flags & CNAT_DB_FLAG_STATIC_PORT)) {
+ db->entry_expires = 0;
+ /* Handle sessions as well.. */
+ cnat_clear_session_db(db);
+ }
+ }
+ }
+}
+
+void
+spp_api_cnat_generic_command_debug (cnat_generic_command_resp *mp_resp)
+{
+#ifdef SHOW_DEBUG
+ u32 i, j;
+
+ i = spp_net_to_host_byte_order_32(&(mp_resp->num_bytes));
+
+ PLATFORM_DEBUG_PRINT("\nNum_Bytes %d\n", i);
+
+ for (j = 0; j < i; j++) {
+ PLATFORM_DEBUG_PRINT("0x%02X ", mp_resp->raw_data[j]);
+ if ((j % 16) == 15) {
+ PLATFORM_DEBUG_PRINT("\n");
+ }
+ }
+#endif
+}
+
+/*
+ * The following command dumps the
+ *    user-db information
+ *    port-map information
+ * for a given user source IP address
+ *
+ * The format of the output is:
+ * Word 0: Address of udb
+ * Word 1: udb->translation_list_head_index
+ * Word 2:
+ * Bytes 0..1: udb->ntranslations
+ *       Bytes 2..2: udb->icmp_msg_count
+ * Bytes 3..3: udb->unused
+ * Word 3: udb->portmap_index
+ * Word 4: udb->key.k.ipv4
+ * Word 5:
+ * Bytes 0..1: udb->key.k.port = 0
+ * Bytes 2..3: udb->key.k.vrf
+ * Word 6: udb->user_hash
+ * Word 7: Address of my_pm
+ * Word 8: my_pm->status
+ * Word 9: my_pm->inuse
+ * Word A: my_pm->delete_time
+ * Word B: my_pm->ipv4_address
+ */
+void spp_api_cnat_generic_command_user_db_pm
+(spp_api_cnat_generic_command_request_t *mp)
+{
+ u32 i;
+ cnat_db_key_bucket_t u_ki;
+ u16 my_vrfmap_index;
+ u32 *result_array;
+ cnat_generic_command_resp *mp_resp;
+ cnat_user_db_entry_t *udb;
+ cnat_user_db_entry_t *mp_udb;
+ cnat_vrfmap_t *my_vrfmap;
+ cnat_portmap_v2_t *pm;
+ cnat_portmap_v2_t *my_pm;
+
+ /*
+ * Request structure is used to send the response
+ */
+ mp_resp = (cnat_generic_command_resp *) mp;
+
+ u_ki.k.k.vrf = spp_net_to_host_byte_order_32(&mp->params[1]);
+ u_ki.k.k.ipv4 = spp_net_to_host_byte_order_32(&mp->params[2]);
+ u_ki.k.k.port = 0;
+
+ udb = cnat_user_db_lookup_entry(&u_ki);
+
+ if (!udb) {
+ mp_resp->num_bytes = spp_host_to_net_byte_order_32(0);
+ goto no_udb_found;
+ }
+
+ result_array = (u32 *) (&(mp_resp->raw_data[0]));
+
+ i = 0;
+ result_array[i++] = spp_host_to_net_byte_order_32((u32) udb);
+
+ mp_udb = (cnat_user_db_entry_t *) &(result_array[i]);
+
+ /*
+     * Reserve space for the copied UDB entry, rounded up to whole 4-byte words
+ */
+ i = i + ((sizeof(cnat_user_db_entry_t)+3)/4);
+
+ /*
+ * Fill in the UDB information
+ */
+ mp_udb->translation_list_head_index =
+ spp_host_to_net_byte_order_32(udb->translation_list_head_index);
+ mp_udb->ntranslations =
+ spp_host_to_net_byte_order_16(udb->ntranslations);
+ mp_udb->icmp_msg_count = udb->icmp_msg_count;
+ mp_udb->flags = udb->flags;
+ mp_udb->portmap_index =
+ spp_host_to_net_byte_order_32(udb->portmap_index);
+ mp_udb->key.k.ipv4 =
+ spp_host_to_net_byte_order_32(udb->key.k.ipv4);
+ mp_udb->key.k.port =
+ spp_host_to_net_byte_order_16(udb->key.k.port);
+ mp_udb->key.k.vrf =
+ spp_host_to_net_byte_order_16(udb->key.k.vrf);
+ mp_udb->user_hash.next =
+ spp_host_to_net_byte_order_32(udb->user_hash.next);
+
+ my_vrfmap_index = vrf_map_array[u_ki.k.k.vrf];
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+ pm = my_vrfmap->portmap_list;
+ my_pm = pm + udb->portmap_index;
+
+ /*
+ * Fill in the port_map information
+ */
+ result_array[i++] = spp_host_to_net_byte_order_32((u32) my_pm);
+ result_array[i++] = spp_host_to_net_byte_order_32(my_pm->inuse);
+ result_array[i++] = spp_host_to_net_byte_order_32(my_pm->delete_time);
+ result_array[i++] = spp_host_to_net_byte_order_32(my_pm->ipv4_address);
+
+ mp_resp->num_bytes = spp_host_to_net_byte_order_32(i*4);
+
+no_udb_found:
+ spp_api_cnat_generic_command_debug(mp_resp);
+}
+
+/*
+ * The following command dumps the
+ * DB usage stats for
+ * main-db
+ * user-db
+ * in2out hash
+ * out2in hash
+ *
+ * The format of the output is:
+ * Word 0: Main-DB - Total
+ * Word 1: Main-DB - Active
+ * Word 2: Main-DB - Free
+ * Word 3: User-DB - Total
+ * Word 4: User-DB - Active
+ * Word 5: User-DB - Free
+ * Word 6: Hash In2Out - Size
+ * Word 7: Hash In2Out - Used
+ * Word 8: Hash In2Out - Used Percentage
+ * Word 9: Hash Out2In - Size
+ * Word A: Hash Out2In - Used
+ * Word B: Hash Out2In - Used Percentage
+ */
+void spp_api_cnat_generic_command_db_summary
+(spp_api_cnat_generic_command_request_t *mp)
+{
+ u32 count1, count2, count3;
+ u32 i = 0;
+ u32 k = 0;
+ cnat_generic_command_resp *mp_resp;
+ u32 *result_array;
+
+ /*
+ * Request structure is used to send the response
+ */
+ mp_resp = (cnat_generic_command_resp *) mp;
+ result_array = (u32 *) (&(mp_resp->raw_data[0]));
+
+ /*
+ * Find entries free and used in main-db
+ */
+ count1 = vec_len(cnat_main_db);
+ count2 = db_free_entry(cnat_main_db);
+ count3 = count1 - count2;
+
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count1);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count3);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count2);
+
+ /*
+ * Find entries free and used in user-db
+ */
+ count1 = vec_len(cnat_user_db);
+ count2 = db_free_entry(cnat_user_db);
+ count3 = count1 - count2;
+
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count1);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count3);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count2);
+
+ /*
+ * Find entries used in in2out and out2in hash tables
+ * and percentage utilization.
+ */
+ count1 = count2 = 0;
+ for (k = 0; k < CNAT_MAIN_HASH_SIZE; k++) {
+ if(cnat_in2out_hash[k].next != ~0) count1++;
+ if(cnat_out2in_hash[k].next != ~0) count2++;
+
+ }
+
+ count3 = count1*100/CNAT_MAIN_HASH_SIZE;
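+    /* integer division: utilization percentage rounds down */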
+
+ *(result_array + i++) = spp_host_to_net_byte_order_32(CNAT_MAIN_HASH_SIZE);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count1);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count3);
+
+ count3 = count2*100/CNAT_MAIN_HASH_SIZE;
+
+ *(result_array + i++) = spp_host_to_net_byte_order_32(CNAT_MAIN_HASH_SIZE);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count2);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(count3);
+
+ mp_resp->num_bytes = spp_host_to_net_byte_order_32(i*4);
+
+ spp_api_cnat_generic_command_debug(mp_resp);
+}
+
+/*
+ * The following handler implements generic commands such as:
+ *
+ * Command 1:
+ * Reads num_bytes octets from a start_locn
+ * generic command <core_num> <cmd_type=1> <start_locn> <num_bytes> 0 0 0 0 0
+ *
+ * Command 2:
+ * Writes up to 8 octets starting at start_locn
+ * generic command <core_num> <cmd_type=2> <start_locn> <num_bytes> 0 0 0 0 0
+ *
+ * Command 3:
+ * Dump the db summary stats
+ * generic command <core_num> <cmd_type=3>
+ *
+ * Command 4:
+ * Dump the user db entry
+ * generic command <core_num> <cmd_type=4> <vrf_id> <src_ip_addr>
+ *
+ * The following structures are referenced by this command:
+ * typedef struct _spp_api_cnat_generic_command_request {
+ * u16 _spp_msg_id;
+ * u8 rc;
+ * u8 core_num;
+ * u32 params[8];
+ * } spp_api_cnat_generic_command_request_t;
+ *
+ * typedef struct {
+ * u16 spp_msg_id;
+ * u8 rc;
+ * u8 core;
+ * u32 num_bytes;
+ * u8 raw_data[0];
+ * } cnat_generic_command_resp;
+ *
+ */
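+/*
+ * Example (hypothetical values): to read 16 bytes starting at virtual
+ * address 0x80001000, a client would encode
+ *     params[0] = CNAT_DEBUG_GENERIC_COMMAND_READ_MEM
+ *     params[1] = 0x80001000   (start_locn)
+ *     params[2] = 16           (num_bytes)
+ * The handler reuses the request buffer as the response, copying the
+ * octets into raw_data[] and setting num_bytes accordingly.
+ */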
+void spp_api_cnat_generic_command_request_t_handler
+(spp_api_cnat_generic_command_request_t *mp)
+{
+ cnat_generic_command_resp *resp_ptr;
+ u32 command_type, start_locn, num_bytes;
+
+ command_type = spp_net_to_host_byte_order_32(&mp->params[0]);
+ resp_ptr = (cnat_generic_command_resp *) mp;
+
+ switch (command_type) {
+ case CNAT_DEBUG_GENERIC_COMMAND_READ_MEM:
+ start_locn = spp_net_to_host_byte_order_32(&mp->params[1]);
+ num_bytes = spp_net_to_host_byte_order_32(&mp->params[2]);
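+        /* Debug-only raw memory read: start_locn is trusted to be a
+           valid, mapped address; no bounds checking is done here. */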
+ memcpy(&(resp_ptr->raw_data[0]), (u8 *) start_locn, num_bytes);
+ resp_ptr->num_bytes = spp_host_to_net_byte_order_32(num_bytes);
+
+#ifdef SHOW_DEBUG
+ {
+ u32 i;
+
+ for (i = 0; i < num_bytes; i++) {
+ PLATFORM_DEBUG_PRINT("0x%02X ", resp_ptr->raw_data[i]);
+ if ((i % 16) == 15) {
+ PLATFORM_DEBUG_PRINT("\n");
+ }
+ }
+ }
+#endif
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_WRITE_MEM:
+ start_locn = spp_net_to_host_byte_order_32(&mp->params[1]);
+ num_bytes = spp_net_to_host_byte_order_32(&mp->params[2]);
+
+ if (num_bytes > sizeof(u64)) {
+ mp->rc = CNAT_ERR_INVALID_MSG_SIZE;
+ return;
+ }
+
+ memcpy((u8 *) start_locn, &(mp->params[3]), num_bytes);
+ resp_ptr->num_bytes = 0;
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_DB_SUMMARY:
+ spp_api_cnat_generic_command_db_summary(mp);
+ break;
+
+ case CNAT_DEBUG_GENERIC_COMMAND_USER_DB_PM:
+ spp_api_cnat_generic_command_user_db_pm(mp);
+ break;
+
+ case CNAT_DEBUG_GET_CGN_DB_SUMMARY:
+ spp_api_cnat_get_cgn_db_summary(mp);
+ break;
+
+ default:
+ mp->rc = CNAT_ERR_INVALID_MSG_ID;
+ break;
+ }
+}
+
+
+static int cnat_debug_init (void *notused)
+{
+ spp_msg_api_set_handler(SPP_API_CNAT_V4_DEBUG_DUMMY,
+ spp_api_cnat_v4_debug_dummy_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_V4_DEBUG_DUMMY_MAX,
+ spp_api_cnat_v4_debug_dummy_max_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_V4_DEBUG_GLOBAL,
+ spp_api_cnat_v4_debug_global_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_V4_DEBUG_IN2OUT_PRIVATE_ADDR,
+ spp_api_cnat_v4_debug_in2out_private_addr_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_V4_DEBUG_OUT2IN_PUBLIC_ADDR,
+ spp_api_cnat_v4_debug_out2in_public_addr_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_CLEAR_DB_REQUEST,
+ spp_api_cnat_clear_db_request_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_GENERIC_COMMAND_REQUEST,
+ spp_api_cnat_generic_command_request_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_P2MP_DEBUG_REQUEST,
+ spp_api_cnat_p2mp_debug_request_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_NAT64_CLEAR_DB_REQUEST,
+ spp_api_nat64_clear_db_request_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_DS_LITE_CLEAR_DB_REQUEST,
+ spp_api_ds_lite_clear_db_request_t_handler);
+
+ return 0;
+}
+
+/*
+************************
+* spp_api_cnat_get_cgn_db_summary
+* Reports per-core main-db and user-db usage (total, used and free entries)
+************************
+*/
+
+void spp_api_cnat_get_cgn_db_summary
+(spp_api_cnat_generic_command_request_t *mp)
+{
+ u32 total_db_entries, total_free_entries, used_entries;
+ u32 i = 0;
+ cnat_generic_command_resp *mp_resp;
+ u32 *result_array;
+
+ /*
+ * Request structure is used to send the response
+ */
+ mp_resp = (cnat_generic_command_resp *) mp;
+ result_array = (u32 *) (&(mp_resp->raw_data[0]));
+
+ /*
+ * Find entries free and used in main-db
+ */
+ total_db_entries = vec_len(cnat_main_db);
+ total_free_entries = db_free_entry(cnat_main_db);
+ used_entries = total_db_entries - total_free_entries;
+
+ *(result_array + i++) = spp_host_to_net_byte_order_32(total_db_entries);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(used_entries);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(total_free_entries);
+
+ /*
+ * Find entries free and used in user-db
+ */
+ total_db_entries = vec_len(cnat_user_db);
+ total_free_entries = db_free_entry(cnat_user_db);
+ used_entries = total_db_entries - total_free_entries;
+
+ *(result_array + i++) = spp_host_to_net_byte_order_32(total_db_entries);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(used_entries);
+ *(result_array + i++) = spp_host_to_net_byte_order_32(total_free_entries);
+
+ mp_resp->num_bytes = spp_host_to_net_byte_order_32(i*sizeof(u32));
+}
+
+SPP_INIT_FUNCTION(cnat_debug_init);
+#endif /* TOBE_PORTED */
diff --git a/vnet/vnet/vcgn/cnat_global.c b/vnet/vnet/vcgn/cnat_global.c
new file mode 100644
index 00000000000..9ab89eeaf05
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_global.c
@@ -0,0 +1,79 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_global.c - global variables
+ *
+ * Copyright (c) 2008-2009, 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+/* global variables */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "dslite_defs.h"
+#include "tcp_header_definitions.h"
+u32 cnat_current_time;
+u8 nfv9_configured = 0;
+/* ctx/sf alloc error counters */
+u32 null_enq_pkt;
+u32 null_deq_pkt;
+
+u32 null_enq_ctx;
+u32 null_deq_ctx;
+
+u32 null_enq_wqe;
+u32 null_deq_wqe;
+
+u32 ctx_alloc_errs;
+u32 sf_alloc_errs;
+
+u32 rcv_pkt_errs;
+
+/* TOBE_PORTED : Remove following once we bring DSLite */
+u32 dslite_config_debug_level = 1;
+u32 dslite_data_path_debug_level = 1;
+u32 dslite_defrag_debug_level = 1;
+u32 dslite_debug_level = 1;
+
+dslite_table_entry_t *dslite_table_db_ptr;
+
+/*
+ * ipv4_decr_ttl_n_calc_csum()
+ * - It decrements the TTL and calculates the incremental IPv4 checksum
+ */
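+/*
+ * Incremental update per RFC 1624: HC' = ~(~HC + ~m + m'), where m is
+ * the old 16-bit word containing the TTL and m' is the same word after
+ * the decrement, so the checksum is adjusted rather than recomputed
+ * over the whole header.
+ */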
+
+/* TOBE_PORTED: Following is in cnat_util.c */
+always_inline
+void ipv4_decr_ttl_n_calc_csum(ipv4_header *ipv4)
+{
+    u32 checksum;
+    u16 old;
+
+    /* TTL shares a 16-bit checksum word with the protocol field */
+    old = clib_net_to_host_u16(*(u16 *)&ipv4->ttl);
+
+    /* Decrement TTL */
+    ipv4->ttl--;
+
+    /* Calculate incremental checksum */
+    checksum = old + (~clib_net_to_host_u16(*(u16 *)&ipv4->ttl) & 0xFFFF);
+    checksum += clib_net_to_host_u16(ipv4->checksum);
+    checksum = (checksum & 0xFFFF) + (checksum >> 16);
+    ipv4->checksum = clib_host_to_net_u16(checksum + (checksum >> 16));
+}
+
diff --git a/vnet/vnet/vcgn/cnat_global.h b/vnet/vnet/vcgn/cnat_global.h
new file mode 100644
index 00000000000..823a47974d4
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_global.h
@@ -0,0 +1,87 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_global.h - global definition and variables
+ * to be used by non cnat files
+ *
+ * Copyright (c) 2007-2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_GLOBAL_H__
+#define __CNAT_GLOBAL_H__
+
+/* global variables */
+
+extern u8 cnat_db_init_done;
+extern u32 cnat_current_time;
+extern u64 in2out_drops_port_limit_exceeded;
+extern u64 in2out_drops_system_limit_reached;
+extern u64 in2out_drops_resource_depletion;
+extern u64 no_translation_entry_drops;
+extern u8 nfv9_configured;
+extern u32 translation_create_count;
+extern u32 translation_create_rate;
+
+extern u32 translation_delete_count;
+extern u32 translation_delete_rate;
+
+extern u32 in2out_forwarding_count;
+extern u32 in2out_forwarding_rate;
+
+extern u32 out2in_forwarding_count;
+extern u32 out2in_forwarding_rate;
+
+extern u32 total_address_pool_allocated;
+
+extern u32 nat44_active_translations;
+
+#if 1 //DSLITE_DEF
+extern u32 dslite_translation_create_rate;
+extern u32 dslite_translation_delete_rate;
+extern u32 dslite_translation_create_count;
+extern u32 dslite_in2out_forwarding_count;
+extern u32 dslite_out2in_forwarding_rate;
+#endif
+/* sf/ctx allocation error collection declarations */
+#define COLLECT_FREQ_FACTOR 100
+#define NUM_SECONDS_TO_WAIT 10
+#define COUNTER_BUFFER_SIZE 25
+
+extern u32 null_enq_pkt;
+extern u32 null_deq_pkt;
+
+extern u32 null_enq_ctx;
+extern u32 null_deq_ctx;
+
+extern u32 null_enq_wqe;
+extern u32 null_deq_wqe;
+
+extern u32 ctx_alloc_errs;
+extern u32 sf_alloc_errs;
+
+extern u32 rcv_pkt_errs;
+
+struct counter_array_t {
+ u32 sf_error_counter;
+ u32 ctx_error_counter;
+ u32 timestamp;
+} counter_array_t;
+
+struct counter_array_t err_cnt_arr[COUNTER_BUFFER_SIZE];
+
+//#define DISABLE_ICMP_THROTTLE_FOR_DEBUG_PURPOSE
+
+#endif /*__CNAT_GLOBAL_H__*/
diff --git a/vnet/vnet/vcgn/cnat_ipv4_icmp.h b/vnet/vnet/vcgn/cnat_ipv4_icmp.h
new file mode 100644
index 00000000000..69505a0364f
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_icmp.h
@@ -0,0 +1,51 @@
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * Filename: cnat_ipv4_icmp.h
+ *
+ * Description: common functions for icmp node
+ *
+ * Assumptions and Constraints:
+ *
+ * Copyright (c) 2000-2009, 2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *-----------------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_IPV4_ICMP_H__
+#define __CNAT_IPV4_ICMP_H__
+
+#include "tcp_header_definitions.h"
+#include "cnat_db.h"
+#include "cnat_v4_functions.h"
+#include "cnat_global.h"
+#include "cnat_config.h"
+
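+/*
+ * Convenience pointers into an ICMP error packet: the outer ICMP
+ * header, the embedded (offending) IPv4 header, and that packet's
+ * L4 port and checksum fields.
+ */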
+typedef struct {
+ icmp_v4_t *icmp;
+ ipv4_header *em_ip;
+ u16 *em_port;
+ u16 *em_l4_checksum;
+} icmp_em_ip_info;
+
+inline void swap_ip_src_icmp_id(ipv4_header *ip,
+ icmp_v4_t *icmp,
+ cnat_main_db_entry_t *db,
+ u16 vrf);
+
+inline void swap_ip_dst_icmp_id(ipv4_header *ip,
+ icmp_v4_t *icmp,
+ cnat_main_db_entry_t *db,
+ u16 vrf);
+
+#endif /* __CNAT_IPV4_ICMP_H__ */
diff --git a/vnet/vnet/vcgn/cnat_ipv4_icmp_error_inside_input.c b/vnet/vnet/vcgn/cnat_ipv4_icmp_error_inside_input.c
new file mode 100644
index 00000000000..218d7e538fa
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_icmp_error_inside_input.c
@@ -0,0 +1,476 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_icmp_error_inside_input.c - cnat_ipv4_icmp_error_inside_input node pipeline stage functions
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_ipv4_icmp.h"
+
+#define foreach_cnat_ipv4_icmp_e_inside_input_error \
+_(CNAT_V4_ICMP_E_I2O_T_PKT, "cnat v4 icmp_e i2o packet transmit") \
+_(CNAT_V4_ICMP_E_I2O_D_PKT, "cnat v4 icmp_e i2o packet drop") \
+_(CNAT_V4_ICMP_E_I2O_TTL_DROP, "cnat v4 icmp_e i2o ttl drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_icmp_e_inside_input_error
+#undef _
+ CNAT_IPV4_ICMP_E_INSIDE_INPUT_N_ERROR,
+} cnat_ipv4_icmp_e_inside_input_t;
+
+static char * cnat_ipv4_icmp_e_inside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_icmp_e_inside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_icmp_e_inside_input_main_t;
+
+typedef enum {
+ CNAT_V4_ICMP_E_I2O_T,
+ CNAT_V4_ICMP_E_I2O_D,
+ CNAT_V4_ICMP_E_I2O_NEXT,
+} cnat_ipv4_icmp_e_inside_input_next_t;
+
+cnat_ipv4_icmp_e_inside_input_main_t cnat_ipv4_icmp_e_inside_input_main;
+vlib_node_registration_t cnat_ipv4_icmp_e_inside_input_node;
+
+#define NSTAGES 5
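+/*
+ * Five-stage software pipeline (dispatched via <vnet/pipeline.h>):
+ * stage0 prefetches buffer metadata and packet data, stage1 builds the
+ * flow key and prefetches the hash bucket, stage2 prefetches the
+ * candidate DB entry, stage3 walks the collision chain, and last_stage
+ * rewrites the packet and picks the disposition.
+ */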
+
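+/*
+ * For an ICMP error sent by an inside host: rewrite the outer source
+ * address and the embedded packet's destination address/port from the
+ * private (in2out) values to the public (out2in) mapping, updating the
+ * inner/outer L3 checksums and the ICMP checksum incrementally.
+ */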
+inline void swap_ip_src_emip_dst(ipv4_header *ip,
+ icmp_em_ip_info *icmp_info,
+ cnat_main_db_entry_t *db, u16 vrf)
+{
+ icmp_v4_t *icmp;
+ ipv4_header *em_ip;
+ u16 *em_port;
+ u32 old_ip;
+ u16 old_port;
+ u16 old_ip_checksum;
+
+ /*
+     * declare checksum scratch variables
+ */
+ CNAT_UPDATE_L3_CHECKSUM_DECLARE
+ CNAT_UPDATE_ICMP_ERR_CHECKSUM_DECLARE
+
+ /*
+ * fix inner layer ip & l4 checksum
+ */
+ em_ip = icmp_info->em_ip;
+ em_port = icmp_info->em_port;
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)),
+ (clib_net_to_host_u16(em_ip->checksum)),
+ ((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)))
+
+ old_ip = clib_net_to_host_u32(em_ip->dest_addr);
+ old_port = clib_net_to_host_u16(*em_port);
+ old_ip_checksum = clib_net_to_host_u16(em_ip->checksum);
+
+ em_ip->dest_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+ em_ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+ *em_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+
+ /*
+     * fix outer layer ip & icmp checksum
+ */
+ icmp = icmp_info->icmp;
+ CNAT_UPDATE_ICMP_ERR_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (old_port),
+ (old_ip_checksum),
+ (clib_net_to_host_u16(icmp->checksum)),
+ ((u16)(db->out2in_key.k.ipv4 & 0xffff)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ ((u16)(db->out2in_key.k.port)),
+ ((u16)(new_l3_c)))
+
+ icmp->checksum =
+ clib_host_to_net_u16(new_icmp_c);
+
+ old_ip = clib_net_to_host_u32(ip->src_addr);
+
+ ip->src_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (clib_net_to_host_u16(ip->checksum)),
+ ((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)))
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+#if 0
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ /*
+ * fix inner layer ip & l4 checksum
+ */
+ em_snat_ip = icmp_info->em_ip;
+ em_snat_port = icmp_info->em_port;
+
+ old_ip = spp_net_to_host_byte_order_32(&(em_snat_ip->src_addr));
+ old_port = spp_net_to_host_byte_order_16(em_snat_port);
+ old_ip_checksum = spp_net_to_host_byte_order_16(&(em_snat_ip->checksum));
+ direction = 0;
+ if(cnat_static_dest_db_get_translation(em_snat_ip->src_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(em_snat_ip->checksum))),
+ ((u16)(old_postmap_ip)),
+ ((u16)(old_postmap_ip >> 16)))
+ em_snat_ip->src_addr = postmap_ip;
+ em_snat_ip->checksum =
+ spp_host_to_net_byte_order_16(new_l3_c);
+
+ /*
+             * fix outer layer ip & icmp checksum
+ */
+ icmp = icmp_info->icmp;
+ CNAT_UPDATE_ICMP_ERR_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (old_port),
+ (old_ip_checksum),
+ (spp_net_to_host_byte_order_16(&(icmp->checksum))),
+ ((u16)(old_postmap_ip & 0xffff)),
+ ((u16)(old_postmap_ip >> 16)),
+ ((u16)(old_port)),
+ ((u16)(new_l3_c)))
+
+ icmp->checksum =
+ spp_host_to_net_byte_order_16(new_icmp_c);
+
+ }
+ }
+
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ direction = 0;
+ if(cnat_static_dest_db_get_translation(ip->dest_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+
+ old_ip = spp_net_to_host_byte_order_32(&(ip->dest_addr));
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(ip->checksum))),
+ ((u16)(old_postmap_ip & 0xFFFF)),
+ ((u16)(old_postmap_ip >> 16)))
+ ip->dest_addr = postmap_ip;
+
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+ }
+ }
+#endif /* if 0 */
+
+}
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <api/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ ipv4_header *em_ip = (ipv4_header*)((u8*)icmp + 8); /* embedded pkt's v4 hdr */
+ u8 em_ip_hdr_len = (em_ip->version_hdr_len_words & 0xf) << 2;
+
+ u64 tmp = 0;
+ u32 protocol = CNAT_ICMP;
+
+ /* Check L4 header for embedded packet */
+ if (em_ip->protocol == TCP_PROT) {
+ tcp_hdr_type *tcp = (tcp_hdr_type*)((u8 *)em_ip + em_ip_hdr_len);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16(tcp->dest_port);
+ protocol = CNAT_TCP;
+
+ } else if (em_ip->protocol == UDP_PROT) {
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8 *)em_ip + em_ip_hdr_len);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16(udp->dest_port);
+ protocol = CNAT_UDP;
+
+ } else {
+ icmp_v4_t *icmp = (icmp_v4_t*)((u8 *)em_ip + em_ip_hdr_len);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16(icmp->identifier);
+
+ if (PREDICT_FALSE((icmp->type != ICMPV4_ECHOREPLY) &&
+ (icmp->type != ICMPV4_ECHO))) {
+ /*
+             * Set an invalid protocol for these cases so the hash
+             * lookup does not return a valid main_db entry; this keeps
+             * the common valid-protocol path fast by avoiding one more
+             * check in stage3
+ */
+ protocol = CNAT_INVALID_PROTO;
+ }
+ }
+
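+    /* Pack the 64-bit flow key: embedded dest IPv4 in bits 0..31,
+       L4 port / ICMP id in bits 32..47, VRF+protocol in bits 48..63. */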
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(em_ip->dest_addr);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ protocol)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_in2out_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_in2out_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, LOAD);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+ }
+}
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->in2out_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->in2out_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ int disposition = CNAT_V4_ICMP_E_I2O_T;
+ int counter = CNAT_V4_ICMP_E_I2O_T_PKT;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ ipv4_header *em_ip = (ipv4_header*)((u8*)icmp + 8); /* embedded pkt's v4 hdr */
+ u8 em_ip_hdr_len = (em_ip->version_hdr_len_words & 0xf) << 2;
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_icmp_e_inside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_main_db_entry_t *db = NULL;
+ icmp_em_ip_info icmp_info;
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ if (PREDICT_FALSE(ip->ttl <= 1)) {
+ /*
+             * This is an ICMP error packet with TTL <= 1, so drop
+             * it (no need to generate another ICMP error).
+ */
+
+ disposition = CNAT_V4_ICMP_E_I2O_D;
+ counter = CNAT_V4_ICMP_E_I2O_TTL_DROP;
+
+ goto drop_pkt;
+ }
+ }
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ icmp_info.em_ip = em_ip;
+ icmp_info.icmp = icmp;
+ //icmp_info.em_port = vnet_buffer(b0)->vcgn_uii.key.k.port;
+
+        /* Note: this could have been done in stage1 itself, but that
+         * would require adding a u16 * to vnet_buffer_opaque_t. Since
+         * this flow is expected to be rare in actual deployments, we
+         * can afford to do these steps here. To be confirmed during
+         * code review. */
+ if (em_ip->protocol == TCP_PROT) {
+ tcp_hdr_type *tcp = (tcp_hdr_type*)((u8 *)em_ip + em_ip_hdr_len);
+ icmp_info.em_port = &(tcp->dest_port);
+ } else if (em_ip->protocol == UDP_PROT) {
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)
+ ((u8 *)em_ip + em_ip_hdr_len);
+ icmp_info.em_port = &(udp->dest_port);
+ } else {
+ icmp_v4_t *icmp_inner = (icmp_v4_t*)((u8 *)em_ip + em_ip_hdr_len);
+ icmp_info.em_port = &(icmp_inner->identifier);
+ }
+
+ db = cnat_main_db + db_index;
+ /*
+ * 1. update dst addr:dst port of embedded ip pkt
+ * update src addr of icmp pkt
+ * 2. fix udp/tcp/ip checksum of embedded pkt
+         *    fix icmp and ip checksums of icmp pkt;
+         *    no need to update the timer
+ */
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT BEFORE\n");
+ print_icmp_pkt(ip);
+ }
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ swap_ip_src_emip_dst(ip, &icmp_info,
+ db, db->in2out_key.k.vrf);
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT AFTER\n");
+ print_icmp_pkt(ip);
+ }
+ in2out_forwarding_count++;
+
+ } else {
+ disposition = CNAT_V4_ICMP_E_I2O_D;
+ counter = CNAT_V4_ICMP_E_I2O_D_PKT;
+ }
+
+drop_pkt:
+
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_icmp_e_inside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_icmp_e_inside_input_node) = {
+ .function = cnat_ipv4_icmp_e_inside_input_node_fn,
+ .name = "vcgn-v4-icmp-e-i2o",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_icmp_e_inside_input_error_strings),
+ .error_strings = cnat_ipv4_icmp_e_inside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_ICMP_E_I2O_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_ICMP_E_I2O_T] = "ip4-input",
+ [CNAT_V4_ICMP_E_I2O_D] = "error-drop",
+ },
+};
+
+clib_error_t *cnat_ipv4_icmp_e_inside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_icmp_e_inside_input_main_t * mp = &cnat_ipv4_icmp_e_inside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_icmp_e_inside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_icmp_error_outside_input.c b/vnet/vnet/vcgn/cnat_ipv4_icmp_error_outside_input.c
new file mode 100644
index 00000000000..f25f4d022c7
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_icmp_error_outside_input.c
@@ -0,0 +1,452 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_icmp_error_outside_input.c - cnat_ipv4_icmp_error_outside_input node pipeline stage functions
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_ipv4_icmp.h"
+
+#define foreach_cnat_ipv4_icmp_e_outside_input_error \
+_(CNAT_V4_ICMP_E_O2I_T_PKT, "cnat v4 icmp_e o2i packet transmit") \
+_(CNAT_V4_ICMP_E_O2I_D_PKT, "cnat v4 icmp_e o2i packet drop") \
+_(CNAT_V4_ICMP_E_O2I_TTL_DROP, "cnat v4 icmp_e o2i ttl drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_icmp_e_outside_input_error
+#undef _
+ CNAT_IPV4_ICMP_E_OUTSIDE_INPUT_N_ERROR,
+} cnat_ipv4_icmp_e_outside_input_t;
+
+static char * cnat_ipv4_icmp_e_outside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_icmp_e_outside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_icmp_e_outside_input_main_t;
+
+typedef enum {
+ CNAT_V4_ICMP_E_O2I_T,
+ CNAT_V4_ICMP_E_O2I_D,
+ CNAT_V4_ICMP_E_O2I_NEXT,
+} cnat_ipv4_icmp_e_outside_input_next_t;
+
+cnat_ipv4_icmp_e_outside_input_main_t cnat_ipv4_icmp_e_outside_input_main;
+vlib_node_registration_t cnat_ipv4_icmp_e_outside_input_node;
+
+#define NSTAGES 5
+
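+/*
+ * Outside-to-inside counterpart of swap_ip_src_emip_dst(): rewrite the
+ * outer destination address and the embedded packet's source
+ * address/port from the public (out2in) values back to the private
+ * (in2out) mapping, with the same incremental checksum fixes.
+ */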
+inline void swap_ip_dst_emip_src(ipv4_header *ip,
+ icmp_em_ip_info *icmp_info,
+ cnat_main_db_entry_t *db, u16 vrf)
+{
+ icmp_v4_t *icmp;
+ ipv4_header *em_ip;
+ u16 *em_port;
+ u32 old_ip;
+ u16 old_port;
+ u16 old_ip_checksum;
+
+ /*
+     * declare checksum scratch variables
+ */
+ CNAT_UPDATE_L3_CHECKSUM_DECLARE
+ CNAT_UPDATE_ICMP_ERR_CHECKSUM_DECLARE
+
+ /*
+ * fix inner layer ip & l4 checksum
+ */
+ em_ip = icmp_info->em_ip;
+ em_port = icmp_info->em_port;
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ (clib_net_to_host_u16(em_ip->checksum)),
+ ((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)))
+
+ old_ip = clib_net_to_host_u32(em_ip->src_addr);
+ old_port = clib_net_to_host_u16(*em_port);
+ old_ip_checksum = clib_net_to_host_u16(em_ip->checksum);
+
+ em_ip->src_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ em_ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+ *em_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ /*
+     * fix outer layer ip & icmp checksum
+ */
+ icmp = icmp_info->icmp;
+ CNAT_UPDATE_ICMP_ERR_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (old_port),
+ (old_ip_checksum),
+ (clib_net_to_host_u16(icmp->checksum)),
+ ((u16)(db->in2out_key.k.ipv4 & 0xffff)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)),
+ ((u16)(db->in2out_key.k.port)),
+ ((u16)(new_l3_c)))
+
+ icmp->checksum =
+ clib_host_to_net_u16(new_icmp_c);
+
+ old_ip = clib_net_to_host_u32(ip->dest_addr);
+
+ ip->dest_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (clib_net_to_host_u16(ip->checksum)),
+ ((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)))
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+#if 0
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ /*
+ * fix inner layer ip & l4 checksum
+ */
+ em_snat_ip = icmp_info->em_ip;
+ em_snat_port = icmp_info->em_port;
+
+ old_ip = spp_net_to_host_byte_order_32(&(em_snat_ip->dest_addr));
+ old_port = spp_net_to_host_byte_order_16(em_snat_port);
+ old_ip_checksum = spp_net_to_host_byte_order_16(&(em_snat_ip->checksum));
+ direction = 1;
+ if(cnat_static_dest_db_get_translation(em_snat_ip->dest_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(em_snat_ip->checksum))),
+ ((u16)(old_postmap_ip)),
+ ((u16)(old_postmap_ip >> 16)))
+ em_snat_ip->dest_addr = postmap_ip;
+ em_snat_ip->checksum =
+ spp_host_to_net_byte_order_16(new_l3_c);
+
+ /*
+             * fix outer layer ip & icmp checksum
+ */
+ icmp = icmp_info->icmp;
+ CNAT_UPDATE_ICMP_ERR_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (old_port),
+ (old_ip_checksum),
+ (spp_net_to_host_byte_order_16(&(icmp->checksum))),
+ ((u16)(old_postmap_ip & 0xffff)),
+ ((u16)(old_postmap_ip >> 16)),
+ ((u16)(old_port)),
+ ((u16)(new_l3_c)))
+
+ icmp->checksum =
+ spp_host_to_net_byte_order_16(new_icmp_c);
+
+ }
+ }
+
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ direction = 1;
+ if(cnat_static_dest_db_get_translation(ip->src_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+ CNAT_UPDATE_L3_CHECKSUM_DECLARE
+
+ old_ip = spp_net_to_host_byte_order_32(&(ip->src_addr));
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(ip->checksum))),
+ ((u16)(old_postmap_ip & 0xFFFF)),
+ ((u16)(old_postmap_ip >> 16)))
+ ip->checksum =
+ spp_host_to_net_byte_order_16(new_l3_c);
+ ip->src_addr = postmap_ip;
+ }
+ }
+#endif /* if 0 */
+}
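+
+/*
+ * For reference: the CNAT_UPDATE_*_CHECKSUM macros used above implement
+ * the standard incremental Internet-checksum update of RFC 1624. A
+ * minimal sketch of that arithmetic for a single 16-bit field follows;
+ * the helper is illustrative only and not part of this node.
+ */
+#if 0
+static inline u16 csum_update_u16 (u16 old_csum, u16 old_field, u16 new_field)
+{
+    /* RFC 1624 eqn. 3: HC' = ~(~HC + ~m + m'), with end-around carry */
+    u32 sum = (u16) ~old_csum + (u16) ~old_field + new_field;
+    sum = (sum & 0xffff) + (sum >> 16);  /* fold the carries back in */
+    sum = (sum & 0xffff) + (sum >> 16);
+    return (u16) ~sum;
+}
+#endif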
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ ipv4_header *em_ip = (ipv4_header*)((u8*)icmp + 8); /* embedded pkt's v4 hdr */
+ u8 em_ip_hdr_len = (em_ip->version_hdr_len_words & 0xf) << 2;
+
+ u64 tmp = 0;
+ u32 protocol = CNAT_ICMP;
+
+ /* Check L4 header for embedded packet */
+ if (em_ip->protocol == TCP_PROT) {
+ tcp_hdr_type *tcp = (tcp_hdr_type*)((u8 *)em_ip + em_ip_hdr_len);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16(tcp->src_port);
+ protocol = CNAT_TCP;
+
+ } else if (em_ip->protocol == UDP_PROT) {
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8 *)em_ip + em_ip_hdr_len);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16(udp->src_port);
+ protocol = CNAT_UDP;
+
+ } else {
+ icmp_v4_t *icmp = (icmp_v4_t*)((u8 *)em_ip + em_ip_hdr_len);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16(icmp->identifier);
+
+ if (PREDICT_FALSE((icmp->type != ICMPV4_ECHOREPLY) &&
+ (icmp->type != ICMPV4_ECHO))) {
+ /*
+ * Set an invalid protocol for these cases so that the hash
+ * lookup cannot return a valid main_db entry. This also keeps
+ * the regular, valid-protocol cases fast, since it avoids one
+ * more check in stage3.
+ */
+ protocol = CNAT_INVALID_PROTO;
+ }
+ }
+
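+ /*
+ * For ICMP error packets the lookup key is built from the embedded
+ * packet's source address/port: on this outside-to-inside path those
+ * fields hold the translated (outside) values, which is what the
+ * out2in hash is keyed on.
+ */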
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(em_ip->src_addr);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ protocol)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_out2in_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
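+
+/*
+ * Layout of the 64-bit key assembled above, as compared against
+ * db->out2in_key.key64 in stage3:
+ *
+ *   bits  0..31  IPv4 address (the embedded packet's source here)
+ *   bits 32..47  L4 port / ICMP identifier
+ *   bits 48..63  VRF id, set by PLATFORM_CNAT_SET_RX_VRF (which is also
+ *                passed the protocol)
+ */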
+
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_out2in_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, LOAD);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+ }
+}
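+
+/*
+ * Note the field reuse: vcgn_uii.bucket holds the hash-bucket index after
+ * stage1, then from stage2 onward it is overwritten with the candidate
+ * main-db index (or EMPTY); stage3 and last_stage read it as a db index.
+ */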
+
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->out2in_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->out2in_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ int disposition = CNAT_V4_ICMP_E_O2I_T;
+ int counter = CNAT_V4_ICMP_E_O2I_T_PKT;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ ipv4_header *em_ip = (ipv4_header*)((u8*)icmp + 8); /* embedded pkt's v4 hdr */
+ u8 em_ip_hdr_len = (em_ip->version_hdr_len_words & 0xf) << 2;
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_icmp_e_outside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_main_db_entry_t *db = NULL;
+ icmp_em_ip_info icmp_info;
+
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ icmp_info.em_ip = em_ip;
+ icmp_info.icmp = icmp;
+
+ /* Note: this could have been done in stage1 itself, but that
+ * would require adding a u16 * to vnet_buffer_opaque_t. Since
+ * this flow is expected to be very rare in actual deployments,
+ * we can afford to do these steps here instead. Let's confirm
+ * during code review. */
+
+ if (em_ip->protocol == TCP_PROT) {
+ tcp_hdr_type *tcp = (tcp_hdr_type*)((u8 *)em_ip + em_ip_hdr_len);
+ icmp_info.em_port = &(tcp->src_port);
+ } else if (em_ip->protocol == UDP_PROT) {
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)
+ ((u8 *)em_ip + em_ip_hdr_len);
+ icmp_info.em_port = &(udp->src_port);
+ } else {
+ icmp_v4_t *icmp_inner = (icmp_v4_t*)((u8 *)em_ip + em_ip_hdr_len);
+ icmp_info.em_port = &(icmp_inner->identifier);
+ }
+
+ db = cnat_main_db + db_index;
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT BEFORE\n");
+ print_icmp_pkt(ip);
+ }
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ swap_ip_dst_emip_src(ip, &icmp_info,
+ db, db->in2out_key.k.vrf);
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT AFTER\n");
+ print_icmp_pkt(ip);
+ }
+
+ } else {
+ disposition = CNAT_V4_ICMP_E_O2I_D;
+ counter = CNAT_V4_ICMP_E_O2I_D_PKT;
+ }
+
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_icmp_e_outside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_icmp_e_outside_input_node) = {
+ .function = cnat_ipv4_icmp_e_outside_input_node_fn,
+ .name = "vcgn-v4-icmp-e-o2i",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_icmp_e_outside_input_error_strings),
+ .error_strings = cnat_ipv4_icmp_e_outside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_ICMP_E_O2I_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_ICMP_E_O2I_T] = "ip4-input",
+ [CNAT_V4_ICMP_E_O2I_D] = "error-drop",
+ },
+};
+
+clib_error_t *cnat_ipv4_icmp_e_outside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_icmp_e_outside_input_main_t * mp = &cnat_ipv4_icmp_e_outside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_icmp_e_outside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input.c b/vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input.c
new file mode 100644
index 00000000000..1b9f0266d71
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input.c
@@ -0,0 +1,404 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_icmp_query_inside_input.c - cnat_ipv4_icmp_query_inside_input node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_ipv4_icmp.h"
+
+#define foreach_cnat_ipv4_icmp_q_inside_input_error \
+_(CNAT_V4_ICMP_Q_I2O_T_PKT, "cnat v4 icmp_q i2o packet transmit") \
+_(CNAT_V4_ICMP_Q_I2O_MISS_PKT, "cnat v4 icmp_q i2o db miss") \
+_(CNAT_V4_ICMP_Q_I2O_TTL_GEN, "cnat v4 icmp_q i2o ttl generate") \
+_(CNAT_V4_ICMP_Q_I2O_TTL_DROP, "cnat v4 icmp_q i2o ttl drop") \
+_(CNAT_V4_ICMP_Q_I2O_NO_SESSION_DROP, "cnat v4 icmp_q i2o no session drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_icmp_q_inside_input_error
+#undef _
+ CNAT_IPV4_ICMP_Q_INSIDE_INPUT_N_ERROR,
+} cnat_ipv4_icmp_q_inside_input_t;
+
+static char * cnat_ipv4_icmp_q_inside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_icmp_q_inside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_icmp_q_inside_input_main_t;
+
+typedef enum {
+ CNAT_V4_ICMP_Q_I2O_T,
+ CNAT_V4_ICMP_Q_I2O_E,
+ CNAT_V4_ICMP_Q_I2O_D,
+ CNAT_V4_ICMP_Q_I2O_NEXT,
+} cnat_ipv4_icmp_q_inside_input_next_t;
+
+cnat_ipv4_icmp_q_inside_input_main_t cnat_ipv4_icmp_q_inside_input_main;
+vlib_node_registration_t cnat_ipv4_icmp_q_inside_input_node;
+
+#define NSTAGES 5
+
+inline void swap_ip_src_icmp_id(ipv4_header *ip,
+ icmp_v4_t *icmp,
+ cnat_main_db_entry_t *db, u16 vrf)
+{
+#if 0
+ u32 postmap_ip;
+ u8 direction;
+ u32 old_ip;
+ u32 old_postmap_ip;
+
+
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ direction = 0;
+ if(cnat_static_dest_db_get_translation(ip->dest_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+ CNAT_UPDATE_L3_CHECKSUM_DECLARE
+
+ old_ip = spp_net_to_host_byte_order_32(&(ip->dest_addr));
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(ip->checksum))),
+ ((u16)(old_postmap_ip & 0xFFFF)),
+ ((u16)(old_postmap_ip >> 16)))
+ ip->dest_addr = postmap_ip;
+
+ ip->checksum =
+ spp_host_to_net_byte_order_16(new_l3_c);
+ }
+ }
+#endif /* if 0 */
+ /*
+ * declare variable
+ */
+ CNAT_UPDATE_L3_L4_CHECKSUM_DECLARE
+ /*
+ * calculate checksum
+ */
+ CNAT_UPDATE_L3_ICMP_CHECKSUM(((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)),
+ (db->in2out_key.k.port),
+ (clib_net_to_host_u16(ip->checksum)),
+ (clib_net_to_host_u16(icmp->checksum)),
+ ((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ (db->out2in_key.k.port))
+ //set ip header
+ ip->src_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+ //set icmp header
+ icmp->identifier =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+ icmp->checksum =
+ clib_host_to_net_u16(new_l4_c);
+}
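+
+/*
+ * Aside: unlike TCP/UDP, the ICMP checksum covers no pseudo-header, so
+ * the source-address rewrite above perturbs only the L3 checksum; the
+ * ICMP checksum changes solely because the identifier is rewritten.
+ */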
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+
+ u64 tmp = 0;
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(ip->src_addr);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16 (icmp->identifier);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_ICMP)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_in2out_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_in2out_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, LOAD);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+ }
+}
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->in2out_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->in2out_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ spp_ctx_t *ctx = (spp_ctx_t *) &vnet_buffer(b0)->vcgn_uii;
+ int disposition = CNAT_V4_ICMP_Q_I2O_T;
+ int counter = CNAT_V4_ICMP_Q_I2O_T_PKT;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_icmp_q_inside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_session_entry_t *session_db = NULL;
+ cnat_main_db_entry_t *db = NULL;
+ cnat_key_t dest_info;
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ if (PREDICT_FALSE(ip->ttl <= 1)) {
+ /* Try to generate ICMP error msg, as TTL is <= 1 */
+
+ if (icmpv4_generate_with_throttling
+ (ctx, ip, ctx->ru.rx.uidb_index)) {
+
+ /* Generated ICMP */
+ disposition = CNAT_V4_ICMP_Q_I2O_T;
+ counter = CNAT_V4_ICMP_Q_I2O_TTL_GEN;
+ } else {
+ /* Could not generate ICMP - drop the packet */
+ disposition = CNAT_V4_ICMP_Q_I2O_D;
+ counter = CNAT_V4_ICMP_Q_I2O_TTL_DROP;
+ }
+ goto drop_pkt;
+ }
+ }
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ db = cnat_main_db + db_index;
+ dest_info.k.port = 0;
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->dest_addr);
+
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ goto update_pkt;
+ }
+
+ if(PREDICT_FALSE(db->dst_ipv4 != dest_info.k.ipv4)) {
+ if(PREDICT_TRUE(db->nsessions == 1)) {
+ /* Handle the one-to-two destination scenario */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_ICMP_Q_I2O_D;
+ counter = CNAT_V4_ICMP_Q_I2O_NO_SESSION_DROP;
+ goto drop_pkt;
+ }
+ } else if (PREDICT_FALSE(db->nsessions == 0)) {
+ /* This should be a static entry.
+ * Record this session as the first session and log it.
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ } else { /* Many translations exist already */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ /* If the session already exists,
+ * cnat_create_session_db_entry returns the existing db entry;
+ * otherwise it creates a new one.
+ * If it cannot create one, it returns NULL.
+ */
+ session_db = cnat_create_session_db_entry(&dest_info,
+ db, TRUE);
+
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_ICMP_Q_I2O_D;
+ counter = CNAT_V4_ICMP_Q_I2O_NO_SESSION_DROP;
+ goto drop_pkt;
+ }
+ }
+ }
+
+update_pkt:
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT BEFORE\n");
+ print_icmp_pkt(ip);
+ }
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ /*
+ * 1. update src ipv4 addr and src icmp identifier
+ * 2. update ipv4 checksum and icmp checksum
+ */
+ swap_ip_src_icmp_id(ip, icmp, db, db->in2out_key.k.vrf);
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT AFTER\n");
+ print_icmp_pkt(ip);
+ }
+
+ /*
+ * update db counter, timer
+ */
+
+ if(PREDICT_FALSE(session_db != 0)) {
+ CNAT_DB_TIMEOUT_RST(session_db);
+ } else {
+ CNAT_DB_TIMEOUT_RST(db);
+ }
+ db->in2out_pkts++;
+ in2out_forwarding_count++;
+
+ } else {
+ disposition = CNAT_V4_ICMP_Q_I2O_E;
+ counter = CNAT_V4_ICMP_Q_I2O_MISS_PKT;
+ }
+
+drop_pkt:
+
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_icmp_q_inside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_icmp_q_inside_input_node) = {
+ .function = cnat_ipv4_icmp_q_inside_input_node_fn,
+ .name = "vcgn-v4-icmp-q-i2o",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_icmp_q_inside_input_error_strings),
+ .error_strings = cnat_ipv4_icmp_q_inside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_ICMP_Q_I2O_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_ICMP_Q_I2O_E] = "vcgn-v4-icmp-q-i2o-e",
+ [CNAT_V4_ICMP_Q_I2O_T] = "ip4-input",
+ [CNAT_V4_ICMP_Q_I2O_D] = "error-drop",
+ },
+};
+
+clib_error_t *cnat_ipv4_icmp_q_inside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_icmp_q_inside_input_main_t * mp = &cnat_ipv4_icmp_q_inside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_icmp_q_inside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input_exception.c b/vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input_exception.c
new file mode 100644
index 00000000000..9b5e280e571
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_icmp_query_inside_input_exception.c
@@ -0,0 +1,235 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_icmp_query_inside_input_exception.c - cnat_ipv4_icmp_query_inside_input_exception node pipeline stage functions
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_ipv4_icmp.h"
+
+#define foreach_cnat_ipv4_icmp_q_inside_input_exc_error \
+_(CNAT_V4_ICMP_Q_I2O_E_T_PKT, "v4 icmp query i2o-e transmit") \
+_(CNAT_V4_ICMP_Q_I2O_E_G_PKT, "v4 icmp query i2o-e gen icmp msg") \
+_(CNAT_V4_ICMP_Q_I2O_E_D_PKT, "v4 icmp query i2o-e pkt drop") \
+_(CNAT_V4_ICMP_Q_I2O_E_DC_PKT, "v4 icmp query i2o-e drop (no config)") \
+_(CNAT_V4_ICMP_Q_I2O_E_DR_PKT, "v4 icmp query i2o-e drop (not in run state)") \
+_(CNAT_V4_ICMP_Q_I2O_E_DD_PKT, "v4 icmp query i2o-e drop (no direct port)") \
+_(CNAT_V4_ICMP_Q_I2O_E_DA_PKT, "v4 icmp query i2o-e drop (no any port)") \
+_(CNAT_V4_ICMP_Q_I2O_E_DO_PKT, "v4 icmp query i2o-e drop (out of port limit)") \
+_(CNAT_V4_ICMP_Q_I2O_E_DS_PKT, "v4 icmp query i2o_e drop (out of session db)")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_icmp_q_inside_input_exc_error
+#undef _
+ CNAT_IPV4_ICMP_Q_INSIDE_INPUT_EXCEPTIONS_N_ERROR,
+} cnat_ipv4_icmp_q_inside_input_exc_error_t;
+
+
+static char * cnat_ipv4_icmp_q_inside_input_exc_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_icmp_q_inside_input_exc_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_icmp_q_inside_input_exc_main_t;
+
+typedef enum {
+ CNAT_V4_ICMP_Q_E_I2O_T,
+ //CNAT_V4_ICMP_Q_E_I2O_GEN,
+ CNAT_V4_ICMP_Q_E_I2O_D,
+ CNAT_V4_ICMP_Q_E_I2O_NEXT,
+} cnat_ipv4_icmp_q_inside_input_exc_next_t;
+
+#define CNAT_V4_ICMP_Q_E_I2O_GEN CNAT_V4_ICMP_Q_E_I2O_T
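+
+/* The generate-ICMP disposition is aliased to transmit for now: the ICMP
+ * message generator node has not been ported yet (see the commented-out
+ * "icmp_msg_gen" next-node entry in the registration below). */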
+
+cnat_ipv4_icmp_q_inside_input_exc_main_t cnat_ipv4_icmp_q_inside_input_exc_main;
+vlib_node_registration_t cnat_ipv4_icmp_q_inside_input_exc_node;
+
+#define NSTAGES 2
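+
+/*
+ * Exception (slow-path) nodes use a minimal two-stage pipeline: the
+ * generic prefetch in stage0, then all of the work in last_stage.
+ */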
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ int disposition = CNAT_V4_ICMP_Q_E_I2O_T;
+ int counter = CNAT_V4_ICMP_Q_I2O_E_T_PKT;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_icmp_q_inside_input_exc_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ cnat_key_t dest_info;
+ cnat_gen_icmp_info info;
+ cnat_db_key_bucket_t ki;
+ cnat_main_db_entry_t *db = NULL;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ ki.k.k.vrf, CNAT_ICMP)
+
+ ki.k.k.ipv4 =
+ clib_net_to_host_u32(ip->src_addr);
+ ki.k.k.port =
+ clib_net_to_host_u16(icmp->identifier);
+
+ dest_info.k.port = 0;
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->dest_addr);
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ dest_info.k.vrf, CNAT_ICMP)
+
+ db = cnat_get_main_db_entry_v2(&ki, PORT_SINGLE, PORT_TYPE_DYNAMIC,
+ &info, &dest_info);
+ if (PREDICT_TRUE(db != 0)) {
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT BEFORE\n");
+ print_icmp_pkt(ip);
+ }
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ /*
+ * do the NAT rewrite before forwarding the packet
+ */
+ swap_ip_src_icmp_id(ip, icmp, db, db->in2out_key.k.vrf);
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT AFTER\n");
+ print_icmp_pkt(ip);
+ }
+
+ /*
+ * update db for this pkt
+ */
+ CNAT_DB_UPDATE_IN2OUT_TIMER
+ in2out_forwarding_count++;
+
+ } else {
+ switch (info.error) {
+ case (CNAT_NO_VRF_RUN):
+ counter = CNAT_V4_ICMP_Q_I2O_E_DR_PKT;
+ break;
+ case (CNAT_OUT_LIMIT):
+ counter = CNAT_V4_ICMP_Q_I2O_E_DO_PKT;
+ break;
+ case (CNAT_NO_PORT_ANY):
+ case (CNAT_NO_POOL_ANY):
+ case (CNAT_BAD_INUSE_ANY):
+ case (CNAT_NOT_FOUND_ANY):
+ counter = CNAT_V4_ICMP_Q_I2O_E_DA_PKT;
+ break;
+ case (CNAT_INV_PORT_DIRECT):
+ case (CNAT_DEL_PORT_DIRECT):
+ case (CNAT_BAD_INUSE_DIRECT):
+ case (CNAT_NOT_FOUND_DIRECT):
+ counter = CNAT_V4_ICMP_Q_I2O_E_DD_PKT;
+ break;
+ case (CNAT_ERR_NO_SESSION_DB):
+ counter = CNAT_V4_ICMP_Q_I2O_E_DS_PKT;
+ break;
+ default:
+ counter = CNAT_V4_ICMP_Q_I2O_E_DC_PKT;
+ break;
+ }
+ /*
+ * send to the ICMP message generator node
+ */
+ if (info.gen_icmp_msg == CNAT_ICMP_MSG) {
+ #if 0
+ u32 *fd = (u32*)ctx->feature_data;
+ fd[0] = info.svi_addr;
+ fd[1] = CNAT_ICMP_DEST_UNREACHABLE;
+ #endif
+ disposition = CNAT_V4_ICMP_Q_E_I2O_GEN;
+ counter = CNAT_V4_ICMP_Q_I2O_E_G_PKT;
+ } else {
+ disposition = CNAT_V4_ICMP_Q_E_I2O_D;
+ counter = CNAT_V4_ICMP_Q_I2O_E_D_PKT;
+ }
+ DEBUG_I2O_DROP(CNAT_DEBUG_DROP_ICMP)
+ }
+
+ em->counters[node_counter_base_index + counter] += 1;
+
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_icmp_q_inside_input_exc_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+VLIB_REGISTER_NODE (cnat_ipv4_icmp_q_inside_input_exc_node) = {
+ .function = cnat_ipv4_icmp_q_inside_input_exc_node_fn,
+ .name = "vcgn-v4-icmp-q-i2o-e",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_icmp_q_inside_input_exc_error_strings),
+ .error_strings = cnat_ipv4_icmp_q_inside_input_exc_error_strings,
+
+ .n_next_nodes = CNAT_V4_ICMP_Q_E_I2O_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ //[CNAT_V4_ICMP_Q_E_I2O_GEN] = "icmp_msg_gen", /* Currently it will go
+ //to ip4-input node. We have to port icmp msg generator node */
+ [CNAT_V4_ICMP_Q_E_I2O_T] = "ip4-input",
+ [CNAT_V4_ICMP_Q_E_I2O_D] = "error-drop",
+ },
+};
+
+
+clib_error_t *cnat_ipv4_icmp_q_inside_input_exc_init (vlib_main_t *vm)
+{
+ cnat_ipv4_icmp_q_inside_input_exc_main_t * mp = &cnat_ipv4_icmp_q_inside_input_exc_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_icmp_q_inside_input_exc_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_icmp_query_outside_input.c b/vnet/vnet/vcgn/cnat_ipv4_icmp_query_outside_input.c
new file mode 100644
index 00000000000..2c05e0b400e
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_icmp_query_outside_input.c
@@ -0,0 +1,381 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_icmp_query_outside_input.c - cnat_ipv4_icmp_query_outside_input node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_ipv4_icmp.h"
+
+#define foreach_cnat_ipv4_icmp_q_outside_input_error \
+_(CNAT_V4_ICMP_Q_O2I_T_PKT, "cnat v4 icmp_q o2i packet transmit") \
+_(CNAT_V4_ICMP_Q_O2I_MISS_PKT, "cnat v4 icmp_q o2i drop") \
+_(CNAT_V4_ICMP_Q_O2I_TTL_GEN, "cnat v4 icmp_q o2i ttl generate") \
+_(CNAT_V4_ICMP_Q_O2I_TTL_DROP, "cnat v4 icmp_q o2i ttl drop") \
+_(CNAT_V4_ICMP_Q_O2I_NO_SESSION_DROP, "cnat v4 icmp_q o2i no session drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_icmp_q_outside_input_error
+#undef _
+ CNAT_IPV4_ICMP_Q_OUTSIDE_INPUT_N_ERROR,
+} cnat_ipv4_icmp_q_outside_input_t;
+
+static char * cnat_ipv4_icmp_q_outside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_icmp_q_outside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_icmp_q_outside_input_main_t;
+
+typedef enum {
+ CNAT_V4_ICMP_Q_O2I_T,
+ CNAT_V4_ICMP_Q_O2I_D,
+ CNAT_V4_ICMP_Q_O2I_NEXT,
+} cnat_ipv4_icmp_q_outside_input_next_t;
+
+cnat_ipv4_icmp_q_outside_input_main_t cnat_ipv4_icmp_q_outside_input_main;
+vlib_node_registration_t cnat_ipv4_icmp_q_outside_input_node;
+
+#define NSTAGES 5
+
+inline void swap_ip_dst_icmp_id(ipv4_header *ip,
+ icmp_v4_t *icmp,
+ cnat_main_db_entry_t *db, u16 vrf)
+{
+#if 0
+ u32 postmap_ip;
+ u8 direction;
+ u32 old_ip;
+ u32 old_postmap_ip;
+
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ direction = 1;
+ if(cnat_static_dest_db_get_translation(ip->src_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+ CNAT_UPDATE_L3_CHECKSUM_DECLARE
+
+ old_ip = spp_net_to_host_byte_order_32(&(ip->src_addr));
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(ip->checksum))),
+ ((u16)(old_postmap_ip & 0xFFFF)),
+ ((u16)(old_postmap_ip >> 16)))
+ ip->checksum =
+ spp_host_to_net_byte_order_16(new_l3_c);
+ ip->src_addr = postmap_ip;
+ }
+ }
+#endif /* if 0 */
+ /*
+ * declare variable
+ */
+ CNAT_UPDATE_L3_L4_CHECKSUM_DECLARE
+ /*
+ * calculate checksum
+ */
+ CNAT_UPDATE_L3_ICMP_CHECKSUM(((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ (db->out2in_key.k.port),
+ (clib_net_to_host_u16(ip->checksum)),
+ (clib_net_to_host_u16(icmp->checksum)),
+ ((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)),
+ (db->in2out_key.k.port))
+ //set ip header
+ ip->dest_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+ //set icmp header
+ icmp->identifier =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+ icmp->checksum =
+ clib_host_to_net_u16(new_l4_c);
+}
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+
+ u64 tmp = 0;
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(ip->dest_addr);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16 (icmp->identifier);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_ICMP)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_out2in_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_out2in_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, LOAD);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+ }
+}
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->out2in_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->out2in_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ int disposition = CNAT_V4_ICMP_Q_O2I_T;
+ int counter = CNAT_V4_ICMP_Q_O2I_T_PKT;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ icmp_v4_t *icmp = (icmp_v4_t *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_icmp_q_outside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_session_entry_t *session_db = NULL;
+ cnat_main_db_entry_t *db = NULL;
+ cnat_key_t dest_info;
+ cnat_vrfmap_t * vrf_map_p __attribute__((unused)) = NULL;
+ u32 vrf_index __attribute__((unused)) = 0;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ db = cnat_main_db + db_index;
+ dest_info.k.port = 0;
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->src_addr);
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT BEFORE\n");
+ print_icmp_pkt(ip);
+ }
+
+ vrf_map_p = cnat_map_by_vrf + db->vrfmap_index;
+ vrf_index = (db->in2out_key.k.vrf & CNAT_VRF_MASK);
+
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ goto update_pkt;
+ }
+
+ if(PREDICT_FALSE(db->dst_ipv4 != dest_info.k.ipv4)) {
+
+ if(PREDICT_TRUE(db->nsessions == 1)) {
+ /* Handle the one-to-two destination scenario */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_ICMP_Q_O2I_D;
+ counter = CNAT_V4_ICMP_Q_O2I_NO_SESSION_DROP;
+ goto drop_pkt;
+ }
+ } else if (PREDICT_FALSE(db->nsessions == 0)) {
+ /* This should be a static entry.
+ * Record this session as the first session and log it.
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ } else { /* Many translations exist already */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ /* If the session already exists,
+ * cnat_create_session_db_entry returns the existing db entry;
+ * otherwise it creates a new one.
+ * If it cannot create one, it returns NULL.
+ */
+ session_db = cnat_create_session_db_entry(&dest_info,
+ db, TRUE);
+
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_ICMP_Q_O2I_D;
+ counter = CNAT_V4_ICMP_Q_O2I_NO_SESSION_DROP;
+ goto drop_pkt;
+ }
+ }
+ }
+
+update_pkt:
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ /*
+ * 1. update dest ipv4 addr and icmp id
+ * 2. update ipv4 checksum and icmp checksum
+ */
+ swap_ip_dst_icmp_id(ip, icmp, db, db->in2out_key.k.vrf);
+
+ if (PREDICT_FALSE(icmp_debug_flag)) {
+ printf("\nDUMPING ICMP PKT AFTER\n");
+ print_icmp_pkt(ip);
+ }
+
+ db->out2in_pkts++;
+
+ //nat44_dslite_global_stats[dslite_flag].out2in_forwarding_count++;
+
+ } else {
+ disposition = CNAT_V4_ICMP_Q_O2I_D;
+ counter = CNAT_V4_ICMP_Q_O2I_MISS_PKT;
+ }
+
+drop_pkt:
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_icmp_q_outside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_icmp_q_outside_input_node) = {
+ .function = cnat_ipv4_icmp_q_outside_input_node_fn,
+ .name = "vcgn-v4-icmp-q-o2i",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_icmp_q_outside_input_error_strings),
+ .error_strings = cnat_ipv4_icmp_q_outside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_ICMP_Q_O2I_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_ICMP_Q_O2I_T] = "ip4-input",
+ [CNAT_V4_ICMP_Q_O2I_D] = "error-drop",
+ },
+};
+
+clib_error_t *cnat_ipv4_icmp_q_outside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_icmp_q_outside_input_main_t * mp = &cnat_ipv4_icmp_q_outside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_icmp_q_outside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input.c b/vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input.c
new file mode 100644
index 00000000000..5bea707385c
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input.c
@@ -0,0 +1,424 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_tcp_inside_input.c - cnat_ipv4_tcp_inside_input node pipeline
+ * stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_db.h"
+#include "tcp_header_definitions.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_v4_functions.h"
+
+#define foreach_cnat_ipv4_tcp_inside_input_error \
+_(CNAT_V4_TCP_I2O_PKT_IN, "tcp i2o packets received") \
+_(CNAT_V4_TCP_I2O_PKT_T, "tcp i2o packets natted") \
+_(CNAT_V4_TCP_I2O_EXCEPTION, "packets to tcp i2o exception") \
+_(CNAT_V4_TCP_I2O_TTL_GEN, "generated TTL expiry ICMP packets") \
+_(CNAT_V4_TCP_I2O_TTL_GEN_DROP, "could not generate TTL expiry ICMP packets") \
+_(CNAT_V4_TCP_I2O_SESSION_DROP, "could not generate session") \
+_(CNAT_V4_UDP_I2O_FRAG_DROP, "non-first fragment drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_tcp_inside_input_error
+#undef _
+ CNAT_IPV4_TCP_INSIDE_INPUT_N_ERROR,
+} cnat_ipv4_tcp_inside_input_t;
+
+static char * cnat_ipv4_tcp_inside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_tcp_inside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_tcp_inside_input_main_t;
+
+typedef enum {
+ CNAT_V4_TCP_I2O_E,
+ CNAT_V4_TCP_I2O_T,
+ CNAT_V4_TCP_I2O_D,
+ CNAT_V4_TCP_I2O_NEXT,
+} cnat_ipv4_tcp_inside_input_next_t;
+
+#define CNAT_REWRITE_OUTPUT CNAT_V4_TCP_I2O_T
+#define CNAT_V4_ICMP_GEN CNAT_V4_TCP_I2O_D
+
+//#define CNAT_V4_TCP_I2O_E CNAT_V4_TCP_I2O_D //remove it once exception node is created
+cnat_ipv4_tcp_inside_input_main_t cnat_ipv4_tcp_inside_input_main;
+vlib_node_registration_t cnat_ipv4_tcp_inside_input_node;
+
+#define NSTAGES 6
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+ //cnat_feature_data_t *fd = (cnat_feature_data_t *)ctx->feature_data;
+
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ tcp_hdr_type *tcp = (tcp_hdr_type *)((u8*)ip + ipv4_hdr_len);
+
+ u64 tmp = 0;
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(ip->src_addr);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16 (tcp->src_port);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_TCP)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_in2out_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{ /* nothing */ }
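+
+/*
+ * The empty stage2 above appears deliberate: with NSTAGES == 6 it adds a
+ * pipeline slot of latency between the stage1 hash-bucket prefetch and
+ * the stage3 read of that bucket, giving the prefetch more time to land.
+ */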
+
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_in2out_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, LOAD);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+ }
+}
+
+static inline void
+stage4(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->in2out_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->in2out_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ spp_ctx_t *ctx = (spp_ctx_t *) &vnet_buffer(b0)->vcgn_uii;
+ int disposition = CNAT_V4_TCP_I2O_T;
+ int counter = CNAT_V4_TCP_I2O_PKT_T;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ tcp_hdr_type *tcp = (tcp_hdr_type *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_tcp_inside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_session_entry_t *session_db = NULL;
+ cnat_main_db_entry_t *db = NULL;
+ cnat_key_t dest_info;
+ u32 window;
+ u8 scale;
+
+
+ INCREMENT_NODE_COUNTER(CNAT_V4_TCP_I2O_PKT_IN);
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ if (PREDICT_FALSE(ip->ttl <= 1)) {
+ /* Try to generate ICMP error msg, as TTL is <= 1 */
+
+ if (icmpv4_generate_with_throttling
+ (ctx, ip, ctx->ru.rx.uidb_index)) {
+
+ /* Generated ICMP */
+ disposition = CNAT_REWRITE_OUTPUT;
+ counter = CNAT_V4_TCP_I2O_TTL_GEN;
+ } else {
+ /* Could not generate ICMP - drop the packet */
+ disposition = CNAT_V4_TCP_I2O_D;
+ counter = CNAT_V4_TCP_I2O_TTL_GEN_DROP;
+ }
+ goto drop_pkt;
+ }
+ }
+
+ if (PREDICT_FALSE(db_index == EMPTY)) {
+ /* Deleted fragment code from here */
+ disposition = CNAT_V4_TCP_I2O_E;
+ counter = CNAT_V4_TCP_I2O_EXCEPTION;
+ } else {
+ db = cnat_main_db + db_index;
+
+ /* Handle destination sessions */
+ dest_info.k.port = clib_net_to_host_u16(tcp->dest_port);
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->dest_addr);
+
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ goto update_pkt;
+ }
+
+ if(PREDICT_FALSE(db->dst_ipv4 != dest_info.k.ipv4 ||
+ db->dst_port != dest_info.k.port)) {
+ if(PREDICT_TRUE(db->nsessions == 0)) {
+ /* This should be a static entry.
+ * Record this session as the first session and log it.
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ } else if(PREDICT_FALSE(db->nsessions == 1)) {
+ /* Destination is not the same as in the main db: multiple-session
+ * scenario
+ */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_TCP_I2O_D;
+ counter = CNAT_V4_TCP_I2O_SESSION_DROP;
+ goto drop_pkt;
+ }
+ } else { /* There are already multiple destinations */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ /* If the session already exists,
+ * cnat_create_session_db_entry returns the existing db entry;
+ * otherwise it creates a new one.
+ * If it cannot create one, it returns NULL.
+ */
+ session_db = cnat_create_session_db_entry(&dest_info,
+ db, TRUE);
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_TCP_I2O_D;
+ counter = CNAT_V4_TCP_I2O_SESSION_DROP;
+ goto drop_pkt;
+ }
+ }
+ if(PREDICT_TRUE(session_db != 0)) {
+ /* Have to repeat the window size check for new destinations */
+ window = (u32)clib_net_to_host_u16(tcp->window_size);
+ window = window << session_db->scale;
+ if(PREDICT_TRUE(!session_db->window)) {
+ calculate_window_scale(tcp, &scale);
+ session_db->scale = scale;
+ session_db->window = window;
+ } else if (PREDICT_FALSE(session_db->window <
+ window)) {
+ /* Update the db entry with window option from packet */
+ session_db->window = window;
+ } else {
+ /* Do nothing */
+ }
+ session_db->tcp_seq_num = clib_net_to_host_u32(tcp->seq_num);
+ session_db->ack_no = clib_net_to_host_u32(tcp->ack_num);
+#if DEBUG > 1
+ printf("\n In2out SDB stages seq no = %u,"
+ " ack no = %u, window = %u\n",
+ session_db->tcp_seq_num,
+ session_db->ack_no,
+ session_db->window);
+#endif
+
+ }
+ } else {
+ // Update the seq no and ack no for subsequent communication
+ // after connection establishment.
+ // No need to update the window here: it was already updated
+ // during connection establishment.
+ window = (u32)clib_net_to_host_u16(tcp->window_size);
+ window = window << db->scale;
+ if(PREDICT_FALSE(!ALG_ENABLED_DB(db))) {
+ // This check is needed because proto_data is part of a union
+ // in the main db entry
+ db->proto_data.tcp_seq_chk.seq_no =
+ clib_net_to_host_u32(tcp->seq_num);
+ db->proto_data.tcp_seq_chk.ack_no =
+ clib_net_to_host_u32(tcp->ack_num);
+ }
+ if (PREDICT_FALSE(db->diff_window < window)) {
+ /* Update the db entry with window option from packet */
+ db->diff_window = window;
+ }
+#if DEBUG > 1
+ printf("\n In2out MainDB seq no = %u,"
+ "\n ack no = %u\n",
+ db->proto_data.tcp_seq_chk.seq_no,
+ db->proto_data.tcp_seq_chk.ack_no);
+ printf("\n In2out MAINDB window = %u\n",
+ db->diff_window);
+#endif
+ }
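+
+    /*
+     * Worked example of the window bookkeeping above: a segment carrying
+     * window_size 0x2000 with a learned scale of 3 gives an effective
+     * window of 0x2000 << 3 = 0x10000; the db keeps the largest effective
+     * window seen, presumably for later sequence-range checks.
+     */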
+update_pkt:
+
+ counter = CNAT_V4_TCP_I2O_PKT_T;
+ disposition = CNAT_V4_TCP_I2O_T;
+
+ /* NO FRAGMENT & ALG HANDLING. DELETING THE CODE */
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ tcp_in2out_nat_mss_n_checksum(ip,
+ tcp,
+ db->out2in_key.k.ipv4,
+ db->out2in_key.k.port,
+ db
+ /*, db->in2out_key.k.vrf */);
+
+ /* update translation counters */
+ db->in2out_pkts++;
+ in2out_forwarding_count++;
+
+ /* update the timer for good mode, or evil mode dst_ip match */
+
+ if(PREDICT_FALSE(session_db != NULL)) {
+ V4_TCP_UPDATE_SESSION_DB_FLAG(session_db, tcp);
+ CNAT_DB_TIMEOUT_RST(session_db);
+ } else {
+ V4_TCP_UPDATE_SESSION_FLAG(db, tcp);
+ CNAT_DB_TIMEOUT_RST(db);
+ }
+ }
+
+drop_pkt:
+
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_tcp_inside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_tcp_inside_input_node) = {
+ .function = cnat_ipv4_tcp_inside_input_node_fn,
+ .name = "vcgn-v4-tcp-i2o",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_tcp_inside_input_error_strings),
+ .error_strings = cnat_ipv4_tcp_inside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_TCP_I2O_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_TCP_I2O_E] = "vcgn-v4-tcp-i2o-e",
+ [CNAT_V4_TCP_I2O_T] = "ip4-input",
+ [CNAT_V4_TCP_I2O_D] = "error-drop",
+ },
+};
+
+clib_error_t *cnat_ipv4_tcp_inside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_tcp_inside_input_main_t * mp = &cnat_ipv4_tcp_inside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_tcp_inside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input_exceptions.c b/vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input_exceptions.c
new file mode 100644
index 00000000000..bc1bebb04ba
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_tcp_inside_input_exceptions.c
@@ -0,0 +1,314 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_tcp_inside_input_exceptions.c -
+ * cnat_ipv4_tcp_inside_input_exceptions node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_db.h"
+#include "tcp_header_definitions.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_v4_functions.h"
+
+
+#define foreach_cnat_ipv4_tcp_inside_input_exc_error \
+_(CNAT_V4_TCP_I2O_E_T_PKT, "v4 tcp i2o-e transmit natted pkt") \
+_(CNAT_V4_TCP_I2O_E_D_NON_SYN_PKT, "v4 tcp i2o-e non syn drop") \
+_(CNAT_V4_TCP_I2O_E_D_INVALID_PKT, "v4 tcp i2o-e invalid pkt drop") \
+_(CNAT_V4_TCP_I2O_E_DROP, "v4 tcp i2o-e drop") \
+_(CNAT_V4_TCP_I2O_E_GEN_ICMP, "v4 tcp i2o-e gen icmp msg") \
+_(CNAT_V4_TCP_I2O_E_D_NO_SESSION, "v4 tcp i2o-e no session db entry drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_tcp_inside_input_exc_error
+#undef _
+ CNAT_IPV4_TCP_INSIDE_INPUT_EXCEPTIONS_N_ERROR,
+} cnat_ipv4_tcp_inside_input_exc_error_t;
+
+
+static char * cnat_ipv4_tcp_inside_input_exc_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_tcp_inside_input_exc_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_tcp_inside_input_exc_main_t;
+
+typedef enum {
+ CNAT_V4_TCP_I2O_E_T,
+ //CNAT_V4_TCP_I2O_E_ICMP,
+ CNAT_V4_TCP_I2O_E_D,
+ CNAT_V4_TCP_I2O_E_NEXT,
+} cnat_ipv4_tcp_inside_input_exc_next_t;
+
+#define CNAT_V4_TCP_I2O_E_ICMP CNAT_V4_TCP_I2O_E_D
+
+cnat_ipv4_tcp_inside_input_exc_main_t cnat_ipv4_tcp_inside_input_exc_main;
+vlib_node_registration_t cnat_ipv4_tcp_inside_input_exc_node;
+
+#define NSTAGES 2
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ vlib_node_t *n =
+ vlib_get_node (vm, cnat_ipv4_tcp_inside_input_exc_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ cnat_gen_icmp_info info;
+ cnat_db_key_bucket_t ki;
+ cnat_main_db_entry_t *db = NULL;
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ tcp_hdr_type *tcp = (tcp_hdr_type *)((u8*)ip + ipv4_hdr_len);
+ int disposition = CNAT_V4_TCP_I2O_E_T;
+ int counter = CNAT_V4_TCP_I2O_E_T_PKT;
+ cnat_key_t dest_info;
+ u32 window;
+ u8 scale;
+
+ window = (u32)clib_net_to_host_u16(tcp->window_size);
+ calculate_window_scale(tcp, &scale);
+
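+ /*
+ * calculate_window_scale (defined elsewhere) derives the shift from
+ * the TCP window-scale option (kind 3, length 3, shift in the third
+ * byte); NOP (kind 1) has no length byte and end-of-options (kind 0)
+ * terminates the scan. E.g. an option list 01 01 03 03 07 yields a
+ * shift of 7, so an advertised window of 0x1000 scales to 0x80000.
+ */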
+ dest_info.k.port = clib_net_to_host_u16(tcp->dest_port);
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->dest_addr);
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ dest_info.k.vrf, CNAT_TCP)
+
+ /* for TCP if not SYN or if src_port is 0, silently drop the packet */
+ if (PREDICT_FALSE(!((tcp->flags & TCP_FLAG_SYN) && (tcp->src_port)))) {
+
+ /*
+ * If the packet is dropped due to both reasons,
+ * count it as invalid packet drop
+ */
+ if (!tcp->src_port) {
+ counter = CNAT_V4_TCP_I2O_E_D_INVALID_PKT;
+ } else {
+ counter = CNAT_V4_TCP_I2O_E_D_NON_SYN_PKT;
+ }
+ disposition = CNAT_V4_TCP_I2O_E_D;
+ goto in2out_e;
+ }
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ ki.k.k.vrf, CNAT_TCP)
+
+ ki.k.k.ipv4 = clib_net_to_host_u32(ip->src_addr);
+ ki.k.k.port = clib_net_to_host_u16(tcp->src_port);
+
+ db = cnat_get_main_db_entry_v2(&ki, PORT_SINGLE, PORT_TYPE_DYNAMIC, &info,
+ &dest_info);
+
+
+#if DEBUG > 1
+ if(PREDICT_TRUE(db)) {
+ printf("create db %x ip %x->%x port %x->%x dst_ip %x\n", db,
+ db->in2out_key.k.ipv4, db->out2in_key.k.ipv4,
+ db->in2out_key.k.port, db->out2in_key.k.port, db->dst_ipv4);
+ }
+#endif
+
+
+    if (PREDICT_FALSE(db == 0)) {
+        /*
+         * Failed to create a new DB entry, either because no more ports
+         * are available or because the per-user limit was reached; an
+         * ICMP type=3, code=13 message should be generated here.
+         */
+
+        /*
+         * Rate-limit the ICMP messages per private user, so a user is
+         * not flooded with ICMP when its per-user port limit is reached.
+         */
+ if (PREDICT_TRUE(info.gen_icmp_msg == CNAT_ICMP_MSG)) {
+ /* KEEPING THINGS COMMENTED HERE..MAY NEED TO REVISIT AGAIN */
+ #if 0
+ u32 *fd = (u32*)ctx->feature_data;
+ fd[0] = info.svi_addr;
+ fd[1] = CNAT_ICMP_DEST_UNREACHABLE;
+
+ /*
+ * Let's reverse the direction from i2o to o2i.
+ * This will help using the correct VRF in the fib lookup (AVSM)
+ * especially for the o2i_vrf_override case
+ */
+ ctx->ru.rx.direction = 0; // 0 - o2i, 1 - i2o
+ #endif
+ disposition = CNAT_V4_TCP_I2O_E_ICMP;
+ counter = CNAT_V4_TCP_I2O_E_GEN_ICMP;
+
+ } else {
+ disposition = CNAT_V4_TCP_I2O_E_D;
+ counter = CNAT_V4_TCP_I2O_E_DROP;
+ }
+ //DEBUG_I2O_DROP(CNAT_DEBUG_DROP_TCP)
+ } else {
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ /* NAT the packet and fix checksum */
+
+ tcp_in2out_nat_mss_n_checksum(ip,
+ tcp,
+ db->out2in_key.k.ipv4,
+ db->out2in_key.k.port,
+ db
+ /*, db->in2out_key.k.vrf */);
+
+        /* this must be an inside-to-outside SYN; MSS is adjusted in the call above */
+
+ /* update translation counters */
+ db->in2out_pkts++;
+
+ /* set keepalive timer */
+
+ if(PREDICT_TRUE((dest_info.k.ipv4 == db->dst_ipv4) &&
+ (dest_info.k.port == db->dst_port))) {
+ if(PREDICT_FALSE(!ALG_ENABLED_DB(db))) {
+            /* This check is needed because proto_data is part of a
+             * union in the main DB entry */
+
+ db->proto_data.tcp_seq_chk.seq_no =
+ clib_net_to_host_u32(tcp->seq_num);
+ db->proto_data.tcp_seq_chk.ack_no =
+ clib_net_to_host_u32(tcp->ack_num);
+ db->scale = scale;
+ db->diff_window = window;
+ }
+#if DEBUG > 1
+            PLATFORM_DEBUG_PRINT("\nMain DB seq no = %u,"
+                                 "ack no = %u, window = %u,"
+                                 "scale = %u",
+                                 db->proto_data.tcp_seq_chk.seq_no,
+                                 db->proto_data.tcp_seq_chk.ack_no,
+                                 db->diff_window,
+                                 db->scale);
+#endif
+ V4_TCP_UPDATE_SESSION_FLAG(db, tcp);
+ /* Check timeout db if there is config for this */
+ (void) query_and_update_db_timeout((void *)db, MAIN_DB_TYPE);
+ db->entry_expires = cnat_current_time;
+ } else {
+        /* Look up the session entry corresponding to this destination */
+ cnat_session_entry_t *sdb;
+ sdb = cnat_session_db_lookup_entry(
+ &dest_info, db - cnat_main_db);
+ if(PREDICT_FALSE(sdb == NULL)) {
+ disposition = CNAT_V4_TCP_I2O_E_D;
+ counter = CNAT_V4_TCP_I2O_E_D_NO_SESSION;
+ goto in2out_e;
+ }
+ sdb->tcp_seq_num = clib_net_to_host_u32(tcp->seq_num);
+ sdb->ack_no = clib_net_to_host_u32(tcp->ack_num);
+ sdb->scale = scale;
+ sdb->window = window;
+
+#if DEBUG > 1
+ PLATFORM_DEBUG_PRINT("\nSDB seq no = %u, ack no = %u, window = %u"
+ "\nSDB scale = %u" ,
+ sdb->tcp_seq_num,
+ sdb->ack_no,
+ sdb->window,
+ sdb->scale);
+#endif
+ V4_TCP_UPDATE_SESSION_DB_FLAG(sdb, tcp);
+ /* Check timeout db if there is config for this */
+ (void) query_and_update_db_timeout((void *)sdb, SESSION_DB_TYPE);
+ sdb->entry_expires = cnat_current_time;
+ }
+
+ //PLATFORM_CNAT_SET_TX_VRF(ctx,db->out2in_key.k.vrf)
+
+ counter = CNAT_V4_TCP_I2O_E_T_PKT;
+ in2out_forwarding_count++;
+ }
+
+in2out_e:
+
+ em->counters[node_counter_base_index + counter] += 1;
+
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_tcp_inside_input_exc_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+VLIB_REGISTER_NODE (cnat_ipv4_tcp_inside_input_exc_node) = {
+ .function = cnat_ipv4_tcp_inside_input_exc_node_fn,
+ .name = "vcgn-v4-tcp-i2o-e",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_tcp_inside_input_exc_error_strings),
+ .error_strings = cnat_ipv4_tcp_inside_input_exc_error_strings,
+
+ .n_next_nodes = CNAT_V4_TCP_I2O_E_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_TCP_I2O_E_T] = "ip4-input",
+ [CNAT_V4_TCP_I2O_E_D] = "error-drop",
+ },
+};
+
+
+clib_error_t *cnat_ipv4_tcp_inside_input_exc_init (vlib_main_t *vm)
+{
+ cnat_ipv4_tcp_inside_input_exc_main_t * mp = &cnat_ipv4_tcp_inside_input_exc_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_tcp_inside_input_exc_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_tcp_outside_input.c b/vnet/vnet/vcgn/cnat_ipv4_tcp_outside_input.c
new file mode 100644
index 00000000000..bcf132b1dd7
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_tcp_outside_input.c
@@ -0,0 +1,382 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_tcp_outside_input.c - cnat_v4_tcp_out2in node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_db.h"
+#include "tcp_header_definitions.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_ipv4_udp.h"
+#include "cnat_v4_functions.h"
+
+
+#define foreach_cnat_ipv4_tcp_outside_input_error \
+_(CNAT_V4_TCP_O2I_R_PKT, "v4 tcp o2i pkt received") \
+_(CNAT_V4_TCP_O2I_T_PKT, "v4 tcp o2i pkt natted & transmitted") \
+_(CNAT_V4_TCP_O2I_LOOKUP_FAILED, "v4 tcp o2i lookup failed") \
+_(CNAT_V4_TCP_O2I_TTL_GEN, "v4 tcp o2i generated TTL Expiry ICMP packet") \
+_(CNAT_V4_TCP_O2I_TTL_DROP, "v4 tcp o2i drop due to failure in creating TTL expiry ICMP msg") \
+_(CNAT_V4_TCP_O2I_PTB_GEN, "v4 tcp o2i PTB ICMP pkt generation") \
+_(CNAT_V4_TCP_O2I_PTB_DROP, "v4 tcp o2i drop due to failure in creating PTB ICMP pkt") \
+_(CNAT_V4_TCP_O2I_SESSION_DROP, "v4 tcp o2i drop due to failure in creating session db") \
+_(CNAT_V4_TCP_O2I_SEQ_MISMATCH_DROP, "v4 tcp o2i drop due to TCP sequence mismatch") \
+_(CNAT_V4_TCP_O2I_FILTER_DROP, "v4 tcp o2i drop due to endpoint filtering") \
+_(CNAT_V4_TCP_O2I_NON_SYN_RST_DROP, "v4 tcp o2i drop due to no SYN/RST flag") \
+_(CNAT_V4_TCP_O2I_FIRST_FRAG_DROP, "v4 tcp o2i first fragment drop") \
+_(CNAT_V4_TCP_O2I_SUB_FRAG_NO_DB_DROP, "v4 tcp o2i subsequent frag no DB drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_tcp_outside_input_error
+#undef _
+ CNAT_IPV4_TCP_OUTSIDE_INPUT_N_ERROR,
+} cnat_ipv4_tcp_outside_input_t;
+
+static char * cnat_ipv4_tcp_outside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_tcp_outside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_tcp_outside_input_main_t;
+
+typedef enum {
+ //CNAT_V4_TCP_O2I_E,
+ CNAT_V4_TCP_O2I_T,
+ CNAT_V4_TCP_O2I_D,
+ CNAT_V4_TCP_O2I_NEXT,
+} cnat_ipv4_tcp_outside_input_next_t;
+
+cnat_ipv4_tcp_outside_input_main_t cnat_ipv4_tcp_outside_input_main;
+vlib_node_registration_t cnat_ipv4_tcp_outside_input_node;
+
+#define NSTAGES 6
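+
+/*
+ * Six-stage pipeline: stage0 prefetches buffer metadata and the first
+ * line of packet data, stage1 builds the hash key and prefetches the
+ * hash bucket, stage2 is a spacer that lets the prefetch complete,
+ * stage3 prefetches the matching DB entry, stage4 walks the collision
+ * chain, and last_stage rewrites and dispatches the packet.
+ */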
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ tcp_hdr_type *tcp = (tcp_hdr_type *)((u8*)ip + ipv4_hdr_len);
+
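+    /*
+     * Build the 64-bit out2in lookup key: destination IPv4 address in
+     * bits 0-31, destination port in bits 32-47, VRF in bits 48-63;
+     * hash it to select the bucket to prefetch.
+     */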
+ u64 tmp = 0;
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(ip->dest_addr);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16 (tcp->dest_port);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_TCP)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_out2in_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{ /* nothing */ }
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_out2in_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, STORE);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+ }
+}
+
+static inline void
+stage4(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->out2in_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->out2in_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ spp_ctx_t *ctx = (spp_ctx_t *) &vnet_buffer(b0)->vcgn_uii;
+ int disposition = CNAT_V4_TCP_O2I_T;
+ int counter = CNAT_V4_TCP_O2I_T_PKT;
+
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ tcp_hdr_type *tcp = (tcp_hdr_type *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_tcp_outside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_session_entry_t *session_db = NULL;
+ cnat_main_db_entry_t *db = NULL;
+ cnat_key_t dest_info;
+
+ INCREMENT_NODE_COUNTER(CNAT_V4_TCP_O2I_R_PKT);
+
+ if (PREDICT_FALSE(db_index == EMPTY)) {
+ nat44_dslite_common_stats[0].no_translation_entry_drops ++;
+ counter = CNAT_V4_TCP_O2I_LOOKUP_FAILED;
+ disposition = CNAT_V4_TCP_O2I_D;
+ } else {
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ if (PREDICT_FALSE(ip->ttl <= 1)) {
+ /* Try to generate ICMP error msg, as TTL is <= 1 */
+ if (icmpv4_generate_with_throttling(ctx,
+ ip, ctx->ru.rx.uidb_index)) {
+ /* Generated ICMP */
+                disposition = CNAT_V4_TCP_O2I_T; //CNAT_REWRITE_OUTPUT;
+ counter = CNAT_V4_TCP_O2I_TTL_GEN;
+ } else {
+                /* Could not generate ICMP - drop the packet */
+ disposition = CNAT_V4_TCP_O2I_D;
+ counter = CNAT_V4_TCP_O2I_TTL_DROP;
+ }
+ goto drop_pkt;
+ }
+ }
+ db = cnat_main_db + db_index;
+#if 0
+ window = db->diff_window;
+ stored_seq_no = db->proto_data.tcp_seq_chk.seq_no;
+ stored_ack_no = db->proto_data.tcp_seq_chk.ack_no;
+ vrf_map_p = cnat_map_by_vrf + db->vrfmap_index;
+ vrf_index = (db->in2out_key.k.vrf & CNAT_VRF_MASK);
+#endif
+ /* For Out2In packet, the dest info is src address and port */
+ dest_info.k.port = clib_net_to_host_u16(tcp->src_port);
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->src_addr);
+
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ goto update_pkt;
+ }
+
+
+ if(PREDICT_FALSE(db->dst_ipv4 != dest_info.k.ipv4 ||
+ db->dst_port != dest_info.k.port)) {
+
+ if(PREDICT_TRUE(db->nsessions == 0)) {
+ /* Should be a static entry
+ * Note this session as the first session and log
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ //goto packet_upd;
+ } else if(PREDICT_FALSE(db->nsessions == 1)) {
+ /* Destn is not same as in main db. Multiple session
+ * scenario
+ */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_TCP_O2I_D;
+ counter = CNAT_V4_TCP_O2I_SESSION_DROP;
+ goto drop_pkt;
+ }
+ } else { /* There are already multiple destinations */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ /* If session already exists,
+ * cnat_create_session_db_entry will return the existing db
+ * else create a new db
+ * If could not create, return NULL
+ */
+ session_db = cnat_create_session_db_entry(&dest_info, db, TRUE);
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_TCP_O2I_D;
+ counter = CNAT_V4_TCP_O2I_SESSION_DROP;
+ goto drop_pkt;
+ }
+ }
+ /* useful for ALG only */
+ #if 0
+ if(PREDICT_TRUE(session_db)) {
+ stored_seq_no = session_db->tcp_seq_num;
+ stored_ack_no = session_db->ack_no;
+ window = session_db->window;
+ }
+ #endif
+ }
+
+
+update_pkt:
+
+ counter = CNAT_V4_TCP_O2I_T_PKT;
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ /* update ip checksum, newchecksum = ~(~oldchecksum + ~old + new) */
+ cnat_v4_recalculate_tcp_checksum(ip, tcp,
+ &(ip->dest_addr),
+ &(tcp->dest_port),
+ db->in2out_key.k.ipv4,
+ db->in2out_key.k.port);
+
+ /* CNAT_PPTP_ALG_SUPPORT */
+ db->out2in_pkts++;
+
+        nat44_dslite_global_stats[0].out2in_forwarding_count++;
+
+
+ if(PREDICT_FALSE(session_db != NULL)) {
+ V4_TCP_UPDATE_SESSION_DB_FLAG(session_db, tcp);
+ CNAT_DB_TIMEOUT_RST(session_db);
+ } else {
+ V4_TCP_UPDATE_SESSION_FLAG(db, tcp);
+ CNAT_DB_TIMEOUT_RST(db);
+ }
+
+ }
+
+drop_pkt:
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_tcp_outside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_tcp_outside_input_node) = {
+ .function = cnat_ipv4_tcp_outside_input_node_fn,
+ .name = "vcgn-v4-tcp-o2i",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_tcp_outside_input_error_strings),
+ .error_strings = cnat_ipv4_tcp_outside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_TCP_O2I_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ //[CNAT_V4_TCP_O2I_E] = "vcgn-v4-tcp-o2i-e",
+ [CNAT_V4_TCP_O2I_T] = "ip4-input",
+ [CNAT_V4_TCP_O2I_D] = "error-drop",
+ },
+};
+
+clib_error_t *cnat_ipv4_tcp_outside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_tcp_outside_input_main_t * mp = &cnat_ipv4_tcp_outside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_tcp_outside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_udp.h b/vnet/vnet/vcgn/cnat_ipv4_udp.h
new file mode 100644
index 00000000000..f6c5b5e0133
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_udp.h
@@ -0,0 +1,41 @@
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * Filename: cnat_ipv4_udp.h
+ *
+ * Description: common functions for udp node
+ *
+ * Assumptions and Constraints:
+ *
+ * Copyright (c) 2000-2009 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *-----------------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_IPV4_UDP_H__
+#define __CNAT_IPV4_UDP_H__
+
+#include "tcp_header_definitions.h"
+#include "cnat_db.h"
+#include "cnat_v4_functions.h"
+#include "cnat_global.h"
+#include "cnat_config.h"
+
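+/*
+ * swap_ip_src_udp_port rewrites the source IP/port on the in2out path;
+ * swap_ip_dst_udp_port rewrites the destination IP/port on the out2in
+ * path. Both update the IPv4 and UDP checksums incrementally rather than
+ * recomputing them from scratch.
+ */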
+inline void swap_ip_src_udp_port(ipv4_header *ip,
+ udp_hdr_type_t *udp,
+ cnat_main_db_entry_t *db);
+inline void swap_ip_dst_udp_port(ipv4_header *ip,
+ udp_hdr_type_t *udp,
+ cnat_main_db_entry_t *db,
+ u16 vrf);
+#endif /* __CNAT_IPV4_UDP_H__ */
diff --git a/vnet/vnet/vcgn/cnat_ipv4_udp_inside_input.c b/vnet/vnet/vcgn/cnat_ipv4_udp_inside_input.c
new file mode 100644
index 00000000000..657c5f1e64e
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_udp_inside_input.c
@@ -0,0 +1,508 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_udp_inside_input.c - cnat_ipv4_udp_inside_input node functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_global.h"
+#include "cnat_db.h"
+#include "cnat_ipv4_udp.h"
+#include "cnat_pcp_server.h"
+
+
+#define foreach_cnat_ipv4_udp_inside_input_error \
+_(CNAT_V4_UDP_I2O_T_PKT, "v4 udp i2o transmit") \
+_(CNAT_V4_UDP_I2O_MISS_PKT, "v4 udp i2o db miss") \
+_(CNAT_V4_UDP_I2O_TTL_GEN, "v4 udp i2o TTL gen") \
+_(CNAT_V4_UDP_I2O_TTL_DROP, "v4 udp i2o TTL drop") \
+_(CNAT_V4_PCP_PKT, "v4 pcp pkt") \
+_(CNAT_V4_UDP_I2O_SESSION_DROP, "v4 udp i2o session drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_udp_inside_input_error
+#undef _
+ CNAT_IPV4_UDP_INSIDE_INPUT_N_ERROR,
+} cnat_ipv4_udp_inside_input_t;
+
+static char * cnat_ipv4_udp_inside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_udp_inside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_udp_inside_input_main_t;
+
+typedef enum {
+ CNAT_V4_I2O_FIXME,
+ CNAT_V4_UDP_I2O_E,
+ CNAT_REWRITE_OUTPUT,
+ CNAT_V4_UDP_I2O_T = CNAT_REWRITE_OUTPUT,
+ CNAT_N_NEXT,
+} cnat_ipv4_udp_inside_input_next_t;
+
+#define CNAT_V4_UDP_I2O_D CNAT_V4_I2O_FIXME
+#define CNAT_V4_PCP_T CNAT_V4_I2O_FIXME
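+/*
+ * The drop and PCP-transmit dispositions are temporarily aliased to the
+ * FIXME slot, which is wired to "error-drop" in the node registration
+ * below, presumably until dedicated next nodes are ported.
+ */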
+
+cnat_ipv4_udp_inside_input_main_t cnat_ipv4_udp_inside_input_main;
+vlib_node_registration_t cnat_ipv4_udp_inside_input_node;
+
+#define NSTAGES 6
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+#ifndef TOBE_PORTED
+static inline u32
+is_pcp_pkt(u32 addr, u16 port)
+{
+ return CNAT_NO_CONFIG;
+}
+#else
+static inline u32
+is_pcp_pkt(spp_ctx_t *ctx, u32 addr, u16 port)
+{
+ cnat_vrfmap_t *my_vrfmap = NULL;
+ u16 my_vrfmap_index;
+
+ my_vrfmap_index = vrf_map_array[ctx->ru.rx.uidb_index];
+
+ if (PREDICT_TRUE(my_vrfmap_index != VRF_MAP_ENTRY_EMPTY)) {
+
+ my_vrfmap = cnat_map_by_vrf + my_vrfmap_index;
+
+ if (PREDICT_FALSE( port == my_vrfmap->pcp_server_port)) {
+ if(PREDICT_TRUE(addr == my_vrfmap->pcp_server_addr)) {
+ return CNAT_SUCCESS;
+ }
+ }
+ }
+
+ return CNAT_NO_CONFIG;
+}
+#endif
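+
+/*
+ * Note: in the non-TOBE_PORTED build above, is_pcp_pkt() always returns
+ * CNAT_NO_CONFIG, so the PCP fast path in last_stage below is effectively
+ * disabled until the vrfmap-based check is ported.
+ */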
+
+inline void swap_ip_src_udp_port(ipv4_header *ip,
+ udp_hdr_type_t *udp,
+ cnat_main_db_entry_t *db)
+{
+ /*
+     * declare variables
+ */
+ CNAT_UPDATE_L3_L4_CHECKSUM_DECLARE
+ /*
+ * calculate checksum
+ */
+ CNAT_UPDATE_L3_L4_CHECKSUM(((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)),
+ (db->in2out_key.k.port),
+ (clib_net_to_host_u16(ip->checksum)),
+ (clib_net_to_host_u16(udp->udp_checksum)),
+ ((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ (db->out2in_key.k.port))
+
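+    /*
+     * The macro pair above presumably implements an RFC 1624 style
+     * incremental checksum update, roughly
+     *     new_csum = ~(~old_csum + ~old_field + new_field)
+     * folded 16 bits at a time over the changed address and port words,
+     * leaving the results in new_l3_c (IPv4) and new_l4_c (UDP) used below.
+     */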
+/* #define UDP_PACKET_DEBUG 1 */
+
+// Temporary debugs which will be suppressed later
+#ifdef UDP_PACKET_DEBUG
+ if (PREDICT_FALSE(udp_inside_packet_dump_enable)) {
+ printf("\nIn2Out UDP packet before translation");
+ print_udp_pkt(ip);
+ }
+#endif
+
+ //set ip header
+ ip->src_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+ u16 frag_offset =
+ clib_net_to_host_u16(ip->frag_flags_offset);
+
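+    /*
+     * Non-initial fragments carry no UDP header, so only the IP-level
+     * fields are rewritten for them.
+     */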
+ if(PREDICT_FALSE(frag_offset & IP_FRAG_OFFSET_MASK)) {
+ return; /* No need to update UDP fields */
+ }
+ //set udp header
+ udp->src_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+
+ /*
+ * No easy way to avoid this if check except by using
+ * complex logic - may not be worth it.
+ */
+ if (PREDICT_TRUE(udp->udp_checksum)) {
+ udp->udp_checksum =
+ clib_host_to_net_u16(new_l4_c);
+ }
+
+// Temporary debugs which will be suppressed later
+#ifdef UDP_PACKET_DEBUG
+ if (PREDICT_FALSE(udp_inside_checksum_disable)) {
+ printf("\nIn2Out UDP checksum 0x%x disabled by force", new_l4_c);
+ udp->udp_checksum = 0;
+ }
+ if (PREDICT_FALSE(udp_inside_packet_dump_enable)) {
+ printf("\nIn2Out UDP packet after translation");
+ print_udp_pkt(ip);
+ }
+#endif
+}
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8*)ip + ipv4_hdr_len);
+
+ u64 tmp = 0;
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(ip->src_addr);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16 (udp->src_port);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_UDP)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_in2out_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{ /* nothing */ }
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_in2out_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, LOAD);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+ }
+}
+
+static inline void
+stage4(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->in2out_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->in2out_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+static u64 pkt_num = 0;
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ spp_ctx_t *ctx = (spp_ctx_t *) &vnet_buffer(b0)->vcgn_uii;
+ int disposition = CNAT_V4_UDP_I2O_T;
+ int counter = CNAT_V4_UDP_I2O_T_PKT;
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_udp_inside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_session_entry_t *session_db = NULL;
+ cnat_key_t dest_info;
+
+ pkt_num++;
+
+ if(PREDICT_FALSE(is_pcp_pkt(ip->dest_addr, udp->dest_port) ==
+ CNAT_SUCCESS))
+ {
+ PCP_INCR(input);
+ disposition = CNAT_V4_PCP_T;
+ counter = CNAT_V4_PCP_PKT;
+
+ goto pcp_pkt;
+ }
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ if (PREDICT_FALSE(ip->ttl <= 1)) {
+ /* Try to generate ICMP error msg, as TTL is <= 1 */
+
+ if (icmpv4_generate_with_throttling
+ (ctx, ip, ctx->ru.rx.uidb_index)) {
+ /* Generated ICMP */
+ disposition = CNAT_REWRITE_OUTPUT;
+ counter = CNAT_V4_UDP_I2O_TTL_GEN;
+ } else {
+                /* Could not generate ICMP - drop the packet */
+ disposition = CNAT_V4_UDP_I2O_D;
+ counter = CNAT_V4_UDP_I2O_TTL_DROP;
+ }
+ goto drop_pkt;
+ }
+ }
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ cnat_main_db_entry_t *db = cnat_main_db + db_index;
+
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->dest_addr);
+
+        /* MUST revisit: the frag flag seems to be set to 1 for a few packets,
+         * so the port is not updated and becomes 0. Temporarily commenting out
+         * this fragment check and setting the dst port from the UDP dest port */
+        dest_info.k.port = clib_net_to_host_u16(udp->dest_port);
+        #if 0 // DO NOT REMOVE THIS #if 0
+ if(PREDICT_FALSE(ctx->ru.rx.frag)) {
+#ifdef TOBE_PORTED
+ /* Must have routed through cnat_v4_frag_in2out node */
+ u16 *feature_data_ports = (u16 *)&ctx->feature_data[4];
+ dest_info.k.port = *feature_data_ports;
+#endif
+ } else {
+ dest_info.k.port = clib_net_to_host_u16(udp->dest_port);
+ }
+ #endif
+
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ CNAT_DB_TIMEOUT_RST(db);
+ goto update_pkt;
+ }
+
+ if(PREDICT_TRUE((db->dst_ipv4 == dest_info.k.ipv4) &&
+ (db->dst_port == dest_info.k.port))) {
+
+ CNAT_DB_TIMEOUT_RST(db);
+ goto update_pkt;
+ } else {
+ if (PREDICT_FALSE(db->nsessions == 0)) {
+ /* Should be a static entry
+ * Note this session as the first session and log
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ /*
+ * update db counter, timer
+ */
+
+ CNAT_DB_TIMEOUT_RST(db);
+
+ } else if(PREDICT_TRUE(db->nsessions == 1)) {
+ /* Destn is not same as in main db. Multiple session
+ * scenario
+ */
+ //printf(">>> [pkt# %lu] src_ip: 0x%x, db ip: 0x%x, db port: %u; dest ip: 0x%x, dest port: %u\n",
+ // pkt_num, ntohl(ip->src_addr), db->dst_ipv4, db->dst_port, dest_info.k.ipv4, dest_info.k.port);
+
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+
+ if(PREDICT_TRUE(session_db != NULL)) {
+ /* session exists */
+ CNAT_DB_TIMEOUT_RST(session_db);
+ } else {
+ /* could not create session db - drop packet */
+ disposition = CNAT_V4_UDP_I2O_D;
+ counter = CNAT_V4_UDP_I2O_SESSION_DROP;
+ goto drop_pkt;
+ }
+
+ } else {
+            /* Two or more sessions already exist */
+
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+
+ /* If session already exists,
+ * cnat_create_session_db_entry will return the existing db
+ * else create a new db
+ * If could not create, return NULL
+ */
+ session_db = cnat_create_session_db_entry(&dest_info,
+ db, TRUE);
+
+                if(PREDICT_TRUE(session_db != NULL)) {
+ /* session exists */
+ CNAT_DB_TIMEOUT_RST(session_db);
+ } else {
+ /* could not create session db - drop packet */
+ disposition = CNAT_V4_UDP_I2O_D;
+ counter = CNAT_V4_UDP_I2O_SESSION_DROP;
+ goto drop_pkt;
+ }
+ }
+ }
+
+update_pkt:
+ /*
+ * 1. update src ipv4 addr and src udp port
+ * 2. update ipv4 checksum and udp checksum
+ */
+ swap_ip_src_udp_port(ip, udp, db);
+ /*
+ * update db counter, timer
+ */
+
+ db->in2out_pkts++;
+
+ /*
+ * need to set outside vrf
+ * from db->out2in_key.k.vrf
+ */
+
+ /* Temporarily keeping this commented */
+ //PLATFORM_CNAT_SET_TX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_TX],
+ // db->out2in_key.k.vrf)
+
+ in2out_forwarding_count++;
+
+ } else {
+ disposition = CNAT_V4_UDP_I2O_E;
+ counter = CNAT_V4_UDP_I2O_MISS_PKT;
+ }
+
+drop_pkt:
+pcp_pkt:
+
+ em->counters[node_counter_base_index + counter] += 1;
+
+ return disposition;
+}
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_udp_inside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_udp_inside_input_node) = {
+ .function = cnat_ipv4_udp_inside_input_node_fn,
+ .name = "vcgn-v4-udp-i2o",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_udp_inside_input_error_strings),
+ .error_strings = cnat_ipv4_udp_inside_input_error_strings,
+
+ .n_next_nodes = CNAT_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_I2O_FIXME] = "error-drop",
+ // [CNAT_V4_UDP_I2O_T] = "ip4-input",
+ [CNAT_V4_UDP_I2O_E] = "vcgn-v4-udp-i2o-e",
+ [CNAT_REWRITE_OUTPUT] = "ip4-input",
+ },
+};
+
+clib_error_t *cnat_ipv4_udp_inside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_udp_inside_input_main_t * mp = &cnat_ipv4_udp_inside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_udp_inside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_ipv4_udp_inside_input_exceptions.c b/vnet/vnet/vcgn/cnat_ipv4_udp_inside_input_exceptions.c
new file mode 100644
index 00000000000..f078c8d4391
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_udp_inside_input_exceptions.c
@@ -0,0 +1,283 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_udp_inside_input_exceptions.c - cnat_ipv4_udp_inside_input_exceptions node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_global.h"
+#include "cnat_db.h"
+#include "cnat_ipv4_udp.h"
+
+/*
+ * Dump these counters via the "show error" CLI command
+ */
+
+#define foreach_cnat_ipv4_udp_inside_input_exc_error \
+_(CNAT_V4_UDP_I2O_T_PKT, "v4 udp i2o transmit") \
+_(CNAT_V4_UDP_I2O_D_PKT, "v4 udp i2o drop") \
+_(CNAT_V4_ICMP_G_I2O_T_PKT, "v4 udp i2o icmp msg gen") \
+_(CNAT_V4_UDP_I2O_DC_PKT, "v4 udp i2o (no config) drop") \
+_(CNAT_V4_UDP_I2O_DR_PKT, "v4 udp i2o (not in run state) drop") \
+_(CNAT_V4_UDP_I2O_DD_PKT, "v4 udp i2o (no direct port) drop") \
+_(CNAT_V4_UDP_I2O_DA_PKT, "v4 udp i2o (no any port) drop") \
+_(CNAT_V4_UDP_I2O_DO_PKT, "v4 udp i2o (out of port limit) drop") \
+_(CNAT_V4_UDP_I2O_DI_PKT, "v4 udp i2o (invalid packet) drop") \
+_(CNAT_V4_UDP_I2O_DS_PKT, "v4 udp i2o (no session db) drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_udp_inside_input_exc_error
+#undef _
+ CNAT_IPV4_UDP_INSIDE_INPUT_EXCEPTIONS_N_ERROR,
+} cnat_ipv4_udp_inside_input_exc_error_t;
+
+
+static char * cnat_ipv4_udp_inside_input_exc_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_udp_inside_input_exc_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_udp_inside_input_exc_main_t;
+
+typedef enum {
+ CNAT_V4_UDP_I2O_T,
+ CNAT_V4_UDP_I2O_D,
+ CNAT_V4_ICMP_G_I2O_T = CNAT_V4_UDP_I2O_D, /* TOBE_PORTED */
+ CNAT_V4_UDP_INSIDE_INPUT_EXC_N_NEXT,
+} cnat_ipv4_udp_inside_input_exc_next_t;
+
+cnat_ipv4_udp_inside_input_exc_main_t cnat_ipv4_udp_inside_input_exc_main;
+vlib_node_registration_t cnat_ipv4_udp_inside_input_exc_node;
+
+#define NSTAGES 2
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ vlib_node_t *n =
+ vlib_get_node (vm, cnat_ipv4_udp_inside_input_exc_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+
+ cnat_gen_icmp_info info;
+ cnat_db_key_bucket_t ki;
+ spp_ctx_t *ctx __attribute__((unused))
+ = (spp_ctx_t *) &vnet_buffer(b0)->vcgn_uii;
+ cnat_main_db_entry_t *db = NULL;
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8*)ip + ipv4_hdr_len);
+ int disposition = CNAT_V4_UDP_I2O_T;
+ int counter = CNAT_V4_UDP_I2O_T_PKT;
+
+ cnat_key_t dest_info;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_UDP)
+
+ vnet_buffer(b0)->vcgn_uii.key.k.ipv4 = clib_net_to_host_u32(ip->src_addr);
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ ki.k.k.vrf, CNAT_UDP)
+
+ ki.k.k.ipv4 = clib_net_to_host_u32(ip->src_addr);
+
+
+    /* MUST REVISIT: commenting out the frag check; the destination port is
+     * updated unconditionally. DO NOT remove this #if 0 */
+ ki.k.k.port =
+ clib_net_to_host_u16(udp->src_port);
+ dest_info.k.port =
+ clib_net_to_host_u16(udp->dest_port);
+#if 0
+ if(PREDICT_FALSE(ctx->ru.rx.frag)) {
+#ifdef TOBE_PORTED
+ /* Must have routed through cnat_v4_frag_in2out node */
+ u16 *feature_data_ports = (u16 *)&ctx->feature_data[2];
+ ki.k.k.port = *feature_data_ports;
+ feature_data_ports++;
+ dest_info.k.port = *feature_data_ports;
+#endif
+ } else {
+ ki.k.k.port =
+ clib_net_to_host_u16(udp->src_port);
+ dest_info.k.port =
+ clib_net_to_host_u16(udp->dest_port);
+ }
+#endif /* if 0 */
+
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->dest_addr);
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ dest_info.k.vrf, CNAT_UDP)
+
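+    /*
+     * An odd source port requests an odd outside port and an even source
+     * port an even one, presumably to preserve port parity across the
+     * translation (e.g. for RTP/RTCP port pairs).
+     */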
+ if (PREDICT_TRUE(ki.k.k.port)) {
+ if (ki.k.k.port & 0x1) {
+ db = cnat_get_main_db_entry_v2(&ki, PORT_S_ODD, PORT_TYPE_DYNAMIC,
+ &info, &dest_info);
+ } else {
+ db = cnat_get_main_db_entry_v2(&ki, PORT_S_EVEN, PORT_TYPE_DYNAMIC,
+ &info, &dest_info);
+ }
+ } else {
+        /*
+         * A UDP source port of 0 is invalid - drop the packet
+         */
+ db = NULL;
+ info.error = CNAT_ERR_BAD_TCP_UDP_PORT;
+ }
+
+ if (PREDICT_TRUE((u64)db)) {
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+        /*
+         * Step 6: NAT the packet before forwarding it
+         */
+ swap_ip_src_udp_port(ip, udp, db);
+ /*
+ * update db for this pkt
+ */
+ CNAT_DB_UPDATE_IN2OUT_TIMER
+
+ /* Check timeout db if there is config for this */
+ (void) query_and_update_db_timeout((void *)db, MAIN_DB_TYPE);
+
+/* Temporarily keeping it commented */
+ //PLATFORM_CNAT_SET_TX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_TX],
+ // db->out2in_key.k.vrf)
+ in2out_forwarding_count++;
+
+ } else {
+ switch (info.error) {
+ case (CNAT_NO_VRF_RUN):
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DR_PKT] += 1;
+ break;
+ case (CNAT_OUT_LIMIT):
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DO_PKT] += 1;
+ break;
+ case (CNAT_NO_PORT_ANY):
+ case (CNAT_NO_POOL_ANY):
+ case (CNAT_BAD_INUSE_ANY):
+ case (CNAT_NOT_FOUND_ANY):
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DA_PKT] += 1;
+ break;
+ case (CNAT_INV_PORT_DIRECT):
+ case (CNAT_DEL_PORT_DIRECT):
+ case (CNAT_BAD_INUSE_DIRECT):
+ case (CNAT_NOT_FOUND_DIRECT):
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DD_PKT] += 1;
+ break;
+ case (CNAT_ERR_BAD_TCP_UDP_PORT):
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DI_PKT] += 1;
+ break;
+ case (CNAT_ERR_NO_SESSION_DB):
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DS_PKT] += 1;
+ break;
+ default:
+ em->counters[node_counter_base_index + CNAT_V4_UDP_I2O_DC_PKT] += 1;
+ break;
+ }
+ /*
+ * send to icmp msg generate node
+ */
+ if (info.gen_icmp_msg == CNAT_ICMP_MSG) {
+#ifdef TOBE_PORTED
+ u32 *fd = (u32*)ctx->feature_data;
+ fd[0] = info.svi_addr;
+ fd[1] = CNAT_ICMP_DEST_UNREACHABLE;
+#endif
+ disposition = CNAT_V4_ICMP_G_I2O_T;
+ counter = CNAT_V4_ICMP_G_I2O_T_PKT;
+ } else {
+ disposition = CNAT_V4_UDP_I2O_D;
+ counter = CNAT_V4_UDP_I2O_D_PKT;
+ }
+ DEBUG_I2O_DROP(CNAT_DEBUG_DROP_UDP)
+ }
+
+ em->counters[node_counter_base_index + counter] += 1;
+
+ return disposition;
+}
+
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_udp_inside_input_exc_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+VLIB_REGISTER_NODE (cnat_ipv4_udp_inside_input_exc_node) = {
+ .function = cnat_ipv4_udp_inside_input_exc_node_fn,
+ .name = "vcgn-v4-udp-i2o-e",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_udp_inside_input_exc_error_strings),
+ .error_strings = cnat_ipv4_udp_inside_input_exc_error_strings,
+
+ .n_next_nodes = CNAT_V4_UDP_INSIDE_INPUT_EXC_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [CNAT_V4_UDP_I2O_T] = "ip4-input",
+ [CNAT_V4_UDP_I2O_D] = "error-drop",
+ },
+};
+
+
+clib_error_t *cnat_ipv4_udp_inside_input_exc_init (vlib_main_t *vm)
+{
+ cnat_ipv4_udp_inside_input_exc_main_t * mp = &cnat_ipv4_udp_inside_input_exc_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_udp_inside_input_exc_init);
+
diff --git a/vnet/vnet/vcgn/cnat_ipv4_udp_outside_input.c b/vnet/vnet/vcgn/cnat_ipv4_udp_outside_input.c
new file mode 100644
index 00000000000..203568c8e0d
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ipv4_udp_outside_input.c
@@ -0,0 +1,605 @@
+
+/*
+ *---------------------------------------------------------------------------
+ * cnat_ipv4_udp_outside_input.c - cnat_ipv4_udp_outside_input node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_ipv4_udp.h"
+#include "dslite_db.h"
+#include "cnat_db.h"
+#include "cnat_v4_functions.h"
+
+//#include <dslite_v6_functions.h>
+//#include <pool.h>
+//#include "cnat_va_db.h"
+
+#define foreach_cnat_ipv4_udp_outside_input_error \
+_(CNAT_V4_UDP_O2I_T_PKT, "v4 udp o2i transmit") \
+_(CNAT_V4_DSLITE_ENCAP_CTR, "to dslite encap") \
+_(CNAT_V4_UDP_O2I_MISS_PKT, "v4 udp o2i db miss drop") \
+_(CNAT_V4_UDP_O2I_TTL_GEN, "v4 udp o2i TTL gen") \
+_(CNAT_V4_UDP_O2I_TTL_DROP, "v4 udp o2i TTL drop") \
+_(CNAT_V4_UDP_O2I_PTB_GEN, "v4 ptb gen") \
+_(CNAT_V4_UDP_O2I_PTB_DROP, "v4 ptb throttle drop") \
+_(CNAT_V4_UDP_O2I_SESSION_DROP, "v4 udp o2i session drop") \
+_(CNAT_V4_UDP_O2I_FILTER_DROP, "v4 udp o2i drop: end point filtering") \
+_(CNAT_V4_UDP_O2I_SUB_FRAG_NO_DB_DROP, "v4 udp o2i subsequent frag no DB drop") \
+_(CNAT_V4_UDP_O2I_1ST_FRAG_FILTER_DROP, "v4 udp o2i 1st frag filter drop")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_udp_outside_input_error
+#undef _
+ CNAT_IPV4_UDP_OUTSIDE_INPUT_N_ERROR,
+} cnat_ipv4_udp_outside_input_t;
+
+static char * cnat_ipv4_udp_outside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_udp_outside_input_error
+#undef _
+};
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ipv4_udp_outside_input_main_t;
+
+typedef enum {
+ //CNAT_V4_O2I_FIXME,
+ CNAT_V4_UDP_O2I_E,
+ CNAT_V4_UDP_O2I_T,
+ CNAT_V4_UDP_O2I_NEXT,
+} cnat_ipv4_udp_outside_input_next_t;
+
+//#define CNAT_V4_DSLITE_ENCAP CNAT_V4_O2I_FIXME
+//#define CNAT_V4_UDP_O2I_E CNAT_V4_O2I_FIXME
+
+cnat_ipv4_udp_outside_input_main_t cnat_ipv4_udp_outside_input_main;
+vlib_node_registration_t cnat_ipv4_udp_outside_input_node;
+
+#define NSTAGES 6
+
+/*
+ * Use the generic buffer metadata + first line of packet data prefetch
+ * stage function from <vnet/pipeline.h>. This is usually a Good Idea.
+ */
+#define stage0 generic_stage0
+
+
+#if 0
+typedef struct cnat_ipv4_udp_outside_input_pipeline_data_ {
+ //spp_node_main_vector_t *nmv;
+ dslite_common_pipeline_data_t common_data;
+ /* Add additional pipeline stage data here... */
+ u32 bucket;
+#ifdef DSLITE_DEF
+ u32 user_bucket;
+ dslite_v4_to_v6_udp_counter_t *udp_counter;
+ dslite_icmp_gen_counter_t *icmp_gen_counter;
+
+#endif
+ cnat_key_t ki;
+ udp_hdr_type_t *udp;
+ u8 frag_pkt;
+} cnat_ipv4_udp_outside_input_pipeline_data_t;
+
+#endif
+
+#define CNAT_UDP_OUTSIDE_UPDATE_FLAG_TIMER(db,dslite_nat44_inst_id) \
+ if (PREDICT_FALSE(!(db->flags & CNAT_DB_FLAG_UDP_ACTIVE))) { \
+ db->flags |= CNAT_DB_FLAG_UDP_ACTIVE; \
+ CNAT_DB_TIMEOUT_RST(db); \
+ } else if (PREDICT_FALSE(db->flags & CNAT_DB_DSLITE_FLAG)) { \
+ if (PREDICT_TRUE(dslite_table_db_ptr[dslite_nat44_inst_id].mapping_refresh_both_direction)) { \
+ CNAT_DB_TIMEOUT_RST(db); \
+ } \
+ } else if (PREDICT_TRUE(mapping_refresh_both_direction)) { \
+ CNAT_DB_TIMEOUT_RST(db); \
+    }
+
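+/*
+ * The macro above marks an entry UDP-active and resets its timeout on the
+ * first packet; for already-active entries it refreshes the timeout only
+ * when mapping refresh in both directions is configured (per DS-Lite
+ * instance, or via the global flag for plain NAT44).
+ */
+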
+#if 0
+static cnat_ipv4_udp_outside_input_pipeline_data_t pctx_data[SPP_MAXDISPATCH];
+#define EXTRA_PIPELINE_ARGS_PROTO , cnat_ipv4_udp_outside_input_pipeline_data_t *pctx
+#define EXTRA_PIPELINE_ARGS , pctx
+
+#endif
+
+/*inline u32
+is_static_dest_nat_enabled(u16 vrf)
+{
+ if(static_dest_vrf_map_array[vrf] == 1) {
+ return CNAT_SUCCESS;
+ }
+ return CNAT_NO_CONFIG;
+}*/
+
+inline void swap_ip_dst(ipv4_header *ip,
+ cnat_main_db_entry_t *db, u16 vrf)
+{
+
+ CNAT_UPDATE_L3_CHECKSUM_DECLARE
+ /*
+ * calculate checksum
+ */
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ (clib_host_to_net_u16(ip->checksum)),
+ ((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)))
+ //set ip header
+ ip->dest_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+#if 0
+
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ direction = 1;
+ if(cnat_static_dest_db_get_translation(ip->src_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+ old_ip = spp_net_to_host_byte_order_32(&(ip->src_addr));
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(ip->checksum))),
+ ((u16)(old_postmap_ip & 0xFFFF)),
+ ((u16)(old_postmap_ip >> 16)))
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+ ip->src_addr = postmap_ip;
+ }
+ }
+#endif
+}
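+
+/*
+ * swap_ip_dst above is the IP-only rewrite variant (no L4 port update);
+ * the update_pkt path below currently calls the full swap_ip_dst_udp_port
+ * instead (the swap_ip_dst call there is commented out).
+ */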
+
+inline void swap_ip_dst_udp_port(ipv4_header *ip,
+ udp_hdr_type_t *udp,
+ cnat_main_db_entry_t *db, u16 vrf)
+{
+
+#define UDP_PACKET_DEBUG 1
+
+// Temporary debugs which will be suppressed later
+#ifdef UDP_PACKET_DEBUG
+ if (PREDICT_FALSE(udp_outside_packet_dump_enable)) {
+ printf("\nOut2In UDP packet before translation");
+ print_udp_pkt(ip);
+ }
+#endif
+
+#if 0
+ if(is_static_dest_nat_enabled(vrf) == CNAT_SUCCESS) {
+ direction = 1;
+ if(cnat_static_dest_db_get_translation(ip->src_addr, &postmap_ip, vrf, direction) == CNAT_SUCCESS) {
+
+ CNAT_UPDATE_L3_L4_CHECKSUM_DECLARE
+
+ old_ip = spp_net_to_host_byte_order_32(&(ip->src_addr));
+ old_postmap_ip = spp_net_to_host_byte_order_32(&postmap_ip);
+
+ CNAT_UPDATE_L3_L4_CHECKSUM(((u16)(old_ip & 0xFFFF)),
+ ((u16)(old_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(udp->src_port))),
+ (spp_net_to_host_byte_order_16(&(ip->checksum))),
+ (spp_net_to_host_byte_order_16(&(udp->udp_checksum))),
+ ((u16)(old_postmap_ip & 0xFFFF)),
+ ((u16)(old_postmap_ip >> 16)),
+ (spp_net_to_host_byte_order_16(&(udp->src_port))))
+
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+ ip->src_addr = postmap_ip;
+ if (PREDICT_TRUE(udp->udp_checksum)) {
+ udp->udp_checksum = clib_host_to_net_u16(new_l4_c);
+ }
+ }
+ }
+#endif
+ /*
+ * declare variable
+ */
+ CNAT_UPDATE_L3_L4_CHECKSUM_DECLARE
+ /*
+ * calculate checksum
+ */
+ CNAT_UPDATE_L3_L4_CHECKSUM(((u16)(db->out2in_key.k.ipv4)),
+ ((u16)(db->out2in_key.k.ipv4 >> 16)),
+ (db->out2in_key.k.port),
+ (clib_net_to_host_u16(ip->checksum)),
+ (clib_net_to_host_u16(udp->udp_checksum)),
+ ((u16)(db->in2out_key.k.ipv4)),
+ ((u16)(db->in2out_key.k.ipv4 >> 16)),
+ (db->in2out_key.k.port))
+
+
+
+
+ //set ip header
+ ip->dest_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ ip->checksum =
+ clib_host_to_net_u16(new_l3_c);
+
+ //set udp header
+ udp->dest_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ /*
+ * No easy way to avoid this if check except by using
+ * complex logic - may not be worth it.
+ */
+ if (PREDICT_TRUE(udp->udp_checksum)) {
+ udp->udp_checksum = clib_host_to_net_u16(new_l4_c);
+ }
+
+
+
+// Temporary debugs which will be suppressed later
+#ifdef UDP_PACKET_DEBUG
+ if (PREDICT_FALSE(udp_outside_checksum_disable)) {
+ printf("\nOut2In UDP checksum 0x%x disabled by force", new_l4_c);
+ udp->udp_checksum = 0;
+ }
+ if (PREDICT_FALSE(udp_outside_packet_dump_enable)) {
+ printf("\nOut2In UDP packet after translation");
+ print_udp_pkt(ip);
+ }
+#endif
+}
+
+static inline void
+stage1(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ u64 a, b, c;
+ u32 bucket;
+ u8 *prefetch_target;
+
+ vlib_buffer_t * b0 = vlib_get_buffer (vm, buffer_index);
+ ipv4_header *ip = vlib_buffer_get_current (b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8*)ip + ipv4_hdr_len);
+
+ u64 tmp = 0;
+ tmp = vnet_buffer(b0)->vcgn_uii.key.k.ipv4 =
+ clib_net_to_host_u32(ip->dest_addr);
+ vnet_buffer(b0)->vcgn_uii.key.k.port =
+ clib_net_to_host_u16 (udp->dest_port);
+
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.port) << 32;
+
+ PLATFORM_CNAT_SET_RX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_RX],
+ vnet_buffer(b0)->vcgn_uii.key.k.vrf,
+ CNAT_UDP)
+ tmp |= ((u64)vnet_buffer(b0)->vcgn_uii.key.k.vrf) << 48;
+
+ CNAT_V4_GET_HASH(tmp, bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_out2in_hash[bucket]);
+ vnet_buffer(b0)->vcgn_uii.bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ CLIB_PREFETCH(prefetch_target, CLIB_CACHE_LINE_BYTES, LOAD);
+}
+
+static inline void
+stage2(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{ /* nothing */ }
+
+#define SPP_LOG2_CACHE_LINE_BYTES 6
+#define SPP_CACHE_LINE_BYTES (1 << SPP_LOG2_CACHE_LINE_BYTES)
+
+static inline void
+stage3(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ uword prefetch_target0, prefetch_target1;
+ u32 bucket = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /* read the hash bucket */
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket
+ = cnat_out2in_hash[bucket].next;
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+ CLIB_PREFETCH((void*)prefetch_target0, CLIB_CACHE_LINE_BYTES, STORE);
+ /* Just beyond DB key #2 */
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+ /* If the targets are in different lines, do the second prefetch */
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+ CLIB_PREFETCH((void *)prefetch_target1, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+ }
+}
+
+static inline void
+stage4(vlib_main_t * vm, vlib_node_runtime_t * node, u32 buffer_index)
+{
+ cnat_main_db_entry_t *db;
+ vlib_buffer_t * b0 = vlib_get_buffer(vm, buffer_index);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->out2in_key.key64 ==
+ vnet_buffer(b0)->vcgn_uii.key.key64)) {
+ break;
+ }
+ db_index = db->out2in_hash.next;
+ } while (db_index != EMPTY);
+
+ /* Stick the answer back into the pipeline context structure */
+ vnet_buffer(b0)->vcgn_uii.bucket = db_index;
+ }
+}
+
+#if 0
+
+ALWAYS_INLINE(
+static inline void
+stage5(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO))
+{
+ spp_ctx_t *ctx = ctxs[index];
+ u32 db_index = pctx[index].bucket;
+ /* for nat44, dslite_id will be 1 */
+ u16 dslite_id = *(pctx[index].common_data.dslite_id_ptr);
+
+ DSLITE_PREFETCH_COUNTER(pctx[index].udp_counter,
+ &dslite_all_counters[dslite_id].v46_udp_counters,
+ dslite_v4_to_v6_udp_counter_t,
+ v4_to_v6_udp_output_count,
+ "V4_TO_V6_UDP")
+
+ DSLITE_PREFETCH_COUNTER(pctx[index].icmp_gen_counter,
+ &dslite_all_counters[dslite_id].dslite_icmp_gen_counters,
+ dslite_icmp_gen_counter_t,
+ v6_icmp_gen_count,
+ "V4_TO_V6_icmp")
+
+if (PREDICT_TRUE(db_index != EMPTY)) {
+ cnat_main_db_entry_t *db = cnat_main_db + db_index;
+
+ u32 user_db_index = db->user_index;
+ DSLITE_PRINTF(1, "UDP o2i, db entry found %u %u %u\n",
+ db_index, user_db_index,
+ db->dslite_nat44_inst_id);
+ uword prefetch_target0 = (uword)(cnat_user_db + user_db_index);
+ SPP_PREFETCH(prefetch_target0, 0, LOAD);
+ pctx[index].user_bucket = user_db_index;
+ DSLITE_PRINTF(1, "UDP: Done with prefetch..\n");
+} else {
+ DSLITE_PRINTF(1, "UDP: Stage 5, db_index empty...\n");
+}
+}
+
+#endif
+
+
+static inline u32 last_stage (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 bi)
+{
+
+ vlib_buffer_t *b0 = vlib_get_buffer (vm, bi);
+ u32 db_index = vnet_buffer(b0)->vcgn_uii.bucket;
+ //spp_ctx_t *ctx = (spp_ctx_t *) &vnet_buffer(b0)->vcgn_uii;
+ int disposition = CNAT_V4_UDP_O2I_T;
+ int counter = CNAT_V4_UDP_O2I_T_PKT;
+ ipv4_header *ip = (ipv4_header *)vlib_buffer_get_current(b0);
+ u8 ipv4_hdr_len = (ip->version_hdr_len_words & 0xf) << 2;
+ udp_hdr_type_t *udp = (udp_hdr_type_t *)((u8*)ip + ipv4_hdr_len);
+ vlib_node_t *n = vlib_get_node (vm, cnat_ipv4_udp_outside_input_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ cnat_session_entry_t *session_db = NULL;
+ cnat_main_db_entry_t *db = NULL;
+ cnat_key_t dest_info;
+ u16 dslite_nat44_inst_id __attribute__((unused)) = 0;
+
+ dest_info.k.port = clib_net_to_host_u16(udp->src_port);
+ dest_info.k.ipv4 = clib_net_to_host_u32(ip->src_addr);
+
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+        /* TTL generation was disabled for NAT44 earlier, but since
+         * DS-Lite has been integrated here, it is now enabled
+         */
+
+ db = cnat_main_db + db_index;
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ CNAT_UDP_OUTSIDE_UPDATE_FLAG_TIMER(db, 0)
+ goto update_pkt;
+ }
+
+
+ if(PREDICT_TRUE((db->dst_ipv4 == dest_info.k.ipv4) &&
+ (db->dst_port == dest_info.k.port))) {
+
+ CNAT_UDP_OUTSIDE_UPDATE_FLAG_TIMER(db, 0)
+ goto update_pkt;
+ } else {
+            /* The session entries belonging to this main DB entry are
+             * searched for one whose destination IP and port match the
+             * source IP and port of the packet being processed
+             */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+
+ if (PREDICT_FALSE(db->nsessions == 0)) {
+ /* Should be a static entry.
+ * Record this as the first session and log it
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ CNAT_UDP_OUTSIDE_UPDATE_FLAG_TIMER(db, 0)
+
+ } else if(PREDICT_TRUE(db->nsessions == 1)) {
+
+ /* Destination differs from the main db entry: multiple
+ * session scenario
+ */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_V4_UDP_O2I_E;
+ counter = CNAT_V4_UDP_O2I_SESSION_DROP;
+ goto drop_pkt;
+ }
+
+ /* update session_db(cur packet) timer */
+ CNAT_UDP_OUTSIDE_UPDATE_FLAG_TIMER(session_db, 0)
+ } else {
+ /* Two or more sessions exist */
+
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+
+ /* If the session already exists,
+ * cnat_create_session_db_entry will return the existing db;
+ * otherwise it creates a new one.
+ * If it cannot create one, it returns NULL
+ */
+ session_db = cnat_create_session_db_entry(&dest_info,
+ db, TRUE);
+
+ if(PREDICT_FALSE(session_db != NULL)) {
+ /* session exists */
+ CNAT_UDP_OUTSIDE_UPDATE_FLAG_TIMER(session_db, 0)
+ } else {
+ /* could not create session db - drop packet */
+ disposition = CNAT_V4_UDP_O2I_E;
+ counter = CNAT_V4_UDP_O2I_SESSION_DROP;
+ goto drop_pkt;
+ }
+ }
+ }
+
+update_pkt:
+
+ /*
+ * 1. update dest ipv4 addr and dest udp port
+ * 2. update ipv4 checksum and udp checksum
+ */
+ //swap_ip_dst(ip, db, db->in2out_key.k.vrf);
+ swap_ip_dst_udp_port(ip, udp, db, db->in2out_key.k.vrf);
+ //DSLITE_PRINTF(1, "Done with swap_ip_dst_udp_port..\n");
+
+ db->out2in_pkts++;
+
+ nat44_dslite_global_stats[0].out2in_forwarding_count++;
+
+ /* #### Temporarily COMMENTED FOR IP ROUTE LOOKUP ISSUE #### */
+
+ //PLATFORM_CNAT_SET_TX_VRF(vnet_buffer(b0)->sw_if_index[VLIB_TX],
+ // db->in2out_key.k.vrf)
+ } else {
+ disposition = CNAT_V4_UDP_O2I_E;
+ counter = CNAT_V4_UDP_O2I_MISS_PKT;
+ /* for NAT44 dslite_id would be 1 */
+ nat44_dslite_common_stats[0].no_translation_entry_drops ++;
+ }
+
+drop_pkt:
+
+ em->counters[node_counter_base_index + counter] += 1;
+ return disposition;
+}
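+
+/*
+ * Disposition summary for last_stage() above (a reading aid derived
+ * from the code, not part of the original source):
+ * - db hit, DBL unsupported or destination unchanged: rewrite the
+ *   header, bump out2in counters, forward via CNAT_V4_UDP_O2I_T
+ *   ("ip4-input")
+ * - db hit, new destination: create/refresh a session entry first;
+ *   on failure drop with CNAT_V4_UDP_O2I_SESSION_DROP
+ * - db miss: drop with CNAT_V4_UDP_O2I_MISS_PKT
+ */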
+
+#include <vnet/pipeline.h>
+
+static uword cnat_ipv4_udp_outside_input_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ return dispatch_pipeline (vm, node, frame);
+}
+
+
+VLIB_REGISTER_NODE (cnat_ipv4_udp_outside_input_node) = {
+ .function = cnat_ipv4_udp_outside_input_node_fn,
+ .name = "vcgn-v4-udp-o2i",
+ .vector_size = sizeof (u32),
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(cnat_ipv4_udp_outside_input_error_strings),
+ .error_strings = cnat_ipv4_udp_outside_input_error_strings,
+
+ .n_next_nodes = CNAT_V4_UDP_O2I_NEXT,
+
+ /* edit / add dispositions here */
+#if 0
+ .next_nodes = {
+ //[CNAT_V4_O2I_FIXME] = "error-drop",
+ //[CNAT_V4_UDP_O2I_E] = "vcgn-v4-udp-o2i-e",
+ [CNAT_V4_UDP_O2I_E] = "vcgn-v4-udp-o2i-e",
+ [CNAT_V4_UDP_O2I_T] = "ip4-input",
+ },
+#endif
+ .next_nodes = {
+ [CNAT_V4_UDP_O2I_E] = "error-drop",
+ [CNAT_V4_UDP_O2I_T] = "ip4-input",
+ },
+
+};
+
+clib_error_t *cnat_ipv4_udp_outside_input_init (vlib_main_t *vm)
+{
+ cnat_ipv4_udp_outside_input_main_t * mp = &cnat_ipv4_udp_outside_input_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (cnat_ipv4_udp_outside_input_init);
diff --git a/vnet/vnet/vcgn/cnat_log_api.h b/vnet/vnet/vcgn/cnat_log_api.h
new file mode 100644
index 00000000000..60cf683697d
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_log_api.h
@@ -0,0 +1,114 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_log_api.h
+ * Declares the common APIs for logging (both syslog and NFV9)
+ * Copyright (c) 2012-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_LOG_API_H__
+#define __CNAT_LOG_API_H__
+
+#include "cnat_logging.h"
+
+static inline void cnat_log_ds_lite_mapping_delete(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ return;
+}
+
+static inline void cnat_log_ds_lite_mapping_create(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ return;
+}
+
+static inline void cnat_log_ds_lite_port_limit_exceeded(
+ dslite_key_t * key,
+ dslite_table_entry_t *dslite_entry_ptr)
+{
+ return;
+}
+
+static inline void cnat_log_nat44_port_limit_exceeded(
+ cnat_key_t * key,
+ cnat_vrfmap_t *vrfmap)
+{
+ return;
+}
+
+static inline void cnat_log_nat44_mapping_create(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ return;
+}
+
+static inline void cnat_log_nat44_mapping_delete(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ return;
+}
+
+/* Session Logging API for nat44 */
+static inline void cnat_session_log_nat44_mapping_create (
+ cnat_main_db_entry_t *db,
+ cnat_session_entry_t *sdb,
+ cnat_vrfmap_t *vrfmap )
+{
+ return;
+}
+
+static inline void cnat_session_log_nat44_mapping_delete (
+ cnat_main_db_entry_t *db,
+ cnat_session_entry_t *sdb,
+ cnat_vrfmap_t *vrfmap )
+{
+ return;
+}
+
+/* Session Logging API for dslite */
+static inline void cnat_session_log_ds_lite_mapping_create (
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t *sdb )
+{
+ return;
+}
+
+static inline void cnat_session_log_ds_lite_mapping_delete (
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t *sdb )
+{
+ return;
+}
+
+#endif /* #ifndef __CNAT_LOG_API_H__ */
+
diff --git a/vnet/vnet/vcgn/cnat_log_common.h b/vnet/vnet/vcgn/cnat_log_common.h
new file mode 100644
index 00000000000..52731bc0028
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_log_common.h
@@ -0,0 +1,79 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_log_common.h
+ * Contains macros and definitions that are common to both syslog and nfv9
+ * Copyright (c) 2011-2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_LOG_COMMON_H__
+#define __CNAT_LOG_COMMON_H__
+
+#include <stdio.h>
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+#include "cnat_db.h"
+#include "nat64_db.h"
+#include "spp_timers.h"
+#include "spp_ctx.h"
+
+/*
+ * This corresponds to the length of the IMETRO SHIM header for RODDICK.
+ * For non-RODDICK cases, an Ethernet header is introduced as well
+ */
+#if 0
+ #if defined(TARGET_RODDICK)
+ #define CNAT_NFV9_SHIM_HDR_OFFSET 8
+ #define CNAT_NFV9_L2_ENCAPS_OFFSET 0
+ #else
+ #define CNAT_NFV9_SHIM_HDR_OFFSET 0
+ #define CNAT_NFV9_L2_ENCAPS_OFFSET 16
+ #endif
+#endif
+
+ #define CNAT_NFV9_IP_HDR_OFFSET 0
+
+ #define CNAT_NFV9_UDP_HDR_OFFSET \
+ (CNAT_NFV9_IP_HDR_OFFSET + sizeof(ipv4_header))
+
+ #define CNAT_NFV9_HDR_OFFSET \
+ (CNAT_NFV9_UDP_HDR_OFFSET + sizeof(udp_hdr_type_t))
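+
+ /*
+ * Worked example of the offsets above (a sketch assuming a 20-byte
+ * ipv4_header with no options and an 8-byte udp_hdr_type_t):
+ *
+ *   CNAT_NFV9_IP_HDR_OFFSET  = 0
+ *   CNAT_NFV9_UDP_HDR_OFFSET = 0 + 20 = 20
+ *   CNAT_NFV9_HDR_OFFSET     = 20 + 8 = 28
+ *
+ * i.e. the NetFlow v9 export header starts 28 bytes into the packet.
+ */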
+
+u32 cnat_get_sys_up_time_in_ms(void);
+u32 cnat_get_unix_time_in_seconds(void);
+void cnat_dump_time_change_logs(void);
+void cnat_handle_sys_time_change (time_t current_unix_time);
+/*
+ * Maximum number of time log changes we maintain
+ */
+
+#define MAX_TIME_CHANGE_LOGS (8)
+
+typedef struct {
+ /*
+ * A timer structure used to periodically send NFv9 & syslog logging
+ * packets that have been waiting to fill up for a long time. This
+ * ensures add/delete events don't get delayed too much before they
+ * are sent to the collector.
+ */
+ spp_timer_t log_timer;
+
+ /*
+ * Whether we have initialized the NFv9 information
+ */
+ u8 cnat_log_init_done;
+} cnat_log_global_info_t;
+
+#endif /* __CNAT_LOG_COMMON_H__ */
diff --git a/vnet/vnet/vcgn/cnat_logging.c b/vnet/vnet/vcgn/cnat_logging.c
new file mode 100644
index 00000000000..eace942c657
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_logging.c
@@ -0,0 +1,3566 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_logging.c
+ *
+ * Copyright (c) 2009-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/ip4.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/udp.h>
+
+
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_v4_functions.h"
+#include "tcp_header_definitions.h"
+#include "cnat_v4_ftp_alg.h"
+#include "cnat_logging.h"
+#include "platform_common.h"
+
+#define CNAT_NFV9_DEBUG_CODE 2
+#if CNAT_NFV9_DEBUG_CODE > 3
+
+#define NFV9_COND if ((my_instance_number != 0) && (my_instance_number != 15))
+
+#define NFV9_DEBUG_PRINTF1(a) NFV9_COND printf(a);
+#define NFV9_DEBUG_PRINTF2(a, b) NFV9_COND printf(a, b);
+#define NFV9_DEBUG_PRINTF3(a, b, c) NFV9_COND printf(a, b, c);
+#define NFV9_DEBUG_PRINTF4(a, b, c, d) NFV9_COND printf(a, b, c, d);
+
+#else
+
+#define NFV9_DEBUG_PRINTF1(a)
+#define NFV9_DEBUG_PRINTF2(a, b)
+#define NFV9_DEBUG_PRINTF3(a, b, c)
+#define NFV9_DEBUG_PRINTF4(a, b, c, d)
+
+#endif
+
+static void cnat_nfv9_insert_ingress_vrfid_name_record(cnat_nfv9_logging_info_t *nfv9_logging_info, u16 index);
+void cnat_nfv9_ingress_vrfid_name_mapping_create(
+ cnat_nfv9_logging_info_t *nfv9_logging_info);
+
+
+cnat_nfv9_global_info_t cnat_nfv9_global_info;
+
+cnat_nfv9_template_t cnat_nfv9_template_info;
+
+#define CNAT_NFV9_OPTION_TEMPLATE cnat_nfv9_template_info.cnat_nfv9_option_template
+
+u16 cnat_template_id[MAX_RECORDS] =
+ {0, CNAT_NFV9_ADD_TEMPLATE_ID, CNAT_NFV9_DEL_TEMPLATE_ID,
+ CNAT_NFV9_NAT64_ADD_BIB_TEMPLATE_ID,CNAT_NFV9_NAT64_DEL_BIB_TEMPLATE_ID,
+ CNAT_NFV9_NAT64_ADD_SESSION_TEMPLATE_ID,
+ CNAT_NFV9_NAT64_DEL_SESSION_TEMPLATE_ID,
+ CNAT_NFV9_DS_LITE_ADD_TEMPLATE_ID,
+ CNAT_NFV9_DS_LITE_DEL_TEMPLATE_ID
+#ifndef NO_BULK_LOGGING
+ , CNAT_NFV9_NAT44_BULK_ADD_TEMPLATE_ID,
+ CNAT_NFV9_NAT44_BULK_DEL_TEMPLATE_ID,
+ CNAT_NFV9_DS_LITE_BULK_ADD_TEMPLATE_ID,
+ CNAT_NFV9_DS_LITE_BULK_DEL_TEMPLATE_ID
+#endif /* #ifndef NO_BULK_LOGGING */
+ , CNAT_NFV9_INGRESS_VRF_ID_NAME_TEMPLATE_ID,
+ CNAT_NFV9_NAT44_ADD_SESSION_TEMPLATE_ID,
+ CNAT_NFV9_NAT44_DEL_SESSION_TEMPLATE_ID,
+ CNAT_NFV9_DS_LITE_ADD_SESSION_TEMPLATE_ID,
+ CNAT_NFV9_DS_LITE_DEL_SESSION_TEMPLATE_ID
+ };
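+
+/*
+ * Note: cnat_template_id[] is indexed by the record-type values used
+ * throughout this file (NAT44_ADD_RECORD, DS_LITE_DEL_RECORD, ...);
+ * slot 0 is deliberately left as 0, which appears to correspond to
+ * RECORD_INVALID.
+ */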
+
+/*
+ * Logging information structures
+ */
+cnat_nfv9_logging_info_t cnat_default_nfv9_logging_info;
+cnat_nfv9_logging_info_t *cnat_nfv9_logging_info_pool;
+#define NFV9_SERVER_POOL_SIZE 16
+nfv9_server_info_t *nfv9_server_info_pool;
+
+u32 nfv9_src_id = 0;
+
+u32
+cnat_get_sys_up_time_in_ms (void)
+{
+ vlib_main_t * vm = vlib_get_main();
+ u32 cnat_curr_time;
+
+ /* Note: vlib_time_now() returns seconds as f64, so this yields
+ * whole seconds rather than milliseconds */
+ cnat_curr_time = (u32)vlib_time_now (vm);
+ return cnat_curr_time;
+}
+
+void
+cnat_dump_time_change_logs (void)
+{
+ return;
+}
+
+inline void cnat_nfv9_handle_sys_time_change(time_t current_unix_time)
+{
+ return;
+ #if 0
+ cnat_handle_sys_time_change(current_unix_time);
+ #endif
+}
+
+void cnat_nfv9_update_sys_time_change()
+{
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info = NULL;
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ my_nfv9_logging_info->server_index;
+ server->last_template_sent_time = 0;
+ }));
+}
+
+void nfv9_params_show(u32 logging_index)
+{
+ cnat_nfv9_logging_info_t *log_info;
+ if(logging_index == EMPTY) {
+ PLATFORM_DEBUG_PRINT("\nNetflow logging not configured\n");
+ return;
+ }
+
+ log_info = cnat_nfv9_logging_info_pool + logging_index;
+ nfv9_server_info_t *server __attribute__((unused))
+ = nfv9_server_info_pool + log_info->server_index;
+
+
+ PLATFORM_DEBUG_PRINT("\nNetflow parameters --\n");
+ PLATFORM_DEBUG_PRINT("Server index %d IPV4 address: %x, port %d, max log size %d\n",
+ log_info->server_index, server->ipv4_address,
+ server->port, log_info->max_length_minus_max_record_size);
+
+ PLATFORM_DEBUG_PRINT("Server ref count %d Refresh rate %d timeout rate %d\n",
+ server->ref_count, server->refresh_rate,
+ server->timeout_rate);
+
+}
+
+/*
+ * Code to dump NFV9 packets before they are sent
+ */
+void
+cnat_nfv9_dump_logging_context (u32 value1,
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ u32 value2)
+{
+ u8 *pkt_ptr;
+ u32 i;
+ u32 next_nfv9_template_data_index = 0xffff;
+ u32 next_data_flow_index = 0xffff;
+ u32 next_data_record = 0xffff;
+ u32 data_record_size = 0;
+ vlib_main_t *vm = vlib_get_main();
+
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_logging_info->server_index;
+
+ vlib_cli_output(vm,"\nDumping %s packet at locn %d: time 0x%x",
+ (value2 == 1) ? "CURRENT" : "QUEUED",
+ value1,
+ cnat_nfv9_get_unix_time_in_seconds());
+
+ vlib_cli_output(vm, "\ni_vrf 0x%x, ip_address 0x%x, port %d",
+ nfv9_logging_info->i_vrf,
+ server->ipv4_address,
+ server->port);
+
+ vlib_cli_output(vm,"\nseq_num %d",
+ server->sequence_num);
+
+ vlib_cli_output(vm,"\nlast_template_sent time 0x%x, pkts_since_last_template %d",
+ server->last_template_sent_time,
+ server->pkts_since_last_template);
+
+ vlib_cli_output(vm, "\npkt_len %d, add_rec_len %d, del_rec_len %d, total_rec_count %d",
+ nfv9_logging_info->pkt_length,
+ nfv9_logging_info->record_length[NAT44_ADD_RECORD],
+ nfv9_logging_info->record_length[NAT44_DEL_RECORD],
+ nfv9_logging_info->total_record_count);
+
+ vlib_cli_output(vm,"\nbulk_add_rec_len %d, bulk_del_rec_len %d",
+ nfv9_logging_info->record_length[NAT44_BULK_ADD_RECORD],
+ nfv9_logging_info->record_length[NAT44_BULK_DEL_RECORD]);
+
+ vlib_cli_output(vm,"\ncurr_logging_ctx 0x%p, timestamp 0x%x, queued_logging_ctx 0x%p",
+ nfv9_logging_info->current_logging_context,
+ nfv9_logging_info->current_logging_context_timestamp,
+ nfv9_logging_info->queued_logging_context);
+
+ vlib_cli_output(vm,"\nnfv9_hdr 0x%p, tmpl_hdr 0x%p, dataflow_hdr 0x%p",
+ nfv9_logging_info->nfv9_header,
+ nfv9_logging_info->nfv9_template_header,
+ nfv9_logging_info->dataflow_header);
+
+ vlib_cli_output(vm,"\nadd_rec 0x%p, del_rec 0x%p, next_data_ptr 0x%p",
+ nfv9_logging_info->record[NAT44_ADD_RECORD],
+ nfv9_logging_info->record[NAT44_DEL_RECORD],
+ nfv9_logging_info->next_data_ptr);
+
+ vlib_cli_output(vm,"\n");
+
+ pkt_ptr = vlib_buffer_get_current(nfv9_logging_info->current_logging_context);
+ /*
+ * Dump along with 8 bytes of SHIM header
+ */
+ for (i = 0; i < (nfv9_logging_info->pkt_length + CNAT_NFV9_IP_HDR_OFFSET);
+ i = i + 1) {
+ u8 c1, c2, c3;
+ if (i == CNAT_NFV9_IP_HDR_OFFSET) {
+ vlib_cli_output(vm,"\nIP_HEADER: \n");
+ } else if (i == CNAT_NFV9_UDP_HDR_OFFSET) {
+ vlib_cli_output(vm,"\nUDP_HEADER: \n");
+ } else if (i == CNAT_NFV9_HDR_OFFSET) {
+ vlib_cli_output(vm,"\nNFV9 Header: Version:Count: \n");
+ } else if (i == (CNAT_NFV9_HDR_OFFSET+4)) {
+ vlib_cli_output(vm,"\nBoot_Up_Time_In_ms: \n");
+ } else if (i == (CNAT_NFV9_HDR_OFFSET+8)) {
+ vlib_cli_output(vm, "\nUNIX_Time: \n");
+ } else if (i == (CNAT_NFV9_HDR_OFFSET+12)) {
+ vlib_cli_output(vm,"\nSeq_Num: \n");
+ } else if (i == (CNAT_NFV9_HDR_OFFSET+16)) {
+ vlib_cli_output(vm,"\nSource ID: \n");
+ } else if (i == (CNAT_NFV9_HDR_OFFSET+20)) {
+ if (nfv9_logging_info->nfv9_template_header) {
+ vlib_cli_output(vm,"\nNFV9 TEMPLATE HDR: \n");
+ next_nfv9_template_data_index = i + 4;
+ } else {
+ next_data_flow_index = i;
+ }
+ } else if (i == (CNAT_NFV9_TEMPLATE_OFFSET+CNAT_NFV9_TEMPLATE_LENGTH)) {
+ if (nfv9_logging_info->nfv9_template_header) {
+ next_data_flow_index = i;
+ }
+ }
+
+ if (i == next_nfv9_template_data_index) {
+ vlib_cli_output(vm,"\nNFV9 TEMPLATE DATA: \n");
+ } else if (i == next_data_flow_index) {
+ if (*(pkt_ptr + i) == 0x01) {
+ if (*(pkt_ptr + i + 1) == 0x00) {
+ data_record_size = 21;
+ next_data_record = i + 4;
+ next_data_flow_index = i + *(pkt_ptr + i + 3) +
+ *(pkt_ptr + i + 2)*0x100;
+ vlib_cli_output(vm,"\nADD_RECORD (total %d): next_data_flow_index (%d->%d)\n", (next_data_flow_index - i), i, next_data_flow_index);
+ } else if (*(pkt_ptr + i + 1) == 0x01) {
+ data_record_size = 11;
+ next_data_record = i + 4;
+ next_data_flow_index = i + *(pkt_ptr + i + 3) +
+ *(pkt_ptr + i + 2)*0x100;
+ vlib_cli_output(vm,"\nDEL_RECORD (total %d) : next_data_flow_index (%d->%d)\n", (next_data_flow_index - i), i, next_data_flow_index);
+ } else if (*(pkt_ptr + i + 1) == 0x09) {
+ data_record_size = 20;
+ next_data_record = i + 4;
+ next_data_flow_index = i + *(pkt_ptr + i + 3) +
+ *(pkt_ptr + i + 2)*0x100;
+ vlib_cli_output(vm,"\nBULK_ADD_RECORD (total %d) : next_data_flow_index (%d->%d)\n", (next_data_flow_index - i), i, next_data_flow_index);
+ } else if (*(pkt_ptr + i + 1) == 0x0a) {
+ data_record_size = 10;
+ next_data_record = i + 4;
+ next_data_flow_index = i + *(pkt_ptr + i + 3) +
+ *(pkt_ptr + i + 2)*0x100;
+ vlib_cli_output(vm,"\nBULK_DEL_RECORD (total %d) : next_data_flow_index (%d->%d)\n", (next_data_flow_index - i), i, next_data_flow_index);
+ }
+
+ }
+ } else if (i == next_data_record) {
+ vlib_cli_output(vm,"\n");
+ next_data_record += data_record_size;
+ }
+
+ c3 = *(pkt_ptr + i);
+
+ c2 = c3 & 0xf;
+ c1 = (c3 >> 4) & 0xf;
+
+
+ vlib_cli_output(vm,"%c%c ",
+ ((c1 <= 9) ? (c1 + '0') : (c1 - 10 + 'a')),
+ ((c2 <= 9) ? (c2 + '0') : (c2 - 10 + 'a')));
+
+ }
+ vlib_cli_output(vm,"\n");
+}
+
+/*
+ * edt: * * cnat_nfv9_pad_added_to_an_addr
+ *
+ * Returns the difference (number of bytes) between new_addr
+ * & org_addr
+ *
+ * Argument: u8 *new_addr, u8 *org_addr
+ * returns the difference
+ */
+inline
+int cnat_nfv9_pad_added_to_an_addr(u8 *new_addr, u8 *org_addr)
+{
+ uword addr1 = (uword) new_addr;
+ uword addr2 = (uword) org_addr;
+ return (addr1 - addr2);
+}
+
+/*
+ * edt: * * cnat_nfv9_add_end_of_record_padding
+ *
+ * Tries to add padding to data_ptr to ensure it is word aligned
+ *
+ * Argument: u8 * data_ptr
+ * pointer to the data pointer
+ */
+inline
+u8 *cnat_nfv9_add_end_of_record_padding (u8 *data_ptr)
+{
+ uword tmp = (uword) data_ptr;
+ uword pad_value = (uword) NFV9_PAD_VALUE;
+
+ tmp = (tmp + pad_value) & (~pad_value);
+
+ return ((u8 *) tmp);
+}
+
+/*
+ * edt: * * cnat_nfv9_pad_end_of_record_length
+ *
+ * Rounds up record_length so that the record ends on a word-aligned
+ * boundary
+ *
+ * Argument: u16 record_length
+ * the record length to pad
+ */
+inline
+u16 cnat_nfv9_pad_end_of_record_length (u16 record_length)
+{
+ u16 pad_value = NFV9_PAD_VALUE;
+
+ return ((record_length + pad_value) & (~pad_value));
+}
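+
+/*
+ * Worked example for the two padding helpers above (a sketch assuming
+ * NFV9_PAD_VALUE == 3, i.e. records are padded to 4-byte alignment):
+ *
+ *   cnat_nfv9_pad_end_of_record_length(21) == (21 + 3) & ~3 == 24
+ *   cnat_nfv9_pad_end_of_record_length(24) == (24 + 3) & ~3 == 24
+ *
+ * cnat_nfv9_add_end_of_record_padding() applies the same rounding to a
+ * pointer, and cnat_nfv9_pad_added_to_an_addr() then reports how many
+ * pad bytes (0..3 under this assumption) were actually inserted.
+ */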
+
+/* get first interface address */
+static ip4_address_t *
+ip4_interface_first_address (ip4_main_t * im, u32 sw_if_index)
+{
+ ip_lookup_main_t * lm = &im->lookup_main;
+ ip_interface_address_t * ia = 0;
+ ip4_address_t * result = 0;
+
+ foreach_ip_interface_address (lm, ia, sw_if_index,
+ 1 /* honor unnumbered */,
+ ({
+ ip4_address_t * a = ip_interface_address_get_address (lm, ia);
+ result = a;
+ break;
+ }));
+ return result;
+}
+
+void fill_ip_n_udp_hdr (u32 ipv4_addr, u16 port,
+ cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+ vlib_buffer_t * b0 = nfv9_logging_info->current_logging_context;
+ ipv4_header *ip_header = vlib_buffer_get_current(b0);
+ udp_hdr_type_t *udp_header = (udp_hdr_type_t *)((u8*)ip_header + sizeof(ipv4_header));
+ vlib_main_t *vm = vlib_get_main();
+ u16 ip_length __attribute__((unused));
+ u16 pkt_len = nfv9_logging_info->pkt_length;
+ ip4_address_t *ia0 = 0;
+ u16 src_port = 0x0a0a;
+
+ /*
+ * Clear the SHIM header fields. The PD nodes will set it
+ * appropriately.
+ */
+ PLATFORM_MEMSET_CNAT_LOG_PKT_DATA
+
+ /*
+ * Don't need a special define for 0x45 - IP version and hdr len
+ */
+ ip_header->version_hdr_len_words = 0x45;
+ ip_header->tos = 0;
+ ip_header->frag_flags_offset = 0;
+ ip_header->ttl = 0xff;
+ ip_header->protocol = UDP_PROT;
+ ip_header->dest_addr = clib_host_to_net_u32(ipv4_addr);
+ ip_length = vlib_buffer_length_in_chain (vm, b0);
+ ip_header->total_len_bytes = clib_host_to_net_u16(pkt_len);
+ ia0 = ip4_interface_first_address(&ip4_main, nfv9_logging_info->i_vrf_id);
+ ip_header->src_addr = ia0->as_u32;
+ udp_header->src_port = clib_host_to_net_u16(src_port);
+ udp_header->dest_port = clib_host_to_net_u16(port);
+ udp_header->udp_checksum = 0;
+ udp_header->udp_length =
+ clib_host_to_net_u16(pkt_len - sizeof(ipv4_header));
+ ip_header->checksum = ip4_header_checksum((ip4_header_t *)ip_header);
+}
+
+/*
+ * edt: * * cnat_nfv9_fill_nfv9_ip_header
+ *
+ * Tries to fill the fields of the IP header before it
+ * is sent to the L3 infra node.
+ *
+ * Argument: cnat_nfv9_logging_info_t *nfv9_logging_info
+ * structure that contains the packet context
+ */
+inline
+void cnat_nfv9_fill_nfv9_ip_header (cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+ u16 new_record_length = 0;
+ u16 orig_record_length = 0;
+ vlib_buffer_t * b0 = nfv9_logging_info->current_logging_context;
+
+ /*
+ * Fill in the IP header and port number of the Netflow collector
+ * The L3 Infra node will fill in the rest of the fields
+ */
+
+ nfv9_logging_info->nfv9_header->count =
+ clib_host_to_net_u16(nfv9_logging_info->total_record_count);
+
+ /*
+ * Pad the last add/del record to ensure multiple of 4 bytes
+ */
+
+ if(nfv9_logging_info->last_record != RECORD_INVALID) {
+
+ orig_record_length =
+ nfv9_logging_info->record_length[nfv9_logging_info->last_record];
+
+ new_record_length = cnat_nfv9_pad_end_of_record_length(
+ orig_record_length);
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(new_record_length);
+ }
+
+ /*
+ * If the record is padded, ensure the padded bytes are ZERO
+ */
+ if (PREDICT_TRUE(new_record_length - orig_record_length)) {
+ /* Note: pointer arithmetic must be on the packet data, not on
+ * the vlib_buffer_t itself */
+ u8 *pkt_ptr = (u8 *) vlib_buffer_get_current(b0) +
+ nfv9_logging_info->pkt_length;
+
+ /*
+ * Blindly zero 3 bytes to avoid a for loop.
+ * We have sufficient padding bytes for safety and we won't
+ * go over buffer limits
+ */
+ *pkt_ptr++ = 0;
+ *pkt_ptr++ = 0;
+ *pkt_ptr++ = 0;
+
+ nfv9_logging_info->pkt_length +=
+ (new_record_length - orig_record_length);
+ }
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_logging_info->server_index;
+ fill_ip_n_udp_hdr(server->ipv4_address,
+ server->port, nfv9_logging_info);
+ /*
+ * It is important to set the sw_if_index for the new buffer create
+ */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
+
+}
+
+/*
+ * edt: * * cnat_nfv9_send_queued_pkt
+ *
+ * Tries to send a logging pkt that was queued earlier
+ * because it could not be sent due to downstream congestion
+ *
+ * Argument: cnat_nfv9_logging_info_t *nfv9_logging_info
+ * structure that contains the packet context
+ */
+inline
+void cnat_nfv9_send_queued_pkt (cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+    /* Currently a no-op stub */
+}
+
+/*
+ * edt: * * cnat_nfv9_send_pkt
+ *
+ * Tries to send a logging pkt. If the packet cannot be sent
+ * because the rewrite_output node cannot process it, queue
+ * it temporarily and try to send it later.
+ *
+ * Argument: cnat_nfv9_logging_info_t *nfv9_logging_info
+ * structure that contains the packet context
+ */
+inline
+void cnat_nfv9_send_pkt (cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+ cnat_nfv9_fill_nfv9_ip_header(nfv9_logging_info);
+
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_logging_info->server_index;
+
+ /* Update the sequence number just before sending,
+ * so that multiple NAT44/NAT64/DSLite instances sharing
+ * a single server instance can stamp the sequence number
+ * in the right order (as seen by the server).
+ */
+ server->sequence_num += 1;
+ nfv9_logging_info->nfv9_header->sequence_num =
+ clib_host_to_net_u32(server->sequence_num);
+
+#if DEBUG
+ cnat_nfv9_dump_logging_context (2, nfv9_logging_info, 1);
+#endif
+#if 0 /* disabled; see send_vpp3_nfv9_pkt below */
+ send_vpp3_nfv9_pkt(nfv9_logging_info);
+#endif
+ nfv9_logging_info->current_logging_context = NULL;
+ /*
+ * Increase last packet sent count
+ */
+ server->pkts_since_last_template++;
+
+ /*
+ * If we are sending an nfv9 template with this packet,
+ * log this timestamp
+ */
+ if (nfv9_logging_info->nfv9_template_header) {
+ server->last_template_sent_time =
+ cnat_nfv9_get_unix_time_in_seconds();
+ server->pkts_since_last_template = 0;
+ }
+
+ return;
+}
+
+/*
+ * send_vpp3_nfv9_pkt: to send multiple b0 in a frame
+ */
+#if 0
+inline void send_vpp3_nfv9_pkt (cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+ vlib_main_t *vm = vlib_get_main();
+ vlib_frame_t *f;
+ vlib_buffer_t *b0;
+ u32 ip4_input_node_index;
+ //u32 * to_next, * from, bi0 =0;
+ u32 bi0 =0;
+ ipv4_header * h0, *ip;
+ static u32 * buffers;
+ u32 nalloc;
+ udp_header_t * udp;
+ u16 udp_length, ip_length;
+
+
+ ip4_input_node_index = nfv9_logging_info->ip4_input_node_index;
+ f = nfv9_logging_info->f;
+ if (f == NULL) {
+ nfv9_logging_info->f = vlib_get_frame_to_node(vm, ip4_input_node_index);
+ f = nfv9_logging_info->f;
+ f->n_vectors = 0;
+ nfv9_logging_info->to_next = vlib_frame_vector_args (f);
+ }
+ /* Build a pkt from whole cloth */
+ b0 = nfv9_logging_info->current_logging_context;
+ //to_next = nfv9_logging_info->to_next;
+ ip = vlib_buffer_get_current (b0);
+ //if (PREDICT_TRUE(f->n_vectors < VLIB_FRAME_SIZE)) {
+ if (PREDICT_TRUE(f->n_vectors < 5)) {
+
+ b0->current_length = clib_net_to_host_u16(ip->total_len_bytes);
+ bi0 = vlib_get_buffer_index (vm, b0);
+ nfv9_logging_info->to_next[0] = bi0;
+ printf("f->n_vec %d f %p to_next %p val %d b0 %p\n",
+ f->n_vectors, f, nfv9_logging_info->to_next,
+ nfv9_logging_info->to_next[0], b0);
+ //to_next++;
+ nfv9_logging_info->to_next++; // = to_next;
+ f->n_vectors++;
+ }
+
+ //if (f->n_vectors == VLIB_FRAME_SIZE)
+ if (f->n_vectors == 5) {
+ printf("sending pkt on 256\n");
+ printf("%s: total_len_bytes %d bi %d nfv9_logging_info->pkt_length %d index %d\n",
+ __func__, clib_net_to_host_u16(ip->total_len_bytes),
+ bi0, nfv9_logging_info->pkt_length, ip4_input_node_index);
+ vlib_put_frame_to_node(vm, ip4_input_node_index, f);
+ nfv9_logging_info->f = NULL;
+ nfv9_logging_info->to_next = NULL;
+ }
+ return;
+}
+#endif
+/*
+ * send_vpp3_nfv9_pkt: to send one b0 in a frame
+ */
+inline void send_vpp3_nfv9_pkt (cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+ vlib_node_t *output_node;
+ vlib_main_t *vm = vlib_get_main();
+ vlib_frame_t *f;
+ vlib_buffer_t *b0;
+ u32 *to_next;
+ u32 bi=0;
+ ipv4_header *ip;
+
+ /* Let's send it to the ip4-lookup node */
+ output_node = vlib_get_node_by_name (vm, (u8 *) "ip4-lookup");
+ f = vlib_get_frame_to_node (vm, output_node->index);
+
+ if ( nfv9_logging_info->current_logging_context != NULL) {
+ /* Build a pkt from whole cloth */
+ b0 = nfv9_logging_info->current_logging_context;
+ ip = vlib_buffer_get_current(b0);
+ to_next = vlib_frame_vector_args (f);
+ bi = vlib_get_buffer_index (vm, b0);
+ to_next[0] = bi;
+
+ f->n_vectors = 1;
+ b0->current_length = clib_net_to_host_u16(ip->total_len_bytes);
+ vlib_put_frame_to_node (vm, output_node->index, f);
+ }
+ return;
+}
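+
+/*
+ * The function above is the stock VPP "inject one buffer into a node"
+ * pattern: look the target node up by name, grab a frame with
+ * vlib_get_frame_to_node(), store the buffer index in the frame's
+ * vector, set n_vectors, and hand the frame over with
+ * vlib_put_frame_to_node().
+ */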
+/*
+ * edt: * * cnat_nfv9_send_pkt_always_success
+ *
+ * Tries to send a logging pkt. This cannot fail due to downstream
+ * congestion because we have already checked that the rewrite_output
+ * node can accept it.
+ *
+ * Argument: cnat_nfv9_logging_info_t *nfv9_logging_info
+ * structure that contains the packet context
+ *
+ * Argument: vlib_node_t *output_node
+ * vlib_node_t structure for rewrite_output node
+ */
+inline
+void cnat_nfv9_send_pkt_always_success (
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ vlib_node_t *output_node)
+{
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_logging_info->server_index;
+ vlib_main_t * vm = vlib_get_main();
+
+ /*
+ * At this point we either have a current or queued logging context
+ */
+ if (PREDICT_TRUE(nfv9_logging_info->current_logging_context != NULL)) {
+ server->sequence_num += 1;
+ nfv9_logging_info->nfv9_header->sequence_num =
+ clib_host_to_net_u32(server->sequence_num);
+ cnat_nfv9_fill_nfv9_ip_header(nfv9_logging_info);
+
+ nfv9_logging_info->current_logging_context->current_length =
+ nfv9_logging_info->pkt_length;
+ vlib_cli_output(vm, "\nNFV9: 3. Sending Current packet\n");
+#if DEBUG
+ cnat_nfv9_dump_logging_context (3, nfv9_logging_info, 1);
+#endif
+ send_vpp3_nfv9_pkt(nfv9_logging_info);
+ nfv9_logging_info->current_logging_context = NULL;
+ } else {
+ /*
+ * For queued logging context, nfv9_header-> count is already set
+ */
+ nfv9_logging_info->queued_logging_context->current_length =
+ nfv9_logging_info->pkt_length;
+ vlib_cli_output(vm,"\nNFV9: 4. Sending Queued packet\n");
+#if DEBUG
+ cnat_nfv9_dump_logging_context (4, nfv9_logging_info, 2);
+#endif
+ send_vpp3_nfv9_pkt(nfv9_logging_info);
+
+ nfv9_logging_info->queued_logging_context = NULL;
+ }
+
+ /*
+ * NF Logging info already deleted, just free it and return
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->deleted)) {
+ pool_put(cnat_nfv9_logging_info_pool, nfv9_logging_info);
+ return;
+ }
+
+ /*
+ * Increase last packet sent count and timestamp
+ */
+ server->pkts_since_last_template++;
+
+ /*
+ * If we are sending an nfv9 template with this packet,
+ * log this timestamp
+ */
+ if (nfv9_logging_info->nfv9_template_header) {
+ server->last_template_sent_time =
+ cnat_nfv9_get_unix_time_in_seconds();
+ server->pkts_since_last_template = 0;
+ }
+}
+
+/*
+ * edt: * * cnat_nfv9_create_logging_context
+ *
+ * Tries to create a logging context with packet buffer
+ * to send a new logging packet
+ *
+ * Argument: cnat_nfv9_logging_info_t *nfv9_logging_info
+ * structure that contains the nfv9 logging info and will store
+ * the packet context as well.
+ */
+inline
+void cnat_nfv9_create_logging_context (
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_nfv9_template_add_flag_t template_flag)
+{
+ vlib_main_t *vm = vlib_get_main();
+ vlib_buffer_t *b0;
+ static u32 bi;
+ u8 i;
+
+ /*
+ * If queued_logging_context is non-NULL, we already have a logging
+ * packet queued to be sent. First try sending this before allocating
+ * a new context. We can have only one active packet context per
+ * nfv9_logging_info structure
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->queued_logging_context != NULL)) {
+ cnat_nfv9_send_queued_pkt(nfv9_logging_info);
+ /*
+ * If we still cannot send the queued pkt, just return;
+ * the downstream congestion count would have been incremented anyway
+ */
+ if (nfv9_logging_info->queued_logging_context != NULL) {
+ cnat_global_counters.nfv9_logging_context_creation_deferred_count++;
+ return;
+ }
+ }
+
+
+ /*
+ * If no buffer can be allocated, return silently;
+ * the calling routine will handle updating the error counters
+ */
+ if (vlib_buffer_alloc (vm, &bi, 1) != 1) {
+ vlib_cli_output(vm, "buffer allocation failure");
+ return;
+ }
+ /* Build a pkt from whole cloth */
+ b0 = vlib_get_buffer (vm, bi);
+ b0->current_data = 0;
+
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_logging_info->server_index;
+
+ nfv9_logging_info->current_logging_context = b0;
+ nfv9_logging_info->current_logging_context_timestamp =
+ cnat_nfv9_get_sys_up_time_in_ms();
+
+
+ nfv9_logging_info->nfv9_header =
+ (nfv9_header_t *) (vlib_buffer_get_current(b0) +
+ (sizeof(ipv4_header)) +
+ (sizeof(udp_hdr_type_t)));
+
+ nfv9_logging_info->nfv9_header->version =
+ clib_host_to_net_u16(CNAT_NFV9_VERSION_NUMBER);
+
+ nfv9_logging_info->nfv9_header->sys_up_time =
+ clib_host_to_net_u32(cnat_nfv9_get_sys_up_time_in_ms());
+
+ nfv9_logging_info->nfv9_header->timestamp =
+ clib_host_to_net_u32(cnat_nfv9_get_unix_time_in_seconds());
+
+
+ nfv9_logging_info->nfv9_header->source_id =
+ clib_host_to_net_u32(nfv9_src_id);
+
+ nfv9_logging_info->dataflow_header = 0;
+
+ for(i = 0; i < MAX_RECORDS;i++) {
+ nfv9_logging_info->record[i] = NULL;
+ nfv9_logging_info->record_length[i] = 0;
+ }
+ nfv9_logging_info->last_record = 0;
+
+
+ nfv9_logging_info->nfv9_template_header = 0;
+ nfv9_logging_info->next_data_ptr =
+ (u8 *) (vlib_buffer_get_current(b0) +
+ sizeof(ipv4_header) + sizeof(udp_hdr_type_t) +
+ sizeof(nfv9_header_t));
+
+ nfv9_logging_info->pkt_length = (CNAT_NFV9_TEMPLATE_OFFSET -
+ CNAT_NFV9_IP_HDR_OFFSET);
+
+
+ /*
+ * Now we have 0 records to start with
+ */
+
+ nfv9_logging_info->total_record_count = 0;
+
+ if ((template_flag == cnat_nfv9_template_add_always) ||
+ (server->pkts_since_last_template >
+ server->refresh_rate) ||
+ ((cnat_nfv9_get_unix_time_in_seconds() -
+ server->last_template_sent_time) >
+ server->timeout_rate)) {
+
+ /*
+ * Send a new template
+ */
+ nfv9_logging_info->nfv9_template_header =
+ (cnat_nfv9_template_t *) nfv9_logging_info->next_data_ptr;
+
+ memcpy(nfv9_logging_info->nfv9_template_header,
+ &cnat_nfv9_template_info,
+ sizeof(cnat_nfv9_template_info));
+
+ /*
+ * Templates are sent irrespective of particular service-type config
+ */
+ nfv9_logging_info->total_record_count = MAX_RECORDS - 1;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_TEMPLATE_LENGTH;
+
+ /*
+ * Set the data pointer beyond the template field
+ */
+ nfv9_logging_info->next_data_ptr =
+ (u8 *) (nfv9_logging_info->nfv9_template_header + 1);
+ /*
+ * Set the template_sent flag to TRUE. This will be checked in
+ * handle_vrfid_name_mapping()
+ */
+ server->template_sent = TEMPLATE_SENT_TRUE;
+ }
+}
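+
+/*
+ * Summary of the template-resend decision above: a fresh template is
+ * embedded in the new packet when any of the following hold:
+ * - the caller forces it (cnat_nfv9_template_add_always), or
+ * - more than refresh_rate packets have gone out since the last
+ *   template, or
+ * - more than timeout_rate seconds have elapsed since the last
+ *   template was sent.
+ * Resending templates on both a packet-count and a time basis is the
+ * usual NetFlow v9 (RFC 3954) practice.
+ */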
+
+inline
+void cnat_nfv9_record_create (
+ cnat_nfv9_logging_info_t *nfv9_logging_info, u16 cur_record)
+{
+ int byte_diff = 0;
+ u16 last_record = nfv9_logging_info->last_record;
+
+ if(last_record != 0 && last_record != cur_record) {
+ u16 orig_length, new_length;
+
+ orig_length = nfv9_logging_info->record_length[last_record];
+ new_length = cnat_nfv9_pad_end_of_record_length(orig_length);
+
+ /*
+ * The padding bytes are required after the last record
+ * Ensure length of last record accounts for padding bytes
+ */
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(new_length);
+
+ /*
+ * We are switching record types, so clear the previous
+ * record's pointer
+ */
+ nfv9_logging_info->record[last_record] = 0;
+
+ nfv9_logging_info->record_length[last_record] = 0;
+
+ nfv9_logging_info->last_record = 0;
+ }
+
+ nfv9_logging_info->last_record = cur_record;
+
+ /*
+ * The padding bytes are required after the last record
+ * Ensure that we skip over the padding bytes
+ */
+ nfv9_logging_info->dataflow_header = (nfv9_dataflow_record_header_t *)
+ cnat_nfv9_add_end_of_record_padding(nfv9_logging_info->next_data_ptr);
+ /*
+ * Get the difference
+ */
+ byte_diff = cnat_nfv9_pad_added_to_an_addr(
+ (u8 *)nfv9_logging_info->dataflow_header,
+ nfv9_logging_info->next_data_ptr);
+ if(byte_diff > 0) {
+ /*
+ * Update the packet length to account for the pad bytes
+ */
+ nfv9_logging_info->pkt_length += byte_diff;
+ u8 *pkt_ptr = nfv9_logging_info->next_data_ptr;
+
+ /*
+ * Blindly zero 3 bytes to avoid a for loop.
+ * We have sufficient padding bytes for safety and we won't
+ * go over buffer limits
+ */
+ *pkt_ptr++ = 0;
+ *pkt_ptr++ = 0;
+ *pkt_ptr++ = 0;
+ }
+ /*
+ * Initialize the template_id and the length of the new record
+ */
+ nfv9_logging_info->dataflow_header->dataflow_template_id =
+ clib_host_to_net_u16(cnat_template_id[cur_record]);
+
+ nfv9_logging_info->record[cur_record] =
+ (u8 *) (nfv9_logging_info->dataflow_header + 1);
+
+ nfv9_logging_info->record_length[cur_record] =
+ CNAT_NFV9_DATAFLOW_RECORD_HEADER_LENGTH;
+
+ /*
+ * Update the length of the total NFV9 record
+ */
+ nfv9_logging_info->pkt_length +=
+ CNAT_NFV9_DATAFLOW_RECORD_HEADER_LENGTH;
+
+ /*
+ * Set the data pointer beyond the dataflow header field
+ */
+ nfv9_logging_info->next_data_ptr =
+ (u8 *) (nfv9_logging_info->dataflow_header + 1);
+
+}
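+
+/*
+ * Resulting packet layout once a record set has been created (a sketch
+ * derived from the code above):
+ *
+ *   [ipv4_header][udp header][nfv9_header][template (optional)]
+ *   [dataflow_header][record][record]...[pad to word boundary]
+ *
+ * pkt_length tracks the running total, record[]/record_length[] track
+ * the per-record-type cursors, and next_data_ptr points at the first
+ * free byte.
+ */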
+
+static void cnat_nfv9_insert_add_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap)
+{
+ u16 my_proto_mask;
+ u8 my_protocol;
+ nfv9_add_record_t nfv9_logging_add_record;
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT44_ADD_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, NAT44_ADD_RECORD);
+ }
+
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+
+ nfv9_logging_add_record.inside_vrf_id =
+ clib_host_to_net_u32(vrfmap->i_vrf_id);
+
+ nfv9_logging_add_record.outside_vrf_id =
+ clib_host_to_net_u32(vrfmap->o_vrf_id);
+
+ nfv9_logging_add_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ nfv9_logging_add_record.outside_ip_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ nfv9_logging_add_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+ nfv9_logging_add_record.outside_ip_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : GRE_PROT)));
+
+ nfv9_logging_add_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[NAT44_ADD_RECORD],
+ &nfv9_logging_add_record, CNAT_NFV9_ADD_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT44_ADD_RECORD]
+ += CNAT_NFV9_ADD_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_ADD_RECORD_LENGTH;
+
+ nfv9_logging_info->record[NAT44_ADD_RECORD]
+ += CNAT_NFV9_ADD_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT44_ADD_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[NAT44_ADD_RECORD]);
+
+}
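+
+/*
+ * Note: every insert_*_record routine below follows the same pattern
+ * as the one above -- lazily create the dataflow record set, memcpy a
+ * fixed-size record, advance record[], record_length[], pkt_length and
+ * next_data_ptr, then rewrite the (16-bit) dataflow header length in
+ * network byte order.
+ */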
+
+
+static void cnat_nfv9_ds_lite_insert_add_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry)
+{
+
+ nfv9_ds_lite_add_record_t nfv9_logging_add_record = {0};
+ cnat_user_db_entry_t *udb = NULL;
+ u16 my_proto_mask;
+ u8 my_protocol;
+
+ udb = cnat_user_db + db->user_index;
+ if (PREDICT_FALSE(!udb)) {
+ return;
+ }
+ if (PREDICT_FALSE(nfv9_logging_info->record[DS_LITE_ADD_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, DS_LITE_ADD_RECORD);
+ }
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+ nfv9_logging_add_record.inside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->i_vrf_id);
+ nfv9_logging_add_record.outside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->o_vrf_id);
+
+#ifdef DSLITE_USER_IPV4
+ nfv9_logging_add_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+#else
+ /*
+ * The inside IPv4 address is sent as 0.0.0.0 for the DS-Lite case,
+ * as IPv6 is used here.
+ */
+ nfv9_logging_add_record.inside_ip_addr = 0;
+#endif
+
+ nfv9_logging_add_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(udb->ipv6[0]);
+ nfv9_logging_add_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(udb->ipv6[1]);
+ nfv9_logging_add_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(udb->ipv6[2]);
+ nfv9_logging_add_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(udb->ipv6[3]);
+
+ nfv9_logging_add_record.outside_ip_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ nfv9_logging_add_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+ nfv9_logging_add_record.outside_ip_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : 0)));
+ nfv9_logging_add_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[DS_LITE_ADD_RECORD],
+ &nfv9_logging_add_record, CNAT_NFV9_DS_LITE_ADD_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[DS_LITE_ADD_RECORD]
+ += CNAT_NFV9_DS_LITE_ADD_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_DS_LITE_ADD_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[DS_LITE_ADD_RECORD]
+ += CNAT_NFV9_DS_LITE_ADD_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[DS_LITE_ADD_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[DS_LITE_ADD_RECORD]);
+}
+
+
+static void cnat_nfv9_ds_lite_insert_del_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry)
+{
+
+ nfv9_ds_lite_del_record_t nfv9_logging_del_record = {0};
+ cnat_user_db_entry_t *udb = NULL;
+ u16 my_proto_mask;
+ u8 my_protocol;
+
+ udb = cnat_user_db + db->user_index;
+ if (PREDICT_FALSE(!udb)) {
+ return;
+ }
+ if (PREDICT_FALSE(nfv9_logging_info->record[DS_LITE_DEL_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, DS_LITE_DEL_RECORD);
+ }
+ /*
+ * We should definitely have a del record now.
+ * No need to sanitize
+ */
+ nfv9_logging_del_record.inside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->i_vrf_id);
+
+#ifdef DSLITE_USER_IPV4
+ nfv9_logging_del_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+#else
+ /*
+ * The inside IPv4 address is sent as 0.0.0.0 for the DS-Lite case,
+ * as IPv6 is used here.
+ */
+ nfv9_logging_del_record.inside_ip_addr = 0;
+#endif
+
+ nfv9_logging_del_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(udb->ipv6[0]);
+ nfv9_logging_del_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(udb->ipv6[1]);
+ nfv9_logging_del_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(udb->ipv6[2]);
+ nfv9_logging_del_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(udb->ipv6[3]);
+
+ nfv9_logging_del_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : 0)));
+ nfv9_logging_del_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[DS_LITE_DEL_RECORD],
+ &nfv9_logging_del_record, CNAT_NFV9_DS_LITE_DEL_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[DS_LITE_DEL_RECORD] +=
+ CNAT_NFV9_DS_LITE_DEL_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_DS_LITE_DEL_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[DS_LITE_DEL_RECORD]
+ += CNAT_NFV9_DS_LITE_DEL_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[DS_LITE_DEL_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[DS_LITE_DEL_RECORD]);
+}
+
+#ifndef NO_BULK_LOGGING
+static void cnat_nfv9_insert_bulk_add_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap,
+ int bulk_alloc_start_port)
+{
+ nfv9_bulk_add_record_t nfv9_logging_bulk_add_record;
+ bulk_alloc_size_t bulk_size = BULKSIZE_FROM_VRFMAP(vrfmap);
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT44_BULK_ADD_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, NAT44_BULK_ADD_RECORD);
+ }
+
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+
+ nfv9_logging_bulk_add_record.inside_vrf_id =
+ clib_host_to_net_u32(vrfmap->i_vrf_id);
+ nfv9_logging_bulk_add_record.outside_vrf_id =
+ clib_host_to_net_u32(vrfmap->o_vrf_id);
+
+ nfv9_logging_bulk_add_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ nfv9_logging_bulk_add_record.outside_ip_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ nfv9_logging_bulk_add_record.outside_ip_port_start =
+ clib_host_to_net_u16(bulk_alloc_start_port);
+ nfv9_logging_bulk_add_record.outside_ip_port_end =
+ clib_host_to_net_u16(bulk_alloc_start_port + bulk_size -1);
+
+ memcpy(nfv9_logging_info->record[NAT44_BULK_ADD_RECORD],
+ &nfv9_logging_bulk_add_record, CNAT_NFV9_BULK_ADD_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT44_BULK_ADD_RECORD]
+ += CNAT_NFV9_BULK_ADD_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_BULK_ADD_RECORD_LENGTH;
+
+ nfv9_logging_info->record[NAT44_BULK_ADD_RECORD]
+ += CNAT_NFV9_BULK_ADD_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT44_BULK_ADD_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[NAT44_BULK_ADD_RECORD]);
+
+}
+
+
+static void cnat_nfv9_ds_lite_insert_bulk_add_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry,
+ int bulk_alloc_start_port)
+{
+
+ nfv9_ds_lite_bulk_add_record_t nfv9_logging_bulk_add_record = {0};
+ cnat_user_db_entry_t *udb = NULL;
+ bulk_alloc_size_t bulk_size = BULKSIZE_FROM_VRFMAP(dslite_entry);
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[DS_LITE_BULK_ADD_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, DS_LITE_BULK_ADD_RECORD);
+ }
+ udb = cnat_user_db + db->user_index;
+ if (PREDICT_FALSE(!udb)) {
+ return;
+ }
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+
+ nfv9_logging_bulk_add_record.inside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->i_vrf_id);
+ nfv9_logging_bulk_add_record.outside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->o_vrf_id);
+
+#ifdef DSLITE_USER_IPV4
+ nfv9_logging_bulk_add_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+#else
+ /*
+ * The inside IPv4 address is sent as 0.0.0.0 for the DS-Lite case,
+ * as IPv6 is used here.
+ */
+ nfv9_logging_bulk_add_record.inside_ip_addr = 0;
+#endif
+
+ nfv9_logging_bulk_add_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(udb->ipv6[0]);
+ nfv9_logging_bulk_add_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(udb->ipv6[1]);
+ nfv9_logging_bulk_add_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(udb->ipv6[2]);
+ nfv9_logging_bulk_add_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(udb->ipv6[3]);
+
+ nfv9_logging_bulk_add_record.outside_ip_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ nfv9_logging_bulk_add_record.outside_ip_port_start =
+ clib_host_to_net_u16(bulk_alloc_start_port);
+ nfv9_logging_bulk_add_record.outside_ip_port_end =
+ clib_host_to_net_u16(bulk_alloc_start_port + bulk_size -1);
+
+ memcpy(nfv9_logging_info->record[DS_LITE_BULK_ADD_RECORD],
+ &nfv9_logging_bulk_add_record, CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[DS_LITE_BULK_ADD_RECORD]
+ += CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+ nfv9_logging_info->record[DS_LITE_BULK_ADD_RECORD]
+ += CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[DS_LITE_BULK_ADD_RECORD];
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[DS_LITE_BULK_ADD_RECORD]);
+}
+
+
+static void cnat_nfv9_ds_lite_insert_bulk_del_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry,
+ int bulk_alloc_start_port)
+{
+
+ nfv9_ds_lite_bulk_del_record_t nfv9_logging_bulk_del_record = {0};
+ cnat_user_db_entry_t *udb = NULL;
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[DS_LITE_BULK_DEL_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, DS_LITE_BULK_DEL_RECORD);
+ }
+ udb = cnat_user_db + db->user_index;
+ if (PREDICT_FALSE(!udb)) {
+ return;
+ }
+ /*
+ * We should definitely have the bulk del record now, no need to sanitize
+ */
+
+ nfv9_logging_bulk_del_record.inside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->i_vrf_id);
+
+#ifdef DSLITE_USER_IPV4
+ nfv9_logging_bulk_del_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+#else
+ nfv9_logging_bulk_del_record.inside_ip_addr =
+ clib_host_to_net_u32(0);
+#endif
+
+ nfv9_logging_bulk_del_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(udb->ipv6[0]);
+ nfv9_logging_bulk_del_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(udb->ipv6[1]);
+ nfv9_logging_bulk_del_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(udb->ipv6[2]);
+ nfv9_logging_bulk_del_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(udb->ipv6[3]);
+
+ nfv9_logging_bulk_del_record.outside_ip_port_start =
+ clib_host_to_net_u16(bulk_alloc_start_port);
+
+ memcpy(nfv9_logging_info->record[DS_LITE_BULK_DEL_RECORD],
+ &nfv9_logging_bulk_del_record,
+ CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_LENGTH);
+ nfv9_logging_info->record_length[DS_LITE_BULK_DEL_RECORD] +=
+ CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length +=
+ CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+ nfv9_logging_info->record[DS_LITE_BULK_DEL_RECORD] +=
+ CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[DS_LITE_BULK_DEL_RECORD];
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[DS_LITE_BULK_DEL_RECORD]);
+}
+#endif /* #ifndef NO_BULK_LOGGING */
+
+static void cnat_nfv9_insert_del_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap)
+{
+ u16 my_proto_mask;
+ u8 my_protocol;
+ nfv9_del_record_t nfv9_logging_del_record;
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT44_DEL_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, NAT44_DEL_RECORD);
+ }
+
+ /*
+ * We should definitely have the del record now, no need to sanitize
+ */
+
+ nfv9_logging_del_record.inside_vrf_id =
+ clib_host_to_net_u32(vrfmap->i_vrf_id);
+
+ nfv9_logging_del_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+
+ nfv9_logging_del_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : GRE_PROT)));
+
+ nfv9_logging_del_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[NAT44_DEL_RECORD],
+ &nfv9_logging_del_record, CNAT_NFV9_DEL_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT44_DEL_RECORD]
+ += CNAT_NFV9_DEL_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_DEL_RECORD_LENGTH;
+
+ nfv9_logging_info->record[NAT44_DEL_RECORD]
+ += CNAT_NFV9_DEL_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT44_DEL_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[NAT44_DEL_RECORD]);
+
+}
+
+#ifndef NO_BULK_LOGGING
+static void cnat_nfv9_insert_bulk_del_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info,
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap,
+ int bulk_alloc_start_port)
+{
+ nfv9_bulk_del_record_t nfv9_logging_bulk_del_record;
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT44_BULK_DEL_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, NAT44_BULK_DEL_RECORD);
+ }
+
+ /*
+ * We should definitely have the bulk del record now, no need to sanitize
+ */
+
+ nfv9_logging_bulk_del_record.inside_vrf_id =
+ clib_host_to_net_u32(vrfmap->i_vrf_id);
+
+ nfv9_logging_bulk_del_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+
+ nfv9_logging_bulk_del_record.outside_ip_port_start =
+ clib_host_to_net_u16(bulk_alloc_start_port);
+
+ memcpy(nfv9_logging_info->record[NAT44_BULK_DEL_RECORD],
+ &nfv9_logging_bulk_del_record, CNAT_NFV9_BULK_DEL_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT44_BULK_DEL_RECORD]
+ += CNAT_NFV9_BULK_DEL_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_BULK_DEL_RECORD_LENGTH;
+
+ nfv9_logging_info->record[NAT44_BULK_DEL_RECORD]
+ += CNAT_NFV9_BULK_DEL_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT44_BULK_DEL_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u16(
+ nfv9_logging_info->record_length[NAT44_BULK_DEL_RECORD]);
+
+}
+
+#endif /* #ifndef NO_BULK_LOGGING */
+/*
+ * edt: * * cnat_nfv9_log_mapping_create
+ *
+ * Tries to log a creation of mapping record
+ *
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being created
+ *
+ * Argument: cnat_vrfmap_t *vrfmap
+ * VRF Map for the Main DB entry being created
+ */
+void cnat_nfv9_log_mapping_create (cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ vlib_main_t * vm = vlib_get_main();
+
+ if (PREDICT_FALSE(vrfmap->nfv9_logging_index == EMPTY)) {
+
+ //vlib_cli_output(vm, "\n1. Log Mapping failed");
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ if (cnat_nfv9_logging_info_pool == NULL) {
+ vlib_cli_output(vm, "%s: info_pool pointer is NULL !!!!\n", __func__);
+ return;
+ }
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + vrfmap->nfv9_logging_index;
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If the context is still NULL, allocation failed; just return
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ //vlib_cli_output(vm, "\n2. Log Mapping failed");
+ return;
+ }
+
+ }
+
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ cnat_nfv9_insert_bulk_add_record(nfv9_logging_info, db, vrfmap,
+ bulk_alloc);
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED)
+ return; /* No logging required.. bulk port usage */
+ else /* Individual logging .. fall back to old method */
+#endif
+ cnat_nfv9_insert_add_record(nfv9_logging_info, db, vrfmap);
+
+ nfv9_logging_info->total_record_count += 1;
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
+
+/*
+ * edt: * * cnat_nfv9_log_mapping_delete
+ *
+ * Tries to log a deletion of mapping record
+ *
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being deleted
+ *
+ * Argument: cnat_vrfmap_t *vrfmap
+ * VRF Map for the Main DB entry being deleted
+ */
+void cnat_nfv9_log_mapping_delete (cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+
+ if (PREDICT_FALSE(vrfmap->nfv9_logging_index == EMPTY)) {
+ //vlib_cli_output(vm, "\n3. Log Mapping failed");
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + vrfmap->nfv9_logging_index;
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If the context is still NULL, allocation failed; just return
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ //vlib_cli_output(vm, "\n4. Log Mapping failed");
+ return;
+ }
+ }
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk del template */
+ cnat_nfv9_insert_bulk_del_record(nfv9_logging_info, db, vrfmap,
+ bulk_alloc);
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED)
+ return; /* No logging required.. bulk port usage */
+ else /* Individual logging .. fall back to old method */
+#endif
+ cnat_nfv9_insert_del_record(nfv9_logging_info, db, vrfmap);
+
+ nfv9_logging_info->total_record_count += 1;
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
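+
+/*
+ * Worked example of the length guard above (editor's sketch; the
+ * 1400- and 64-byte figures are assumptions, not values from this
+ * patch):
+ *
+ *   u16 max_pkt_length  = 1400;                      // export packet budget
+ *   u16 max_record_size = 64;                        // worst-case single record
+ *   u16 guard = max_pkt_length - max_record_size;    // 1336
+ *
+ * A record is always appended first and the guard checked afterwards,
+ * so pkt_length may legitimately sit between 1336 and 1400; the slack
+ * beyond the guard is what lets the last record land safely before
+ * the packet is flushed by cnat_nfv9_send_pkt().
+ */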
+
+
+/* NAT64 Related routines */
+
+/*
+ * edt: * * cnat_nfv9_bib_mapping_create
+ *
+ * Tries to log the creation of a BIB mapping record
+ *
+ * Argument: nat64_bib_entry_t *db
+ * BIB DB entry being created
+ *
+ * Argument: nat64_table_entry_t *nat64_entry
+ * NAT64 Instance where this BIB belongs
+ */
+void cnat_nfv9_bib_mapping_create (nat64_bib_entry_t *db,
+ nat64_table_entry_t *nat64_entry)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ nfv9_nat64_add_bib_record_t nfv9_logging_add_record;
+
+ if (PREDICT_FALSE(nat64_entry->logging_index == EMPTY)) {
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + nat64_entry->logging_index;
+
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ return;
+ }
+ }
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT64_ADD_BIB_RECORD] == NULL)){
+ cnat_nfv9_record_create(nfv9_logging_info,NAT64_ADD_BIB_RECORD);
+ }
+
+
+ nfv9_logging_add_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[0]);
+ nfv9_logging_add_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[1]);
+ nfv9_logging_add_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[2]);
+ nfv9_logging_add_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[3]);
+
+
+ nfv9_logging_add_record.outside_v4_src_addr =
+ clib_host_to_net_u32(db->v4_out_key.k.ipv4);
+
+ nfv9_logging_add_record.inside_src_port =
+ clib_host_to_net_u16(db->v6_in_key.port);
+ nfv9_logging_add_record.outside_src_port =
+ clib_host_to_net_u16(db->v4_out_key.k.port);
+
+ my_proto_mask = db->v6_in_key.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? IPV6_PROTO_ICMPV6 : 0)));
+ nfv9_logging_add_record.protocol = my_protocol;
+
+
+ memcpy(nfv9_logging_info->record[NAT64_ADD_BIB_RECORD],
+ &nfv9_logging_add_record, CNAT_NFV9_NAT64_ADD_BIB_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT64_ADD_BIB_RECORD] +=
+ CNAT_NFV9_NAT64_ADD_BIB_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length += CNAT_NFV9_NAT64_ADD_BIB_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[NAT64_ADD_BIB_RECORD]
+ += CNAT_NFV9_NAT64_ADD_BIB_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT64_ADD_BIB_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[NAT64_ADD_BIB_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
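+
+/*
+ * The BIB/session add and delete routines in this file all repeat one
+ * append pattern. A hypothetical helper (editor's sketch only; no such
+ * function exists in this patch) makes the bookkeeping explicit:
+ *
+ *   static inline void
+ *   nfv9_append_record (cnat_nfv9_logging_info_t *info,
+ *                       int type, void *rec, u32 len)
+ *   {
+ *       memcpy (info->record[type], rec, len);   // copy record body
+ *       info->record_length[type] += len;        // per-flowset byte count
+ *       info->pkt_length += len;                 // whole-packet byte count
+ *       info->total_record_count += 1;
+ *       info->record[type] += len;               // advance write cursor
+ *       info->next_data_ptr = info->record[type];
+ *       info->dataflow_header->dataflow_length =
+ *           clib_host_to_net_u32 (info->record_length[type]);
+ *   }
+ */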
+
+
+/*
+ * edt: * * cnat_nfv9_session_mapping_create
+ *
+ * Tries to log the creation of a session mapping record
+ *
+ * Argument: nat64_bib_entry_t *bdb
+ * BIB DB entry for the session being created
+ *
+ * Argument: nat64_session_entry_t *sdb
+ * Session DB entry being created
+ *
+ * Argument: nat64_table_entry_t *nat64_entry
+ * NAT64 instance where this BIB and Session belong
+ */
+void cnat_nfv9_session_mapping_create (nat64_bib_entry_t *bdb,
+ nat64_session_entry_t *sdb,
+ nat64_table_entry_t *nat64_entry_ptr)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ u32 dest_v6[4];
+ nfv9_nat64_add_session_record_t nfv9_logging_add_record;
+ u8 *ipv6_addr_ptr;
+ u8 *ipv4_addr_ptr;
+
+
+ if (PREDICT_FALSE(nat64_entry_ptr->logging_index == EMPTY)) {
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + nat64_entry_ptr->logging_index;
+
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)){
+ return;
+ }
+ }
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT64_ADD_SESSION_RECORD]
+ == NULL)){
+ cnat_nfv9_record_create(nfv9_logging_info, NAT64_ADD_SESSION_RECORD);
+ }
+
+
+ nfv9_logging_add_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[0]);
+ nfv9_logging_add_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[1]);
+ nfv9_logging_add_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[2]);
+ nfv9_logging_add_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[3]);
+
+
+ nfv9_logging_add_record.outside_v4_src_addr =
+ clib_host_to_net_u32(bdb->v4_out_key.k.ipv4);
+
+
+ nfv9_logging_add_record.outside_v4_dest_addr =
+ clib_host_to_net_u32(sdb->v4_dest_key.k.ipv4);
+
+ /* Need to create the V6 address using prefix */
+ dest_v6[0] = nat64_entry_ptr->v6_prefix[0];
+ dest_v6[1] = nat64_entry_ptr->v6_prefix[1];
+ dest_v6[2] = nat64_entry_ptr->v6_prefix[2];
+ dest_v6[3] = nat64_entry_ptr->v6_prefix[3];
+
+ ipv6_addr_ptr = (u8 *) (&(dest_v6[0]));
+ ipv4_addr_ptr = (u8 *) (&(sdb->v4_dest_key.k.ipv4));
+
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet0_position) = *(ipv4_addr_ptr);
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet1_position) = *(ipv4_addr_ptr + 1);
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet2_position) = *(ipv4_addr_ptr + 2);
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet3_position) = *(ipv4_addr_ptr + 3);
+
+ nfv9_logging_add_record.inside_v6_dest_addr[0] =
+ clib_host_to_net_u32(dest_v6[0]);
+ nfv9_logging_add_record.inside_v6_dest_addr[1] =
+ clib_host_to_net_u32(dest_v6[1]);
+ nfv9_logging_add_record.inside_v6_dest_addr[2] =
+ clib_host_to_net_u32(dest_v6[2]);
+ nfv9_logging_add_record.inside_v6_dest_addr[3] =
+ clib_host_to_net_u32(dest_v6[3]);
+
+ nfv9_logging_add_record.inside_src_port =
+ clib_host_to_net_u16(bdb->v6_in_key.port);
+ nfv9_logging_add_record.outside_src_port =
+ clib_host_to_net_u16(bdb->v4_out_key.k.port);
+
+ nfv9_logging_add_record.dest_port =
+ clib_host_to_net_u16(sdb->v4_dest_key.k.port);
+
+
+ my_proto_mask = bdb->v6_in_key.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? IPV6_PROTO_ICMPV6 : 0)));
+ nfv9_logging_add_record.protocol = my_protocol;
+
+
+ memcpy(nfv9_logging_info->record[NAT64_ADD_SESSION_RECORD],
+ &nfv9_logging_add_record, CNAT_NFV9_NAT64_ADD_SESSION_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT64_ADD_SESSION_RECORD] +=
+ CNAT_NFV9_NAT64_ADD_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length += CNAT_NFV9_NAT64_ADD_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[NAT64_ADD_SESSION_RECORD]
+ += CNAT_NFV9_NAT64_ADD_SESSION_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT64_ADD_SESSION_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[NAT64_ADD_SESSION_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
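+
+/*
+ * Example of the octetN_position embedding above (editor's sketch;
+ * the prefix and positions are assumptions, not values from this
+ * patch): with the well-known /96 NAT64 prefix 64:ff9b::/96,
+ * octet0..3_position would be 12..15, so embedding the v4
+ * destination 192.0.2.1 produces
+ *
+ *   dest_v6 == 64:ff9b::c000:0201   // 192.0.2.1 in the low 32 bits
+ *
+ * Writing the four octets individually lets the same code serve
+ * RFC 6052 prefixes (/40, /48, /56) whose embedded v4 bytes are not
+ * contiguous.
+ */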
+
+
+/*
+ * edt: * * cnat_nfv9_bib_mapping_delete
+ *
+ * Tries to log the deletion of a BIB mapping record
+ *
+ * Argument: nat64_bib_entry_t *db
+ * BIB DB entry being deleted
+ *
+ * Argument: nat64_table_entry_t *nat64_entry
+ * NAT64 Instance where this BIB belongs
+ */
+void cnat_nfv9_bib_mapping_delete (nat64_bib_entry_t *db,
+ nat64_table_entry_t *nat64_entry)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ nfv9_nat64_del_bib_record_t nfv9_logging_del_record;
+ if (PREDICT_FALSE(nat64_entry->logging_index == EMPTY)) {
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + nat64_entry->logging_index;
+
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)){
+ return;
+ }
+ }
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT64_DEL_BIB_RECORD] == NULL)){
+ cnat_nfv9_record_create(nfv9_logging_info,NAT64_DEL_BIB_RECORD);
+ }
+
+
+ nfv9_logging_del_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[0]);
+ nfv9_logging_del_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[1]);
+ nfv9_logging_del_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[2]);
+ nfv9_logging_del_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(db->v6_in_key.ipv6[3]);
+
+
+ nfv9_logging_del_record.inside_src_port =
+ clib_host_to_net_u16(db->v6_in_key.port);
+
+ my_proto_mask = db->v6_in_key.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? IPV6_PROTO_ICMPV6 : 0)));
+ nfv9_logging_del_record.protocol = my_protocol;
+
+
+ memcpy(nfv9_logging_info->record[NAT64_DEL_BIB_RECORD],
+ &nfv9_logging_del_record, CNAT_NFV9_NAT64_DEL_BIB_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT64_DEL_BIB_RECORD] +=
+ CNAT_NFV9_NAT64_DEL_BIB_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length += CNAT_NFV9_NAT64_DEL_BIB_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[NAT64_DEL_BIB_RECORD]
+ += CNAT_NFV9_NAT64_DEL_BIB_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT64_DEL_BIB_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[NAT64_DEL_BIB_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
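+
+/*
+ * The my_proto_mask -> my_protocol ternaries in these routines map
+ * the CNAT protocol bits carried in the vrf field onto IANA protocol
+ * numbers. In table form (editor's sketch; the macro values are
+ * assumed to match their IANA meanings):
+ *
+ *   CNAT_UDP  -> UDP_PROT           (17)
+ *   CNAT_TCP  -> TCP_PROT           (6)
+ *   CNAT_ICMP -> IPV6_PROTO_ICMPV6  (58)  on the NAT64 paths
+ *             -> ICMP_PROT          (1)   on the NAT44/DS-Lite paths
+ *   otherwise -> 0 (or GRE_PROT, 47, on the NAT44 session paths)
+ */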
+
+
+/*
+ * edt: * * cnat_nfv9_session_mapping_delete
+ *
+ * Tries to log the deletion of a session mapping record
+ *
+ * Argument: nat64_bib_entry_t *bdb
+ * BIB DB entry for the session being deleted
+ *
+ * Argument: nat64_session_entry_t *sdb
+ * Session DB entry being deleted
+ *
+ * Argument: nat64_table_entry_t *nat64_entry
+ * NAT64 instance where this BIB and Session belong
+ */
+void cnat_nfv9_session_mapping_delete (nat64_bib_entry_t *bdb,
+ nat64_session_entry_t *sdb,
+ nat64_table_entry_t *nat64_entry_ptr)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ u32 dest_v6[4];
+ nfv9_nat64_del_session_record_t nfv9_logging_del_record;
+ u8 *ipv6_addr_ptr;
+ u8 *ipv4_addr_ptr;
+
+ if (PREDICT_FALSE(nat64_entry_ptr->logging_index == EMPTY)) {
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + nat64_entry_ptr->logging_index;
+
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)){
+ return;
+ }
+ }
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[NAT64_DEL_SESSION_RECORD]
+ == NULL)){
+ cnat_nfv9_record_create(nfv9_logging_info, NAT64_DEL_SESSION_RECORD);
+ }
+
+
+ nfv9_logging_del_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[0]);
+ nfv9_logging_del_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[1]);
+ nfv9_logging_del_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[2]);
+ nfv9_logging_del_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(bdb->v6_in_key.ipv6[3]);
+
+ /* Need to create the V6 address using prefix */
+ dest_v6[0] = nat64_entry_ptr->v6_prefix[0];
+ dest_v6[1] = nat64_entry_ptr->v6_prefix[1];
+ dest_v6[2] = nat64_entry_ptr->v6_prefix[2];
+ dest_v6[3] = nat64_entry_ptr->v6_prefix[3];
+
+ ipv6_addr_ptr = (u8 *) (&(dest_v6[0]));
+ ipv4_addr_ptr = (u8 *) (&(sdb->v4_dest_key.k.ipv4));
+
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet0_position) = *(ipv4_addr_ptr);
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet1_position) = *(ipv4_addr_ptr + 1);
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet2_position) = *(ipv4_addr_ptr + 2);
+ *(ipv6_addr_ptr + nat64_entry_ptr->octet3_position) = *(ipv4_addr_ptr + 3);
+
+ nfv9_logging_del_record.inside_v6_dest_addr[0] =
+ clib_host_to_net_u32(dest_v6[0]);
+ nfv9_logging_del_record.inside_v6_dest_addr[1] =
+ clib_host_to_net_u32(dest_v6[1]);
+ nfv9_logging_del_record.inside_v6_dest_addr[2] =
+ clib_host_to_net_u32(dest_v6[2]);
+ nfv9_logging_del_record.inside_v6_dest_addr[3] =
+ clib_host_to_net_u32(dest_v6[3]);
+
+ nfv9_logging_del_record.inside_src_port =
+ clib_host_to_net_u16(bdb->v6_in_key.port);
+
+ nfv9_logging_del_record.dest_port =
+ clib_host_to_net_u16(sdb->v4_dest_key.k.port);
+
+
+ my_proto_mask = bdb->v6_in_key.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? IPV6_PROTO_ICMPV6 : 0)));
+ nfv9_logging_del_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[NAT64_DEL_SESSION_RECORD],
+ &nfv9_logging_del_record, CNAT_NFV9_NAT64_DEL_SESSION_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT64_DEL_SESSION_RECORD] +=
+ CNAT_NFV9_NAT64_DEL_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length += CNAT_NFV9_NAT64_DEL_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[NAT64_DEL_SESSION_RECORD]
+ += CNAT_NFV9_NAT64_DEL_SESSION_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT64_DEL_SESSION_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[NAT64_DEL_SESSION_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
+
+/*
+ * edt: * * cnat_nfv9_nat44_log_session_create
+ *
+ * Tries to log the creation of a mapping record (session-based)
+ *
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being created
+ * Argument: cnat_session_entry_t *sdb
+ * Session DB entry if the destination is not the first dest
+ * Argument: cnat_vrfmap_t *vrfmap
+ * VRF Map for the Main DB entry being created
+ */
+
+void cnat_nfv9_nat44_log_session_create(cnat_main_db_entry_t *db,
+ cnat_session_entry_t *sdb,
+ cnat_vrfmap_t *vrfmap)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ nfv9_add_session_record_t nfv9_logging_add_session_record;
+
+ if (PREDICT_FALSE(vrfmap->nfv9_logging_index == EMPTY)) {
+ //vlib_cli_output(vm,"\n1. Log Mapping failed");
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + vrfmap->nfv9_logging_index;
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ //vlib_cli_output(vm,"\n2. Log Mapping failed");
+ return;
+ }
+ }
+
+ if(PREDICT_FALSE(nfv9_logging_info->record[
+ NAT44_ADD_SESSION_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, NAT44_ADD_SESSION_RECORD);
+ }
+
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+ nfv9_logging_add_session_record.inside_vrf_id =
+ clib_host_to_net_u32(vrfmap->i_vrf_id);
+ nfv9_logging_add_session_record.outside_vrf_id =
+ clib_host_to_net_u32(vrfmap->o_vrf_id);
+
+ nfv9_logging_add_session_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+ nfv9_logging_add_session_record.outside_ip_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ /* If sdb is null, it is assumed that logging is being done
+ * for the first destination, which is held in the main db
+ * itself
+ */
+ if(PREDICT_TRUE(sdb == NULL)) {
+ nfv9_logging_add_session_record.dest_ip_addr =
+ clib_host_to_net_u32(db->dst_ipv4);
+ nfv9_logging_add_session_record.dest_port =
+ clib_host_to_net_u16(db->dst_port);
+ } else {
+ nfv9_logging_add_session_record.dest_ip_addr =
+ clib_host_to_net_u32(sdb->v4_dest_key.k.ipv4);
+ nfv9_logging_add_session_record.dest_port =
+ clib_host_to_net_u16(sdb->v4_dest_key.k.port);
+ }
+
+ nfv9_logging_add_session_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+ nfv9_logging_add_session_record.outside_ip_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : GRE_PROT)));
+ nfv9_logging_add_session_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[NAT44_ADD_SESSION_RECORD],
+ &nfv9_logging_add_session_record,
+ CNAT_NFV9_NAT44_ADD_SESSION_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT44_ADD_SESSION_RECORD]
+ += CNAT_NFV9_NAT44_ADD_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length += CNAT_NFV9_NAT44_ADD_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+
+ nfv9_logging_info->record[NAT44_ADD_SESSION_RECORD]
+ += CNAT_NFV9_NAT44_ADD_SESSION_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT44_ADD_SESSION_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[NAT44_ADD_SESSION_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
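+
+/*
+ * Destination selection above, restated (editor's sketch): the first
+ * destination of a mapping is stored inline in the main DB entry;
+ * every later destination gets its own session DB entry, so
+ *
+ *   dest_ip   = sdb ? sdb->v4_dest_key.k.ipv4 : db->dst_ipv4;
+ *   dest_port = sdb ? sdb->v4_dest_key.k.port : db->dst_port;
+ */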
+
+/*
+ * edt: * * cnat_nfv9_nat44_log_session_delete
+ *
+ * Tries to log the deletion of a mapping record (session-based)
+ *
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being deleted
+ * Argument: cnat_session_entry_t *sdb
+ * Session DB entry if the destination is not the first dest
+ * Argument: cnat_vrfmap_t *vrfmap
+ * VRF Map for the Main DB entry being deleted
+ */
+
+void cnat_nfv9_nat44_log_session_delete(cnat_main_db_entry_t *db,
+ cnat_session_entry_t *sdb,
+ cnat_vrfmap_t *vrfmap)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ nfv9_del_session_record_t nfv9_logging_del_session_record;
+
+ if (PREDICT_FALSE(vrfmap->nfv9_logging_index == EMPTY)) {
+ //vlib_cli_output(vm, "\n1. Log Mapping failed");
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + vrfmap->nfv9_logging_index;
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ //vlib_cli_output(vm, "\n2. Log Mapping failed");
+ return;
+ }
+ }
+
+ if(PREDICT_FALSE(nfv9_logging_info->record[
+ NAT44_DEL_SESSION_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, NAT44_DEL_SESSION_RECORD);
+ }
+
+ /*
+ * We should definitely have the del record now, no need to sanitize
+ */
+ nfv9_logging_del_session_record.inside_vrf_id =
+ clib_host_to_net_u32(vrfmap->i_vrf_id);
+
+ nfv9_logging_del_session_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+
+ /* If sdb is null, it is assumed that logging is being done
+ * for the first destination which is held in the main db
+ * itself
+ */
+ if(PREDICT_TRUE(sdb == NULL)) {
+ nfv9_logging_del_session_record.dest_ip_addr =
+ clib_host_to_net_u32(db->dst_ipv4);
+ nfv9_logging_del_session_record.dest_port =
+ clib_host_to_net_u16(db->dst_port);
+ } else {
+ nfv9_logging_del_session_record.dest_ip_addr =
+ clib_host_to_net_u32(sdb->v4_dest_key.k.ipv4);
+ nfv9_logging_del_session_record.dest_port =
+ clib_host_to_net_u16(sdb->v4_dest_key.k.port);
+ }
+
+ nfv9_logging_del_session_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : GRE_PROT)));
+
+ nfv9_logging_del_session_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[NAT44_DEL_SESSION_RECORD],
+ &nfv9_logging_del_session_record,
+ CNAT_NFV9_NAT44_DEL_SESSION_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[NAT44_DEL_SESSION_RECORD]
+ += CNAT_NFV9_NAT44_DEL_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->pkt_length += CNAT_NFV9_NAT44_DEL_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[NAT44_DEL_SESSION_RECORD]
+ += CNAT_NFV9_NAT44_DEL_SESSION_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[NAT44_DEL_SESSION_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[NAT44_DEL_SESSION_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
+
+/*
+ * DS-Lite APIs for netflow logging
+ */
+
+/*
+ * edt: * * cnat_nfv9_ds_lite_mapping_create
+ *
+ * Tries to log the creation of a mapping record
+ *
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being created
+ *
+ * Argument: dslite_table_entry_t *dslite_entry
+ * ds-lite instance for the Main DB entry being created
+ */
+void cnat_nfv9_ds_lite_mapping_create(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+
+ cnat_nfv9_logging_info_t *nfv9_logging_info = NULL;
+
+ if (PREDICT_FALSE(!(db && dslite_entry))) {
+ return;
+ }
+ if (PREDICT_FALSE(dslite_entry->nfv9_logging_index == EMPTY)) {
+ /*
+ * no logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + dslite_entry->nfv9_logging_index;
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ return;
+ }
+ }
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ cnat_nfv9_ds_lite_insert_bulk_add_record(nfv9_logging_info,
+ db, dslite_entry, bulk_alloc);
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED)
+ return; /* No logging required.. bulk port usage */
+ else /* Individual logging .. fall back to old method */
+#endif /*NO_BULK_LOGGING*/
+ cnat_nfv9_ds_lite_insert_add_record(nfv9_logging_info, db, dslite_entry);
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
+
+/*
+ * edt: * * cnat_nfv9_ds_lite_mapping_delete
+ *
+ * Tries to log the deletion of a mapping record
+ *
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being deleted
+ *
+ * Argument: dslite_table_entry_t *dslite_entry
+ * ds-lite instance for the Main DB entry being deleted
+ */
+void cnat_nfv9_ds_lite_mapping_delete(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+
+ cnat_nfv9_logging_info_t *nfv9_logging_info = NULL;
+ if (PREDICT_FALSE(!(db && dslite_entry))) {
+ return;
+ }
+ if (PREDICT_FALSE(dslite_entry->nfv9_logging_index == EMPTY)) {
+ /*
+ * No logging configured, silently return
+ */
+ return;
+ }
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + dslite_entry->nfv9_logging_index;
+
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ return;
+ }
+ }
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ cnat_nfv9_ds_lite_insert_bulk_del_record(nfv9_logging_info,
+ db, dslite_entry, bulk_alloc);
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED)
+ return; /* No logging required.. bulk port usage */
+ else /* Individual logging .. fall back to old method */
+#endif /*NO_BULK_LOGGING*/
+ cnat_nfv9_ds_lite_insert_del_record(nfv9_logging_info, db, dslite_entry);
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+}
+
+/*
+ * edt: * * cnat_nfv9_ds_lite_log_session_create
+ *
+ * Tries to log the creation of a mapping record (session-based)
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being created
+ * Argument: dslite_table_entry_t *dslite_entry
+ * dslite table entry for the dslite instance
+ * Argument: cnat_session_entry_t *sdb
+ * Session DB entry if the destination is not the first dest
+ */
+
+void cnat_nfv9_ds_lite_log_session_create(
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t *sdb)
+{
+
+ nfv9_ds_lite_add_session_record_t nfv9_logging_add_record;
+ cnat_user_db_entry_t *udb = NULL;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ cnat_nfv9_logging_info_t *nfv9_logging_info = 0;
+
+ if (PREDICT_FALSE(dslite_entry->nfv9_logging_index == EMPTY)) {
+ /*
+ * no logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + dslite_entry->nfv9_logging_index;
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ return;
+ }
+ }
+
+ udb = cnat_user_db + db->user_index;
+ if (PREDICT_FALSE(!udb)) {
+ return;
+ }
+ if (PREDICT_FALSE(nfv9_logging_info->record[DS_LITE_ADD_SESSION_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, DS_LITE_ADD_SESSION_RECORD);
+ }
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+ nfv9_logging_add_record.inside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->i_vrf_id);
+ nfv9_logging_add_record.outside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->o_vrf_id);
+
+ nfv9_logging_add_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+
+ nfv9_logging_add_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(udb->ipv6[0]);
+ nfv9_logging_add_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(udb->ipv6[1]);
+ nfv9_logging_add_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(udb->ipv6[2]);
+ nfv9_logging_add_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(udb->ipv6[3]);
+
+ nfv9_logging_add_record.outside_ip_addr =
+ clib_host_to_net_u32(db->out2in_key.k.ipv4);
+
+ nfv9_logging_add_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+ nfv9_logging_add_record.outside_ip_port =
+ clib_host_to_net_u16(db->out2in_key.k.port);
+
+ /* If sdb is null, it is assumed that logging is being done
+ * for the first destination, which is held in the main db
+ * itself
+ */
+ if(PREDICT_TRUE(sdb == NULL)) {
+ nfv9_logging_add_record.dest_ip_addr =
+ clib_host_to_net_u32(db->dst_ipv4);
+ nfv9_logging_add_record.dest_port =
+ clib_host_to_net_u16(db->dst_port);
+ } else {
+ nfv9_logging_add_record.dest_ip_addr =
+ clib_host_to_net_u32(sdb->v4_dest_key.k.ipv4);
+ nfv9_logging_add_record.dest_port =
+ clib_host_to_net_u16(sdb->v4_dest_key.k.port);
+ }
+
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : 0)));
+ nfv9_logging_add_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[DS_LITE_ADD_SESSION_RECORD],
+ &nfv9_logging_add_record, CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[DS_LITE_ADD_SESSION_RECORD]
+ += CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[DS_LITE_ADD_SESSION_RECORD]
+ += CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[DS_LITE_ADD_SESSION_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[DS_LITE_ADD_SESSION_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+
+}
+
+/*
+ * edt: * * cnat_nfv9_ds_lite_log_session_delete
+ *
+ * Tries to log the deletion of a mapping record (session-based)
+ * Argument: cnat_main_db_entry_t *db
+ * Main DB entry being deleted
+ * Argument: dslite_table_entry_t *dslite_entry
+ * dslite table entry for the dslite instance
+ * Argument: cnat_session_entry_t *sdb
+ * Session DB entry if the destination is not the first dest
+ */
+
+void cnat_nfv9_ds_lite_log_session_delete(
+ cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t *sdb)
+{
+
+ nfv9_ds_lite_del_session_record_t nfv9_logging_add_record = {0};
+ cnat_user_db_entry_t *udb = NULL;
+ u16 my_proto_mask;
+ u8 my_protocol;
+ cnat_nfv9_logging_info_t *nfv9_logging_info = NULL;
+
+ if (PREDICT_FALSE(dslite_entry->nfv9_logging_index == EMPTY)) {
+ /*
+ * no logging configured, silently return
+ */
+ return;
+ }
+
+ nfv9_logging_info =
+ cnat_nfv9_logging_info_pool + dslite_entry->nfv9_logging_index;
+ udb = cnat_user_db + db->user_index;
+
+ if (PREDICT_FALSE(!udb)) {
+ return;
+ }
+
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+
+ /*
+ * If still empty, return after increasing the count
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->current_logging_context == NULL)) {
+ return;
+ }
+ }
+
+ if (PREDICT_FALSE(nfv9_logging_info->record[DS_LITE_DEL_SESSION_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, DS_LITE_DEL_SESSION_RECORD);
+ }
+ /*
+ * We should definitely have the del record now, no need to sanitize
+ */
+ nfv9_logging_add_record.inside_vrf_id =
+ clib_host_to_net_u32(dslite_entry->i_vrf_id);
+
+ nfv9_logging_add_record.inside_ip_addr =
+ clib_host_to_net_u32(db->in2out_key.k.ipv4);
+
+ nfv9_logging_add_record.inside_v6_src_addr[0] =
+ clib_host_to_net_u32(udb->ipv6[0]);
+ nfv9_logging_add_record.inside_v6_src_addr[1] =
+ clib_host_to_net_u32(udb->ipv6[1]);
+ nfv9_logging_add_record.inside_v6_src_addr[2] =
+ clib_host_to_net_u32(udb->ipv6[2]);
+ nfv9_logging_add_record.inside_v6_src_addr[3] =
+ clib_host_to_net_u32(udb->ipv6[3]);
+
+ nfv9_logging_add_record.inside_ip_port =
+ clib_host_to_net_u16(db->in2out_key.k.port);
+
+ /* If sdb is null, it is assumed that logging is being done
+ * for the first destination which is held in the main db
+ * itself
+ */
+ if(PREDICT_TRUE(sdb == NULL)) {
+ nfv9_logging_add_record.dest_ip_addr =
+ clib_host_to_net_u32(db->dst_ipv4);
+ nfv9_logging_add_record.dest_port =
+ clib_host_to_net_u16(db->dst_port);
+ } else {
+ nfv9_logging_add_record.dest_ip_addr =
+ clib_host_to_net_u32(sdb->v4_dest_key.k.ipv4);
+ nfv9_logging_add_record.dest_port =
+ clib_host_to_net_u16(sdb->v4_dest_key.k.port);
+ }
+
+
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+
+ my_protocol = ((my_proto_mask == CNAT_UDP) ? UDP_PROT :
+ ((my_proto_mask == CNAT_TCP) ? TCP_PROT :
+ ((my_proto_mask == CNAT_ICMP) ? ICMP_PROT : 0)));
+ nfv9_logging_add_record.protocol = my_protocol;
+
+ memcpy(nfv9_logging_info->record[DS_LITE_DEL_SESSION_RECORD],
+ &nfv9_logging_add_record, CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[DS_LITE_DEL_SESSION_RECORD]
+ += CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length += CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[DS_LITE_DEL_SESSION_RECORD]
+ += CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_LENGTH;
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[DS_LITE_DEL_SESSION_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[DS_LITE_DEL_SESSION_RECORD]);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+
+}
+
+
+/*
+ * netflow logging API for ingress vrf_id to name mapping
+ */
+
+/*
+ * edt: * * handle_vrfid_name_mapping
+ * Searches the netflow pool for valid netflow entries and,
+ * for each one found, sends all vrfid-name mapping info
+ * using that entry
+ */
+
+
+inline
+void handle_vrfid_name_mapping(void)
+{
+ cnat_nfv9_logging_info_t *nfv9_logging_info = NULL;
+
+ pool_foreach (nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ if(PREDICT_FALSE(nfv9_logging_info == NULL)) {
+ continue;
+ }
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_logging_info->server_index;
+ if(server->template_sent == TEMPLATE_SENT_TRUE) {
+ cnat_nfv9_ingress_vrfid_name_mapping_create(nfv9_logging_info);
+ server->template_sent = TEMPLATE_SENT_FALSE;
+ }
+ }));
+}
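+
+/*
+ * Flow of the walk above (editor's sketch, assuming the sender marks
+ * server->template_sent = TEMPLATE_SENT_TRUE whenever a template
+ * packet goes out):
+ *
+ *   for each nfv9_logging_info in cnat_nfv9_logging_info_pool:
+ *       if server->template_sent == TEMPLATE_SENT_TRUE:
+ *           emit all active vrfid-name records via that exporter
+ *           server->template_sent = TEMPLATE_SENT_FALSE  // once per refresh
+ */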
+
+/*
+ * edt: * * cnat_nfv9_ingress_vrfid_name_mapping_create
+ *
+ * Tries to log vrfid-name mapping record
+ * Argument: netflow pointer
+ */
+
+
+void cnat_nfv9_ingress_vrfid_name_mapping_create(
+ cnat_nfv9_logging_info_t *nfv9_logging_info)
+{
+ u16 index = 0;
+
+ for (index = 0; index < MAX_VRFID; index++) {
+ if(vrfid_name_map[index].ref_count == 0) {
+ continue;
+ }
+ if (PREDICT_FALSE(
+ nfv9_logging_info->current_logging_context == NULL)) {
+ cnat_nfv9_create_logging_context(nfv9_logging_info,
+ cnat_nfv9_template_add_default);
+ }
+ cnat_nfv9_insert_ingress_vrfid_name_record(
+ nfv9_logging_info,index);
+ if (PREDICT_FALSE(nfv9_logging_info->pkt_length >
+ nfv9_logging_info->max_length_minus_max_record_size) ||
+ PREDICT_FALSE(index == MAX_VRFID - 1)) {
+ if (PREDICT_TRUE(nfv9_logging_info->current_logging_context
+ != NULL)) {
+ cnat_nfv9_send_pkt(nfv9_logging_info);
+ }
+ }
+ }/*for()*/
+ return;
+}
+
+static void cnat_nfv9_insert_ingress_vrfid_name_record(
+ cnat_nfv9_logging_info_t *nfv9_logging_info, u16 index)
+{
+ nfv9_ingress_vrfid_name_record_t nfv9_ingress_vrfid_name_record = {0};
+
+ if (PREDICT_FALSE(
+ nfv9_logging_info->record[INGRESS_VRF_ID_NAME_RECORD] == NULL)) {
+ cnat_nfv9_record_create(nfv9_logging_info, INGRESS_VRF_ID_NAME_RECORD);
+ }
+ nfv9_ingress_vrfid_name_record.ingress_vrf_id =
+ clib_host_to_net_u32(vrfid_name_map[index].vrf_id);
+
+ memcpy(nfv9_ingress_vrfid_name_record.ingress_vrf_name,
+ vrfid_name_map[index].vrf_name, NFV9_VRF_NAME_LEN);
+
+ memcpy(nfv9_logging_info->record[INGRESS_VRF_ID_NAME_RECORD],
+ &nfv9_ingress_vrfid_name_record,
+ CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_LENGTH);
+
+ nfv9_logging_info->record_length[INGRESS_VRF_ID_NAME_RECORD]
+ += CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_LENGTH;
+
+ nfv9_logging_info->pkt_length +=
+ CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_LENGTH;
+
+ nfv9_logging_info->total_record_count += 1;
+
+ nfv9_logging_info->record[INGRESS_VRF_ID_NAME_RECORD]
+ += CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_LENGTH;
+
+ nfv9_logging_info->next_data_ptr =
+ nfv9_logging_info->record[INGRESS_VRF_ID_NAME_RECORD];
+
+ nfv9_logging_info->dataflow_header->dataflow_length =
+ clib_host_to_net_u32(
+ nfv9_logging_info->record_length[INGRESS_VRF_ID_NAME_RECORD]);
+ return;
+}
+/*
+ * edt: * * handle_pending_nfv9_pkts
+ *
+ * Timer handler for sending any pending NFV9 records
+ */
+void handle_pending_nfv9_pkts (void)
+{
+ vlib_node_t *output_node;
+ vlib_main_t * vm = vlib_get_main();
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info = 0;
+ u32 current_timestamp = cnat_nfv9_get_sys_up_time_in_ms();
+ u32 current_unix_time_in_seconds = cnat_nfv9_get_unix_time_in_seconds();
+
+ output_node = vlib_get_node_by_name (vm, (u8 *) "ip4-lookup");
+
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ my_nfv9_logging_info->server_index;
+ if (my_nfv9_logging_info->queued_logging_context ||
+ (my_nfv9_logging_info->current_logging_context &&
+ (current_timestamp -
+ my_nfv9_logging_info->current_logging_context_timestamp)
+ > 1000)) {
+ /*
+ * If there is a current logging context and timestamp
+ * indicates it is pending for long, send it out
+ * Also if there is a queued context send it out as well
+ */
+ vlib_cli_output(vm, "\nNFV9_TIMER: queued %p, curr %p",
+ my_nfv9_logging_info->queued_logging_context,
+ my_nfv9_logging_info->current_logging_context);
+
+
+ cnat_nfv9_send_pkt_always_success(my_nfv9_logging_info,
+ output_node);
+ } else {
+ /*
+ * If the last_template_sent_time is too far back in time
+ * send the template even if there is no NFv9 records to send
+ */
+ if ((my_nfv9_logging_info->queued_logging_context == NULL) &&
+ (my_nfv9_logging_info->current_logging_context == NULL) &&
+ ((current_unix_time_in_seconds -
+ server->last_template_sent_time) >
+ server->timeout_rate)) {
+ cnat_nfv9_create_logging_context(my_nfv9_logging_info,
+ cnat_nfv9_template_add_always);
+ if (PREDICT_TRUE(my_nfv9_logging_info->current_logging_context
+ != NULL)) {
+ cnat_nfv9_send_pkt(my_nfv9_logging_info);
+ }
+ }
+ }
+ }));
+}
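+
+/*
+ * Decision table for the timer walk above (editor's restatement,
+ * thresholds as coded):
+ *
+ *   queued logging context pending ................. send immediately
+ *   current context older than 1000 ms ............. send immediately
+ *   nothing pending, last template older than
+ *       server->timeout_rate seconds ............... send template-only packet
+ *   otherwise ...................................... wait for the next tick
+ */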
+
+/*
+ * Code to initialize the NFV9 template. This is done when NFV9 is enabled.
+ * It is done only once; the template is later used when sending NFV9 template records.
+ */
+static void
+cnat_nfv9_template_init (void)
+{
+ cnat_nfv9_template_info.flowset_id =
+ clib_host_to_net_u16(CNAT_NFV9_TEMPLATE_FLOWSET_ID);
+ cnat_nfv9_template_info.length =
+ clib_host_to_net_u16(CNAT_NFV9_TEMPLATE_LENGTH -
+ CNAT_NFV9_OPTION_TEMPLATE_LENGTH);
+ /*
+ * Create the add Template
+ */
+ cnat_nfv9_template_info.add_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_ADD_TEMPLATE_ID);
+ cnat_nfv9_template_info.add_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_ADD_FIELD_COUNT);
+
+ cnat_nfv9_template_info.add_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.add_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_outside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.add_outside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_outside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_outside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.add_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+ /*
+ * Create the delete Template
+ */
+ cnat_nfv9_template_info.del_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DEL_TEMPLATE_ID);
+ cnat_nfv9_template_info.del_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DEL_FIELD_COUNT);
+
+ cnat_nfv9_template_info.del_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.del_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.del_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.del_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.del_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+
+ /* Create NAT64 BIB Add template */
+#if 0
+ cnat_nfv9_template_info.nat64_add_bib_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_ADD_BIB_TEMPLATE_ID);
+ cnat_nfv9_template_info.nat64_add_bib_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_ADD_BIB_FIELD_COUNT);
+
+
+ cnat_nfv9_template_info.nat64_add_bib_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_bib_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_bib_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_bib_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_bib_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_bib_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_bib_outside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_bib_outside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_bib_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_bib_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+
+ /* NAT64 BIB Delete */
+ cnat_nfv9_template_info.nat64_del_bib_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_DEL_BIB_TEMPLATE_ID);
+ cnat_nfv9_template_info.nat64_del_bib_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_DEL_BIB_FIELD_COUNT);
+
+ cnat_nfv9_template_info.nat64_del_bib_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_bib_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_del_bib_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_bib_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_del_bib_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_bib_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+
+ /* NAT64 SESSION ADD */
+
+ cnat_nfv9_template_info.nat64_add_session_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_ADD_SESSION_TEMPLATE_ID);
+ cnat_nfv9_template_info.nat64_add_session_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_ADD_SESSION_FIELD_COUNT);
+
+
+ cnat_nfv9_template_info.nat64_add_session_inside_ipv6_src_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_inside_ipv6_src_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_session_outside_ip_src_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_outside_ip_src_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+
+ cnat_nfv9_template_info.nat64_add_session_inside_ipv6_dst_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_DST_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_inside_ipv6_dst_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_DST_ADDR_FIELD_SIZE);
+
+
+ cnat_nfv9_template_info.nat64_add_session_outside_ip_dst_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_DST_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_outside_ip_dst_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_DST_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_session_inside_ip_src_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_inside_ip_src_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+
+ cnat_nfv9_template_info.nat64_add_session_outside_ip_src_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_outside_ip_src_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE);
+
+
+ cnat_nfv9_template_info.nat64_add_session_ip_dest_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_ip_dest_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_add_session_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_add_session_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+
+
+ /* Session Delete */
+ cnat_nfv9_template_info.nat64_del_session_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_DEL_SESSION_TEMPLATE_ID);
+ cnat_nfv9_template_info.nat64_del_session_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT64_DEL_SESSION_FIELD_COUNT);
+
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_src_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_src_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_dst_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_DST_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_dst_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_DST_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_src_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_src_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_dst_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_session_inside_ip_dst_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat64_del_session_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.nat64_del_session_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+#endif
+ /*
+ * Create the nat44 session add Template
+ */
+ cnat_nfv9_template_info.nat44_session_add_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_ADD_SESSION_TEMPLATE_ID);
+ cnat_nfv9_template_info.nat44_session_add_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_ADD_SESSION_FIELD_COUNT);
+
+ cnat_nfv9_template_info.nat44_session_add_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_outside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_outside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_outside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_outside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_dest_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_dest_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_dest_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_dest_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_add_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_add_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+ /*
+ * Create the nat44 session del Template
+ */
+ cnat_nfv9_template_info.nat44_session_del_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_DEL_SESSION_TEMPLATE_ID);
+ cnat_nfv9_template_info.nat44_session_del_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_DEL_SESSION_FIELD_COUNT);
+
+ cnat_nfv9_template_info.nat44_session_del_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_del_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_del_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_del_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_del_dest_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_del_dest_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_del_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_del_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_del_dest_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_del_dest_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.nat44_session_del_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.nat44_session_del_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
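+
+/*
+ * Wire shape being assembled here (editor's sketch of a NetFlow v9
+ * template FlowSet per RFC 3954; the struct below is generic, not a
+ * type from this patch): flowset_id 0 marks a template FlowSet, and
+ * every u16 goes out in network byte order -- hence the
+ * clib_host_to_net_u16() on each assignment.
+ *
+ *   u16 flowset_id;     // 0 for template FlowSets
+ *   u16 length;         // FlowSet length in bytes, header included
+ *   u16 template_id;    // per template, e.g. the NAT44 add-session id
+ *   u16 field_count;    // number of (type, size) pairs that follow
+ *   struct { u16 type; u16 size; } fields[];   // field_count entries
+ */
+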
+ /*
+ * Ds-lite add template
+ */
+#if 0
+ cnat_nfv9_template_info.add_dslite_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_ADD_TEMPLATE_ID);
+ cnat_nfv9_template_info.add_dslite_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_ADD_FIELD_COUNT);
+
+ cnat_nfv9_template_info.add_dslite_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_outside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_outside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_outside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_outside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+ /*
+ * Ds-lite delete template
+ */
+ cnat_nfv9_template_info.del_dslite_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_DEL_TEMPLATE_ID);
+ cnat_nfv9_template_info.del_dslite_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_DEL_FIELD_COUNT);
+
+ cnat_nfv9_template_info.del_dslite_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+ /*
+ * Ds-lite session add template
+ */
+
+ cnat_nfv9_template_info.add_dslite_session_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_ADD_SESSION_TEMPLATE_ID);
+ cnat_nfv9_template_info.add_dslite_session_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_ADD_SESSION_FIELD_COUNT);
+
+ cnat_nfv9_template_info.add_dslite_session_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_outside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_outside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_outside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_outside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_dest_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_dest_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_dest_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_dest_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.add_dslite_session_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.add_dslite_session_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+ /*
+ * Ds-lite session delete template
+ */
+ cnat_nfv9_template_info.del_dslite_session_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_DEL_SESSION_TEMPLATE_ID);
+ cnat_nfv9_template_info.del_dslite_session_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_DEL_SESSION_FIELD_COUNT);
+
+ cnat_nfv9_template_info.del_dslite_session_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_session_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_session_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_session_inside_ip_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_inside_ip_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_session_dest_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_dest_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_session_dest_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_dest_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE);
+
+ cnat_nfv9_template_info.del_dslite_session_protocol_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_TYPE);
+ cnat_nfv9_template_info.del_dslite_session_protocol_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_PROTOCOL_FIELD_SIZE);
+
+ /* Create add bulk template */
+ cnat_nfv9_template_info.bulk_add_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_BULK_ADD_TEMPLATE_ID);
+ cnat_nfv9_template_info.bulk_add_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_BULK_ADD_FIELD_COUNT);
+
+ cnat_nfv9_template_info.bulk_add_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_add_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_add_outside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_add_outside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_add_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_add_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_add_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_add_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_add_outside_start_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_add_outside_start_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_add_outside_end_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_END_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_add_outside_end_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_END_FIELD_SIZE);
+
+ /*
+ * Create the bulk delete Template
+ */
+ cnat_nfv9_template_info.bulk_del_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_BULK_DEL_TEMPLATE_ID);
+ cnat_nfv9_template_info.bulk_del_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_NAT44_BULK_DEL_FIELD_COUNT);
+
+ cnat_nfv9_template_info.bulk_del_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_del_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_del_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_del_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_del_outside_start_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_del_outside_start_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_SIZE);
+
+ /*
+ * Ds-lite bulk add template
+ */
+ cnat_nfv9_template_info.bulk_dslite_add_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_BULK_ADD_TEMPLATE_ID);
+ cnat_nfv9_template_info.bulk_dslite_add_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_BULK_ADD_FIELD_COUNT);
+
+ cnat_nfv9_template_info.bulk_dslite_add_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_add_outside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_outside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_add_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_add_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_add_outside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_outside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_add_outside_start_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_outside_start_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_add_outside_end_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_END_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_add_outside_end_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_END_FIELD_SIZE);
+
+ /*
+ * Ds-lite bulk delete template
+ */
+
+ cnat_nfv9_template_info.bulk_dslite_del_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_BULK_DEL_TEMPLATE_ID);
+ cnat_nfv9_template_info.bulk_dslite_del_field_count =
+ clib_host_to_net_u16(CNAT_NFV9_DS_LITE_BULK_DEL_FIELD_COUNT);
+
+ cnat_nfv9_template_info.bulk_dslite_del_inside_vrf_id_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_del_inside_vrf_id_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_del_inside_ip_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_del_inside_ip_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_del_inside_ipv6_addr_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_del_inside_ipv6_addr_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE);
+
+ cnat_nfv9_template_info.bulk_dslite_del_outside_start_port_field_type =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_TYPE);
+ cnat_nfv9_template_info.bulk_dslite_del_outside_start_port_field_size =
+ clib_host_to_net_u16(CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_SIZE);
+
+#endif /* #if 0 - DS-Lite and bulk templates disabled */
+
+ /*
+ * Ingress vrfid - name mapping
+ */
+ CNAT_NFV9_OPTION_TEMPLATE.flowset_id =
+ clib_host_to_net_u16(CNAT_NFV9_OPTION_TEMPLATE_FLOWSET_ID);
+ CNAT_NFV9_OPTION_TEMPLATE.length =
+ clib_host_to_net_u16(CNAT_NFV9_OPTION_TEMPLATE_LENGTH);
+
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_template_id =
+ clib_host_to_net_u16(CNAT_NFV9_INGRESS_VRF_ID_NAME_TEMPLATE_ID);
+ /* currently no scope field supported */
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_scope_len = 0;
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_option_len =
+ clib_host_to_net_u16(CNAT_NFV9_INGRESS_VRF_ID_NAME_OPTION_LEN);
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_vrfid_option_type =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE);
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_vrfid_option_len =
+ clib_host_to_net_u16(CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE);
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_vrfname_option_type =
+ clib_host_to_net_u16(CNAT_NFV9_INGRESS_VRF_NAME_FIELD_TYPE);
+ CNAT_NFV9_OPTION_TEMPLATE.ingress_vrfid_name_map_vrfname_option_len =
+ clib_host_to_net_u16(CNAT_NFV9_INGRESS_VRF_NAME_FIELD_SIZE);
+
+ /*
+ * Set the padding (which was added to make the size of the template
+ * a multiple of 4) to zero
+ */
+ CNAT_NFV9_OPTION_TEMPLATE.padding1 = 0;
+}
+
+/*
+ * One-time initialization function.
+ * Must be called at init time; subsequent calls are no-ops
+ * (guarded by cnat_nfv9_global_info.cnat_nfv9_init_done)
+ */
+void cnat_nfv9_logging_init(void)
+{
+ if (!cnat_nfv9_global_info.cnat_nfv9_init_done) {
+ cnat_nfv9_template_init();
+
+ /* Pre-allocate NFV9_SERVER_POOL_SIZE entries, which should be
+ * sufficient for most deployments
+ */
+ pool_alloc(nfv9_server_info_pool, NFV9_SERVER_POOL_SIZE);
+ int i;
+ nfv9_server_info_t *server __attribute__((unused));
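+ /*
+ * Warm-up idiom: get and then put back every element once so the
+ * pool vector and its free-element list are fully built here at
+ * init time rather than growing on the first runtime allocations
+ */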
+ for(i = 0; i < NFV9_SERVER_POOL_SIZE; i++) {
+ pool_get(nfv9_server_info_pool, server);
+ }
+
+ for(i = 0; i < NFV9_SERVER_POOL_SIZE; i++) {
+ pool_put(nfv9_server_info_pool, nfv9_server_info_pool + i);
+ }
+
+ memset(&cnat_nfv9_global_info, 0, sizeof(cnat_nfv9_global_info_t));
+ ASSERT(cnat_nfv9_global_info.cnat_nfv9_disp_node_index != (u16)~0);
+
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index = EMPTY;
+ cnat_nfv9_global_info.cnat_nfv9_init_done = 1;
+
+ /*
+ * src id is set to infra IPv4 address + octeon core number
+ */
+ nfv9_src_id = my_instance_number;
+ }
+}
diff --git a/vnet/vnet/vcgn/cnat_logging.h b/vnet/vnet/vcgn/cnat_logging.h
new file mode 100644
index 00000000000..7bd43ecf21e
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_logging.h
@@ -0,0 +1,1091 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_logging.h
+ *
+ * Copyright (c) 2009, 2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_LOGGING_H__
+#define __CNAT_LOGGING_H__
+
+#include <stdio.h>
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+
+#include "nat64_db.h"
+#include "cnat_log_common.h"
+#include "dslite_defs.h"
+
+#define NFV9_DEF_PATH_MTU 1500
+#define NFV9_VRF_NAME_LEN 12
+
+/* one time call at the beginning */
+void cnat_nfv9_logging_init(void);
+
+/*
+ * Unconditional call;
+ * checks the logging config internally
+ */
+void cnat_nfv9_log_mapping_create(cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+
+void cnat_nfv9_nat44_log_session_create(cnat_main_db_entry_t * db,
+ cnat_session_entry_t * sdb,
+ cnat_vrfmap_t *vrfmap);
+
+void cnat_nfv9_nat44_log_session_delete(cnat_main_db_entry_t * db,
+ cnat_session_entry_t * sdb,
+ cnat_vrfmap_t *vrfmap);
+
+
+/*
+ * Unconditional call;
+ * checks the logging config internally
+ */
+void cnat_nfv9_log_mapping_delete(cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+
+/* nat44 syslog APIs */
+void cnat_syslog_nat44_mapping_create(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap, cnat_session_entry_t * sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+
+void cnat_syslog_nat44_mapping_delete(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap, cnat_session_entry_t *sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+
+/*
+ * dslite
+ */
+void cnat_nfv9_ds_lite_mapping_create(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+
+void cnat_nfv9_ds_lite_mapping_delete(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+void cnat_nfv9_ds_lite_log_session_create(cnat_main_db_entry_t * db,
+ dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t * sdb);
+
+void cnat_nfv9_ds_lite_log_session_delete(cnat_main_db_entry_t * db,
+ dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t * sdb);
+
+/*
+ * nat64
+ */
+
+void cnat_nfv9_bib_mapping_create (nat64_bib_entry_t *db,
+ nat64_table_entry_t *nat64_entry);
+
+void cnat_nfv9_session_mapping_create (nat64_bib_entry_t *bdb,
+ nat64_session_entry_t *sdb,
+ nat64_table_entry_t *nat64_entry_ptr);
+
+void cnat_nfv9_bib_mapping_delete (nat64_bib_entry_t *db,
+ nat64_table_entry_t *nat64_entry);
+
+void cnat_nfv9_session_mapping_delete (nat64_bib_entry_t *bdb,
+ nat64_session_entry_t *sdb,
+ nat64_table_entry_t *nat64_entry_ptr);
+
+typedef enum {
+ RECORD_INVALID = 0,
+ NAT44_ADD_RECORD,
+ NAT44_DEL_RECORD,
+ NAT64_ADD_BIB_RECORD,
+ NAT64_DEL_BIB_RECORD,
+ NAT64_ADD_SESSION_RECORD,
+ NAT64_DEL_SESSION_RECORD,
+ DS_LITE_ADD_RECORD,
+ DS_LITE_DEL_RECORD,
+ NAT44_BULK_ADD_RECORD,
+ NAT44_BULK_DEL_RECORD,
+ DS_LITE_BULK_ADD_RECORD,
+ DS_LITE_BULK_DEL_RECORD,
+ INGRESS_VRF_ID_NAME_RECORD,
+ NAT44_ADD_SESSION_RECORD,
+ NAT44_DEL_SESSION_RECORD,
+ DS_LITE_ADD_SESSION_RECORD,
+ DS_LITE_DEL_SESSION_RECORD,
+ MAX_RECORDS
+} netflow_record;
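+
+/*
+ * These enum values index the per-record bookkeeping arrays
+ * (record_length[MAX_RECORDS], record[MAX_RECORDS]) kept in
+ * cnat_nfv9_logging_info_t below
+ */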
+
+typedef enum {
+ TEMPLATE_SENT_FALSE = 0,
+ TEMPLATE_SENT_TRUE = 1
+} netflow_template_sent;
+
+#define cnat_nfv9_get_sys_up_time_in_ms cnat_get_sys_up_time_in_ms
+
+#define cnat_nfv9_get_unix_time_in_seconds cnat_get_unix_time_in_seconds
+
+#define cnat_nfv9_dump_time_change_logs cnat_dump_time_change_logs
+
+
+/*
+ * Netflow V9 Specific Defines and structures
+ */
+
+#define CNAT_NFV9_VERSION_NUMBER 9
+
+#define CNAT_NFV9_TEMPLATE_FLOWSET_ID 0
+#define CNAT_NFV9_OPTION_TEMPLATE_FLOWSET_ID 1
+
+#define CNAT_NFV9_ADD_FIELD_COUNT 7
+#define CNAT_NFV9_DEL_FIELD_COUNT 4
+#define CNAT_NFV9_DS_LITE_ADD_FIELD_COUNT 8
+#define CNAT_NFV9_DS_LITE_DEL_FIELD_COUNT 5
+#define CNAT_NFV9_NAT64_ADD_BIB_FIELD_COUNT 5
+#define CNAT_NFV9_NAT64_DEL_BIB_FIELD_COUNT 3
+#define CNAT_NFV9_NAT64_ADD_SESSION_FIELD_COUNT 8
+#define CNAT_NFV9_NAT64_DEL_SESSION_FIELD_COUNT 5
+#define CNAT_NFV9_NAT44_ADD_SESSION_FIELD_COUNT 9
+#define CNAT_NFV9_NAT44_DEL_SESSION_FIELD_COUNT 6
+#define CNAT_NFV9_DS_LITE_ADD_SESSION_FIELD_COUNT 10
+#define CNAT_NFV9_DS_LITE_DEL_SESSION_FIELD_COUNT 7
+
+#define CNAT_NFV9_ADD_TEMPLATE_ID 256
+#define CNAT_NFV9_DEL_TEMPLATE_ID 257
+#define CNAT_NFV9_NAT64_ADD_BIB_TEMPLATE_ID 258
+#define CNAT_NFV9_NAT64_DEL_BIB_TEMPLATE_ID 259
+#define CNAT_NFV9_NAT64_ADD_SESSION_TEMPLATE_ID 260
+#define CNAT_NFV9_NAT64_DEL_SESSION_TEMPLATE_ID 261
+#define CNAT_NFV9_INGRESS_VRF_ID_NAME_TEMPLATE_ID 262
+#define CNAT_NFV9_DS_LITE_ADD_TEMPLATE_ID 267
+#define CNAT_NFV9_DS_LITE_DEL_TEMPLATE_ID 268
+#define CNAT_NFV9_NAT44_ADD_SESSION_TEMPLATE_ID 271
+#define CNAT_NFV9_NAT44_DEL_SESSION_TEMPLATE_ID 272
+#define CNAT_NFV9_DS_LITE_ADD_SESSION_TEMPLATE_ID 273
+#define CNAT_NFV9_DS_LITE_DEL_SESSION_TEMPLATE_ID 274
+
+#ifndef NO_BULK_LOGGING
+#define CNAT_NFV9_NAT44_BULK_ADD_TEMPLATE_ID 265
+#define CNAT_NFV9_NAT44_BULK_DEL_TEMPLATE_ID 266
+#define CNAT_NFV9_DS_LITE_BULK_ADD_TEMPLATE_ID 269
+#define CNAT_NFV9_DS_LITE_BULK_DEL_TEMPLATE_ID 270
+
+#define CNAT_NFV9_NAT44_BULK_ADD_FIELD_COUNT 6
+#define CNAT_NFV9_NAT44_BULK_DEL_FIELD_COUNT 3
+#define CNAT_NFV9_DS_LITE_BULK_ADD_FIELD_COUNT 7
+#define CNAT_NFV9_DS_LITE_BULK_DEL_FIELD_COUNT 4
+
+#define CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_TYPE 361
+#define CNAT_NFV9_OUTSIDE_IP_PORT_START_FIELD_SIZE 2
+
+#define CNAT_NFV9_OUTSIDE_IP_PORT_END_FIELD_TYPE 362
+#define CNAT_NFV9_OUTSIDE_IP_PORT_END_FIELD_SIZE 2
+
+#endif /* #ifndef NO_BULK_LOGGING */
+
+#define CNAT_NFV9_INGRESS_VRF_NAME_FIELD_TYPE 236
+#define CNAT_NFV9_INGRESS_VRF_NAME_FIELD_SIZE 12
+/* 2 option field specifiers (2-byte type + 2-byte size each) = 8 bytes */
+#define CNAT_NFV9_INGRESS_VRF_ID_NAME_OPTION_LEN 8
+extern u16 cnat_template_id[MAX_RECORDS];
+
+#define CNAT_NFV9_INSIDE_VRFID_FIELD_TYPE 234
+#define CNAT_NFV9_INSIDE_VRFID_FIELD_SIZE 4
+
+#define CNAT_NFV9_OUTSIDE_VRFID_FIELD_TYPE 235
+#define CNAT_NFV9_OUTSIDE_VRFID_FIELD_SIZE 4
+
+#define CNAT_NFV9_INSIDE_IP_ADDR_FIELD_TYPE 8
+#define CNAT_NFV9_INSIDE_IP_ADDR_FIELD_SIZE 4
+
+#define CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_TYPE 225
+#define CNAT_NFV9_OUTSIDE_IP_ADDR_FIELD_SIZE 4
+
+#define CNAT_NFV9_INSIDE_IP_PORT_FIELD_TYPE 7
+#define CNAT_NFV9_INSIDE_IP_PORT_FIELD_SIZE 2
+
+#define CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_TYPE 227
+#define CNAT_NFV9_OUTSIDE_IP_PORT_FIELD_SIZE 2
+
+#define CNAT_NFV9_PROTOCOL_FIELD_TYPE 4
+#define CNAT_NFV9_PROTOCOL_FIELD_SIZE 1
+
+/* IPv6 related info */
+
+#define CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_TYPE 27
+#define CNAT_NFV9_INSIDE_IPV6_SRC_ADDR_FIELD_SIZE 16
+
+#define CNAT_NFV9_INSIDE_IPV6_DST_ADDR_FIELD_TYPE 28
+#define CNAT_NFV9_INSIDE_IPV6_DST_ADDR_FIELD_SIZE 16
+
+#define CNAT_NFV9_OUTSIDE_IP_DST_ADDR_FIELD_TYPE 226
+#define CNAT_NFV9_OUTSIDE_IP_DST_ADDR_FIELD_SIZE 4
+
+#define CNAT_NFV9_INSIDE_DST_PORT_FIELD_TYPE 11
+#define CNAT_NFV9_INSIDE_DST_PORT_FIELD_SIZE 2
+
+#define CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_TYPE 12
+#define CNAT_NFV9_DESTINATION_IP_ADDR_FIELD_SIZE 4
+
+
+typedef struct {
+ u16 version;
+ u16 count;
+ u32 sys_up_time; /* time in ms since system was booted */
+ u32 timestamp; /* UNIX time in seconds since 1970 */
+ u32 sequence_num;
+ u32 source_id;
+} nfv9_header_t;
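+
+/*
+ * This matches the NetFlow v9 export packet header layout (RFC 3954:
+ * version, count, sysUpTime, UNIX secs, sequence number, source id);
+ * as in the template init code, multi-byte fields are expected to be
+ * stored in network byte order via clib_host_to_net_u16/u32
+ */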
+
+/*
+ * Hardcoded - needs to be fixed
+ */
+#define CNAT_NFV9_SOURCE_ID_VALUE 0x1234
+
+typedef struct {
+ u16 flowset_id;
+ u16 length;
+
+ u16 ingress_vrfid_name_map_template_id;
+ u16 ingress_vrfid_name_map_scope_len;
+ u16 ingress_vrfid_name_map_option_len;
+ u16 ingress_vrfid_name_map_vrfid_option_type;
+ u16 ingress_vrfid_name_map_vrfid_option_len;
+ u16 ingress_vrfid_name_map_vrfname_option_type;
+ u16 ingress_vrfid_name_map_vrfname_option_len;
+ /*
+ * Padding added so that the template
+ * structure ends on a 4 byte boundary
+ */
+ u16 padding1;
+
+} cnat_nfv9_option_template_t;
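+
+/*
+ * With padding1 the option template is 10 u16 fields = 20 bytes,
+ * a multiple of 4 as the comment above requires
+ */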
+
+/*
+ * The following structure defines the Netflow Template that
+ * will be exported to the Netflow Collector
+ */
+
+typedef struct {
+ u16 flowset_id;
+ u16 length;
+
+ u16 add_template_id;
+ u16 add_field_count;
+ u16 add_inside_vrf_id_field_type;
+ u16 add_inside_vrf_id_field_size;
+ u16 add_outside_vrf_id_field_type;
+ u16 add_outside_vrf_id_field_size;
+ u16 add_inside_ip_addr_field_type;
+ u16 add_inside_ip_addr_field_size;
+ u16 add_outside_ip_addr_field_type;
+ u16 add_outside_ip_addr_field_size;
+ u16 add_inside_ip_port_field_type;
+ u16 add_inside_ip_port_field_size;
+ u16 add_outside_ip_port_field_type;
+ u16 add_outside_ip_port_field_size;
+ u16 add_protocol_field_type;
+ u16 add_protocol_field_size;
+
+ u16 del_template_id;
+ u16 del_field_count;
+ u16 del_inside_vrf_id_field_type;
+ u16 del_inside_vrf_id_field_size;
+ u16 del_inside_ip_addr_field_type;
+ u16 del_inside_ip_addr_field_size;
+ u16 del_inside_ip_port_field_type;
+ u16 del_inside_ip_port_field_size;
+ u16 del_protocol_field_type;
+ u16 del_protocol_field_size;
+#if 0
+ /* NAT64 related info */
+ u16 nat64_add_bib_template_id;
+ u16 nat64_add_bib_field_count;
+ u16 nat64_add_bib_inside_ipv6_addr_field_type;
+ u16 nat64_add_bib_inside_ipv6_addr_field_size;
+ u16 nat64_add_bib_outside_ip_addr_field_type;
+ u16 nat64_add_bib_outside_ip_addr_field_size;
+ u16 nat64_add_bib_inside_ip_port_field_type;
+ u16 nat64_add_bib_inside_ip_port_field_size;
+ u16 nat64_add_bib_outside_ip_port_field_type;
+ u16 nat64_add_bib_outside_ip_port_field_size;
+ u16 nat64_add_bib_protocol_field_type;
+ u16 nat64_add_bib_protocol_field_size;
+
+ u16 nat64_del_bib_template_id;
+ u16 nat64_del_bib_field_count;
+ u16 nat64_del_bib_inside_ip_addr_field_type;
+ u16 nat64_del_bib_inside_ip_addr_field_size;
+ u16 nat64_del_bib_inside_ip_port_field_type;
+ u16 nat64_del_bib_inside_ip_port_field_size;
+ u16 nat64_del_bib_protocol_field_type;
+ u16 nat64_del_bib_protocol_field_size;
+
+
+ u16 nat64_add_session_template_id;
+ u16 nat64_add_session_field_count;
+ u16 nat64_add_session_inside_ipv6_src_addr_field_type;
+ u16 nat64_add_session_inside_ipv6_src_addr_field_size;
+ u16 nat64_add_session_outside_ip_src_addr_field_type;
+ u16 nat64_add_session_outside_ip_src_addr_field_size;
+ u16 nat64_add_session_inside_ipv6_dst_addr_field_type;
+ u16 nat64_add_session_inside_ipv6_dst_addr_field_size;
+ u16 nat64_add_session_outside_ip_dst_addr_field_type;
+ u16 nat64_add_session_outside_ip_dst_addr_field_size;
+ u16 nat64_add_session_inside_ip_src_port_field_type;
+ u16 nat64_add_session_inside_ip_src_port_field_size;
+ u16 nat64_add_session_outside_ip_src_port_field_type;
+ u16 nat64_add_session_outside_ip_src_port_field_size;
+ u16 nat64_add_session_ip_dest_port_field_type;
+ u16 nat64_add_session_ip_dest_port_field_size;
+ u16 nat64_add_session_protocol_field_type;
+ u16 nat64_add_session_protocol_field_size;
+
+ u16 nat64_del_session_template_id;
+ u16 nat64_del_session_field_count;
+ u16 nat64_del_session_inside_ip_src_addr_field_type;
+ u16 nat64_del_session_inside_ip_src_addr_field_size;
+ u16 nat64_del_session_inside_ip_dst_addr_field_type;
+ u16 nat64_del_session_inside_ip_dst_addr_field_size;
+ u16 nat64_del_session_inside_ip_src_port_field_type;
+ u16 nat64_del_session_inside_ip_src_port_field_size;
+ u16 nat64_del_session_inside_ip_dst_port_field_type;
+ u16 nat64_del_session_inside_ip_dst_port_field_size;
+ u16 nat64_del_session_protocol_field_type;
+ u16 nat64_del_session_protocol_field_size;
+
+ /*
+ * Ds-Lite specific info
+ */
+ u16 add_dslite_template_id;
+ u16 add_dslite_field_count;
+ u16 add_dslite_inside_vrf_id_field_type;
+ u16 add_dslite_inside_vrf_id_field_size;
+ u16 add_dslite_outside_vrf_id_field_type;
+ u16 add_dslite_outside_vrf_id_field_size;
+ u16 add_dslite_inside_ip_addr_field_type;
+ u16 add_dslite_inside_ip_addr_field_size;
+ u16 add_dslite_inside_ipv6_addr_field_type;
+ u16 add_dslite_inside_ipv6_addr_field_size;
+ u16 add_dslite_outside_ip_addr_field_type;
+ u16 add_dslite_outside_ip_addr_field_size;
+ u16 add_dslite_inside_ip_port_field_type;
+ u16 add_dslite_inside_ip_port_field_size;
+ u16 add_dslite_outside_ip_port_field_type;
+ u16 add_dslite_outside_ip_port_field_size;
+ u16 add_dslite_protocol_field_type;
+ u16 add_dslite_protocol_field_size;
+
+ u16 del_dslite_template_id;
+ u16 del_dslite_field_count;
+ u16 del_dslite_inside_vrf_id_field_type;
+ u16 del_dslite_inside_vrf_id_field_size;
+ u16 del_dslite_inside_ip_addr_field_type;
+ u16 del_dslite_inside_ip_addr_field_size;
+ u16 del_dslite_inside_ipv6_addr_field_type;
+ u16 del_dslite_inside_ipv6_addr_field_size;
+ u16 del_dslite_inside_ip_port_field_type;
+ u16 del_dslite_inside_ip_port_field_size;
+ u16 del_dslite_protocol_field_type;
+ u16 del_dslite_protocol_field_size;
+#endif
+
+//#ifndef NO_BULK_LOGGING /* commented out for the time being */
+#if 0
+ u16 bulk_add_template_id;
+ u16 bulk_add_field_count;
+ u16 bulk_add_inside_vrf_id_field_type;
+ u16 bulk_add_inside_vrf_id_field_size;
+ u16 bulk_add_outside_vrf_id_field_type;
+ u16 bulk_add_outside_vrf_id_field_size;
+ u16 bulk_add_inside_ip_addr_field_type;
+ u16 bulk_add_inside_ip_addr_field_size;
+ u16 bulk_add_outside_ip_addr_field_type;
+ u16 bulk_add_outside_ip_addr_field_size;
+ u16 bulk_add_outside_start_port_field_type;
+ u16 bulk_add_outside_start_port_field_size;
+ u16 bulk_add_outside_end_port_field_type;
+ u16 bulk_add_outside_end_port_field_size;
+
+ u16 bulk_del_template_id;
+ u16 bulk_del_field_count;
+ u16 bulk_del_inside_vrf_id_field_type;
+ u16 bulk_del_inside_vrf_id_field_size;
+ u16 bulk_del_inside_ip_addr_field_type;
+ u16 bulk_del_inside_ip_addr_field_size;
+ u16 bulk_del_outside_start_port_field_type;
+ u16 bulk_del_outside_start_port_field_size;
+
+ /* ds-lite bulk logging create delete event */
+
+ u16 bulk_dslite_add_template_id;
+ u16 bulk_dslite_add_field_count;
+ u16 bulk_dslite_add_inside_vrf_id_field_type;
+ u16 bulk_dslite_add_inside_vrf_id_field_size;
+ u16 bulk_dslite_add_outside_vrf_id_field_type;
+ u16 bulk_dslite_add_outside_vrf_id_field_size;
+ u16 bulk_dslite_add_inside_ip_addr_field_type;
+ u16 bulk_dslite_add_inside_ip_addr_field_size;
+ u16 bulk_dslite_add_inside_ipv6_addr_field_type;
+ u16 bulk_dslite_add_inside_ipv6_addr_field_size;
+ u16 bulk_dslite_add_outside_ip_addr_field_type;
+ u16 bulk_dslite_add_outside_ip_addr_field_size;
+ u16 bulk_dslite_add_outside_start_port_field_type;
+ u16 bulk_dslite_add_outside_start_port_field_size;
+ u16 bulk_dslite_add_outside_end_port_field_type;
+ u16 bulk_dslite_add_outside_end_port_field_size;
+
+ u16 bulk_dslite_del_template_id;
+ u16 bulk_dslite_del_field_count;
+ u16 bulk_dslite_del_inside_vrf_id_field_type;
+ u16 bulk_dslite_del_inside_vrf_id_field_size;
+ u16 bulk_dslite_del_inside_ip_addr_field_type;
+ u16 bulk_dslite_del_inside_ip_addr_field_size;
+ u16 bulk_dslite_del_inside_ipv6_addr_field_type;
+ u16 bulk_dslite_del_inside_ipv6_addr_field_size;
+ u16 bulk_dslite_del_outside_start_port_field_type;
+ u16 bulk_dslite_del_outside_start_port_field_size;
+
+#endif /* #if 0 - bulk logging fields disabled */
+
+ u16 nat44_session_add_template_id;
+ u16 nat44_session_add_field_count;
+ u16 nat44_session_add_inside_vrf_id_field_type;
+ u16 nat44_session_add_inside_vrf_id_field_size;
+ u16 nat44_session_add_outside_vrf_id_field_type;
+ u16 nat44_session_add_outside_vrf_id_field_size;
+ u16 nat44_session_add_inside_ip_addr_field_type;
+ u16 nat44_session_add_inside_ip_addr_field_size;
+ u16 nat44_session_add_outside_ip_addr_field_type;
+ u16 nat44_session_add_outside_ip_addr_field_size;
+ u16 nat44_session_add_inside_ip_port_field_type;
+ u16 nat44_session_add_inside_ip_port_field_size;
+ u16 nat44_session_add_outside_ip_port_field_type;
+ u16 nat44_session_add_outside_ip_port_field_size;
+ u16 nat44_session_add_dest_ip_addr_field_type;
+ u16 nat44_session_add_dest_ip_addr_field_size;
+ u16 nat44_session_add_dest_port_field_type;
+ u16 nat44_session_add_dest_port_field_size;
+ u16 nat44_session_add_protocol_field_type;
+ u16 nat44_session_add_protocol_field_size;
+
+ u16 nat44_session_del_template_id;
+ u16 nat44_session_del_field_count;
+ u16 nat44_session_del_inside_vrf_id_field_type;
+ u16 nat44_session_del_inside_vrf_id_field_size;
+ u16 nat44_session_del_inside_ip_addr_field_type;
+ u16 nat44_session_del_inside_ip_addr_field_size;
+ u16 nat44_session_del_dest_ip_addr_field_type;
+ u16 nat44_session_del_dest_ip_addr_field_size;
+ u16 nat44_session_del_inside_ip_port_field_type;
+ u16 nat44_session_del_inside_ip_port_field_size;
+ u16 nat44_session_del_dest_port_field_type;
+ u16 nat44_session_del_dest_port_field_size;
+ u16 nat44_session_del_protocol_field_type;
+ u16 nat44_session_del_protocol_field_size;
+
+#if 0
+ u16 add_dslite_session_template_id;
+ u16 add_dslite_session_field_count;
+ u16 add_dslite_session_inside_vrf_id_field_type;
+ u16 add_dslite_session_inside_vrf_id_field_size;
+ u16 add_dslite_session_outside_vrf_id_field_type;
+ u16 add_dslite_session_outside_vrf_id_field_size;
+ u16 add_dslite_session_inside_ip_addr_field_type;
+ u16 add_dslite_session_inside_ip_addr_field_size;
+ u16 add_dslite_session_inside_ipv6_addr_field_type;
+ u16 add_dslite_session_inside_ipv6_addr_field_size;
+ u16 add_dslite_session_outside_ip_addr_field_type;
+ u16 add_dslite_session_outside_ip_addr_field_size;
+ u16 add_dslite_session_inside_ip_port_field_type;
+ u16 add_dslite_session_inside_ip_port_field_size;
+ u16 add_dslite_session_outside_ip_port_field_type;
+ u16 add_dslite_session_outside_ip_port_field_size;
+ u16 add_dslite_session_dest_ip_addr_field_type;
+ u16 add_dslite_session_dest_ip_addr_field_size;
+ u16 add_dslite_session_dest_port_field_type;
+ u16 add_dslite_session_dest_port_field_size;
+ u16 add_dslite_session_protocol_field_type;
+ u16 add_dslite_session_protocol_field_size;
+
+ u16 del_dslite_session_template_id;
+ u16 del_dslite_session_field_count;
+ u16 del_dslite_session_inside_vrf_id_field_type;
+ u16 del_dslite_session_inside_vrf_id_field_size;
+ u16 del_dslite_session_inside_ip_addr_field_type;
+ u16 del_dslite_session_inside_ip_addr_field_size;
+ u16 del_dslite_session_inside_ipv6_addr_field_type;
+ u16 del_dslite_session_inside_ipv6_addr_field_size;
+ u16 del_dslite_session_dest_ip_addr_field_type;
+ u16 del_dslite_session_dest_ip_addr_field_size;
+ u16 del_dslite_session_inside_ip_port_field_type;
+ u16 del_dslite_session_inside_ip_port_field_size;
+ u16 del_dslite_session_dest_port_field_type;
+ u16 del_dslite_session_dest_port_field_size;
+ u16 del_dslite_session_protocol_field_type;
+ u16 del_dslite_session_protocol_field_size;
+#endif
+
+ /*
+ * Netflow option template
+ * Ingress VRF ID - Name mapping
+ * This template will be sent under flowset id 1
+ */
+ cnat_nfv9_option_template_t cnat_nfv9_option_template;
+} cnat_nfv9_template_t;
+
+/*
+ * The Dataflow header for each add/delete record group
+ */
+typedef struct {
+ u16 dataflow_template_id;
+ u16 dataflow_length;
+} nfv9_dataflow_record_header_t;
+
+/*
+ * NFv9 Add record definition
+ */
+
+/*
+ * pad bytes needed to make the structure a multiple of 4 bytes
+ */
+#define CNAT_NFV9_ADD_RECORD_PAD_BYTES (3)
+#define CNAT_NFV9_DEL_RECORD_PAD_BYTES (1)
+
+#define CNAT_NFV9_NAT64_ADD_BIB_RECORD_PAD_BYTES (3)
+#define CNAT_NFV9_NAT64_DEL_BIB_RECORD_PAD_BYTES (1)
+#define CNAT_NFV9_NAT64_ADD_SESSION_RECORD_PAD_BYTES (1)
+#define CNAT_NFV9_NAT64_DEL_SESSION_RECORD_PAD_BYTES (3)
+#define CNAT_NFV9_NAT44_ADD_SESSION_RECORD_PAD_BYTES (1)
+#define CNAT_NFV9_NAT44_DEL_SESSION_RECORD_PAD_BYTES (3)
+
+#define CNAT_NFV9_DS_LITE_ADD_RECORD_PAD_BYTES (3)
+#define CNAT_NFV9_DS_LITE_DEL_RECORD_PAD_BYTES (1)
+#define CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_PAD_BYTES (1)
+#define CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_PAD_BYTES (3)
+
+#define CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_PAD_BYTES (0)
+
+typedef struct {
+ u32 inside_vrf_id;
+ u32 outside_vrf_id;
+ u32 inside_ip_addr;
+ u32 outside_ip_addr;
+ u16 inside_ip_port;
+ u16 outside_ip_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_ADD_RECORD_PAD_BYTES];
+} nfv9_add_record_t;
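+
+/*
+ * Worked example of the pad scheme, assuming the usual 4-byte-aligned
+ * ABI: the add record payload is 4+4+4+4+2+2+1 = 21 bytes, so 3 pad
+ * bytes bring sizeof(nfv9_add_record_t) to 24, and the *_RECORD_LENGTH
+ * macros below subtract the pad to get back to the 21 on-wire bytes
+ */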
+
+/*
+ * NFv9 Delete record definition
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 inside_ip_addr;
+ u16 inside_ip_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_DEL_RECORD_PAD_BYTES];
+} nfv9_del_record_t;
+
+#ifndef NO_BULK_LOGGING
+
+#define CNAT_NFV9_BULK_ADD_RECORD_PAD_BYTES (0)
+#define CNAT_NFV9_BULK_DEL_RECORD_PAD_BYTES (2)
+
+typedef struct {
+ u32 inside_vrf_id;
+ u32 outside_vrf_id;
+ u32 inside_ip_addr;
+ u32 outside_ip_addr;
+ u16 outside_ip_port_start;
+ u16 outside_ip_port_end;
+ u8 pad[CNAT_NFV9_BULK_ADD_RECORD_PAD_BYTES];
+} nfv9_bulk_add_record_t;
+
+/*
+ * NFv9 bulk delete record definition
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 inside_ip_addr;
+ u16 outside_ip_port_start;
+ u8 pad[CNAT_NFV9_BULK_DEL_RECORD_PAD_BYTES];
+} nfv9_bulk_del_record_t;
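+
+/*
+ * A single bulk record covers the whole outside port block
+ * [outside_ip_port_start, outside_ip_port_end] allocated to a
+ * subscriber, so one record stands in for many per-session records
+ */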
+
+/*
+ * DS-lite bulk port (user based) add record definition
+ */
+
+#define CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_PAD_BYTES (0)
+#define CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_PAD_BYTES (2)
+
+typedef struct {
+ u32 inside_vrf_id;
+ u32 outside_vrf_id;
+ u32 inside_ip_addr;
+ u32 inside_v6_src_addr[4];
+ u32 outside_ip_addr;
+ u16 outside_ip_port_start;
+ u16 outside_ip_port_end;
+ u8 pad[CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_PAD_BYTES];
+} nfv9_ds_lite_bulk_add_record_t;
+
+
+/*
+ * DS-lite bulk port (user based) delete record definition
+ */
+
+typedef struct {
+ u32 inside_vrf_id;
+ u32 inside_ip_addr;
+ u32 inside_v6_src_addr[4];
+ u16 outside_ip_port_start;
+ u8 pad[CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_PAD_BYTES];
+} nfv9_ds_lite_bulk_del_record_t;
+
+#endif /* NO_BULK_LOGGING */
+
+/* NAT64 related structures */
+
+typedef struct {
+ u32 inside_v6_src_addr[4];
+ u32 outside_v4_src_addr;
+ u16 inside_src_port;
+ u16 outside_src_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_NAT64_ADD_BIB_RECORD_PAD_BYTES];
+} nfv9_nat64_add_bib_record_t;
+
+
+typedef struct {
+ u32 inside_v6_src_addr[4];
+ u32 outside_v4_src_addr;
+ u32 inside_v6_dest_addr[4];
+ u32 outside_v4_dest_addr;
+ u16 inside_src_port;
+ u16 outside_src_port;
+ u16 dest_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_NAT64_ADD_SESSION_RECORD_PAD_BYTES];
+} nfv9_nat64_add_session_record_t;
+
+
+typedef struct {
+ u32 inside_v6_src_addr[4];
+ u16 inside_src_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_NAT64_DEL_BIB_RECORD_PAD_BYTES];
+} nfv9_nat64_del_bib_record_t;
+
+
+typedef struct {
+ u32 inside_v6_src_addr[4];
+ u32 inside_v6_dest_addr[4];
+ u16 inside_src_port;
+ u16 dest_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_NAT64_DEL_SESSION_RECORD_PAD_BYTES];
+} nfv9_nat64_del_session_record_t;
+
+/*
+ * NFv9 Session based Add record definition
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 outside_vrf_id;
+ u32 inside_ip_addr;
+ u32 outside_ip_addr;
+ u16 inside_ip_port;
+ u16 outside_ip_port;
+ u32 dest_ip_addr;
+ u16 dest_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_NAT44_ADD_SESSION_RECORD_PAD_BYTES];
+} nfv9_add_session_record_t;
+
+/*
+ * NFv9 Session based del record definition
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 inside_ip_addr;
+ u32 dest_ip_addr;
+ u16 inside_ip_port;
+ u16 dest_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_NAT44_DEL_SESSION_RECORD_PAD_BYTES];
+} nfv9_del_session_record_t;
+
+/*
+ * DS-lite NFv9 create record structure
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 outside_vrf_id;
+ u32 inside_ip_addr;
+ u32 inside_v6_src_addr[4];
+ u32 outside_ip_addr;
+ u16 inside_ip_port;
+ u16 outside_ip_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_DS_LITE_ADD_RECORD_PAD_BYTES];
+} nfv9_ds_lite_add_record_t;
+
+typedef struct {
+ u32 inside_vrf_id;
+ u32 inside_ip_addr;
+ u32 inside_v6_src_addr[4];
+ u16 inside_ip_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_DS_LITE_DEL_RECORD_PAD_BYTES];
+} nfv9_ds_lite_del_record_t;
+
+/*
+ * DS-lite NFv9 session based add record definition
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 outside_vrf_id;
+ u32 inside_ip_addr;
+ u32 inside_v6_src_addr[4];
+ u32 outside_ip_addr;
+ u16 inside_ip_port;
+ u16 outside_ip_port;
+ u32 dest_ip_addr;
+ u16 dest_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_PAD_BYTES];
+} nfv9_ds_lite_add_session_record_t;
+
+/*
+ * DS-lite NFv9 session based del record definition
+ */
+typedef struct {
+ u32 inside_vrf_id;
+ u32 inside_ip_addr;
+ u32 inside_v6_src_addr[4];
+ u32 dest_ip_addr;
+ u16 inside_ip_port;
+ u16 dest_port;
+ u8 protocol;
+ u8 pad[CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_PAD_BYTES];
+} nfv9_ds_lite_del_session_record_t;
+
+
+typedef struct {
+ u32 ingress_vrf_id;
+ u8 ingress_vrf_name[NFV9_VRF_NAME_LEN];
+ u8 pad[CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_PAD_BYTES];
+} nfv9_ingress_vrfid_name_record_t;
+
+#define CNAT_NFV9_TEMPLATE_OFFSET \
+ (CNAT_NFV9_HDR_OFFSET + sizeof(nfv9_header_t))
+
+#define CNAT_NFV9_TEMPLATE_LENGTH (sizeof(cnat_nfv9_template_t))
+#define CNAT_NFV9_OPTION_TEMPLATE_LENGTH (sizeof(cnat_nfv9_option_template_t))
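+
+/*
+ * Since the NAT64, DS-Lite and bulk members of cnat_nfv9_template_t
+ * are compiled out under #if 0, sizeof() here automatically tracks
+ * only the fields that are actually enabled
+ */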
+
+#define CNAT_NFV9_DATAFLOW_RECORD_HEADER_LENGTH \
+ (sizeof(nfv9_dataflow_record_header_t))
+
+/*
+ * The on-wire add/delete records carry no padding - subtract the pad bytes
+ */
+
+#define CNAT_NFV9_ADD_RECORD_LENGTH (sizeof(nfv9_add_record_t) - \
+ CNAT_NFV9_ADD_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_DEL_RECORD_LENGTH (sizeof(nfv9_del_record_t) - \
+ CNAT_NFV9_DEL_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_DS_LITE_ADD_RECORD_LENGTH (sizeof(nfv9_ds_lite_add_record_t) - \
+ CNAT_NFV9_DS_LITE_ADD_RECORD_PAD_BYTES)
+#define CNAT_NFV9_DS_LITE_DEL_RECORD_LENGTH (sizeof(nfv9_ds_lite_del_record_t) - \
+ CNAT_NFV9_DS_LITE_DEL_RECORD_PAD_BYTES)
+#ifndef NO_BULK_LOGGING
+#define CNAT_NFV9_BULK_ADD_RECORD_LENGTH (sizeof(nfv9_bulk_add_record_t) - \
+ CNAT_NFV9_BULK_ADD_RECORD_PAD_BYTES)
+#define CNAT_NFV9_BULK_DEL_RECORD_LENGTH (sizeof(nfv9_bulk_del_record_t) - \
+ CNAT_NFV9_BULK_DEL_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_LENGTH (sizeof(nfv9_ds_lite_bulk_add_record_t) - \
+ CNAT_NFV9_DS_LITE_BULK_ADD_RECORD_PAD_BYTES)
+#define CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_LENGTH (sizeof(nfv9_ds_lite_bulk_del_record_t) - \
+ CNAT_NFV9_DS_LITE_BULK_DEL_RECORD_PAD_BYTES)
+
+
+#endif /* NO_BULK_LOGGING */
+
+#define CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_LENGTH (sizeof(nfv9_ingress_vrfid_name_record_t) - \
+ CNAT_NFV9_INGRESS_VRFID_NAME_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_NAT64_ADD_BIB_RECORD_LENGTH \
+ (sizeof(nfv9_nat64_add_bib_record_t) - \
+ CNAT_NFV9_NAT64_ADD_BIB_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_NAT64_DEL_BIB_RECORD_LENGTH \
+ (sizeof(nfv9_nat64_del_bib_record_t) - \
+ CNAT_NFV9_NAT64_DEL_BIB_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_NAT64_ADD_SESSION_RECORD_LENGTH \
+ (sizeof(nfv9_nat64_add_session_record_t) - \
+ CNAT_NFV9_NAT64_ADD_SESSION_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_NAT64_DEL_SESSION_RECORD_LENGTH \
+ (sizeof(nfv9_nat64_del_session_record_t) - \
+ CNAT_NFV9_NAT64_DEL_SESSION_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_MAX_SINGLE_RECORD_LENGTH \
+ (sizeof(nfv9_ds_lite_add_session_record_t) - \
+ CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_NAT44_ADD_SESSION_RECORD_LENGTH \
+ (sizeof(nfv9_add_session_record_t) -\
+ CNAT_NFV9_NAT44_ADD_SESSION_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_NAT44_DEL_SESSION_RECORD_LENGTH \
+ (sizeof(nfv9_del_session_record_t) -\
+ CNAT_NFV9_NAT44_DEL_SESSION_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_LENGTH \
+ (sizeof(nfv9_ds_lite_add_session_record_t) -\
+ CNAT_NFV9_DS_LITE_ADD_SESSION_RECORD_PAD_BYTES)
+
+#define CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_LENGTH \
+ (sizeof(nfv9_ds_lite_del_session_record_t) -\
+ CNAT_NFV9_DS_LITE_DEL_SESSION_RECORD_PAD_BYTES)
+
+/*
+ * Minimum value of the path MTU
+ */
+#define CNAT_NFV9_MIN_RECORD_SIZE (60 + \
+ CNAT_NFV9_DATAFLOW_RECORD_HEADER_LENGTH + \
+ CNAT_NFV9_TEMPLATE_LENGTH + \
+ CNAT_NFV9_MAX_SINGLE_RECORD_LENGTH)
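+
+/*
+ * The 60-byte base presumably accounts for the IPv4 and UDP headers
+ * plus the 20-byte NFv9 export header, with some headroom (an
+ * assumption; the constant is not broken out in the sources)
+ */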
+
+/*
+ * Cap the maximum length of the netflow data at 1400 bytes
+ */
+#define CNAT_NFV9_MAX_PKT_LENGTH 1400
+
+/*
+ * Data structures and defines to store NFV9 specific info
+ */
+#define CNAT_NFV9_INVALID_LOGGING_INDEX 0xffffffff
+
+/*
+ * Padding value between ADD and DELETE records. This can be at most 3 bytes
+ */
+#define NFV9_PAD_VALUE (3)
+
+typedef struct {
+ /* NFV9 server specific info.
+ * For now, it maintains only the packet sequence count;
+ * later it will maintain the server address, port, etc.
+ * Though it currently has the server address and port, they are
+ * only for cross reference
+ */
+ u32 ipv4_address; /* Destination IP address of the collector */
+ u16 port; /* Destination port number of the collector */
+ u16 refresh_rate; /* Refresh rate in packets after which template is sent */
+ u16 timeout_rate; /* Timeout rate in seconds after which template is sent */
+ u16 ref_count; /* Num of instances using this data */
+ u32 sequence_num; /* Sequence number of the logging packet */
+ /*
+ * Keep track of the time and packets since last template send
+ */
+ u32 last_template_sent_time;
+ u32 pkts_since_last_template;
+ u8 template_sent; /* used while sending vrfid-name mapping */
+
+} nfv9_server_info_t;
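+
+/*
+ * Hedged sketch (not part of the original sources): refresh_rate and
+ * timeout_rate above are presumably consumed by a check of this shape
+ * before each export, to decide when the template records must be
+ * resent. The helper name nfv9_template_resend_due is illustrative.
+ */
+static inline int nfv9_template_resend_due (nfv9_server_info_t *server,
+ u32 current_time)
+{
+ /* Resend after refresh_rate packets or timeout_rate seconds,
+ * whichever threshold is crossed first */
+ return ((server->pkts_since_last_template >= server->refresh_rate) ||
+ ((current_time - server->last_template_sent_time) >=
+ server->timeout_rate));
+}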
+
+/*
+ * This structure stores the Netflow logging information on a per-NFv9
+ * collector basis. It is allocated from a pool, and the index
+ * to this structure is stored in the VRF MAP structures
+ */
+typedef struct {
+ /*
+ * nat64_id will be 0 for a nat44 config, and i_vrf_id/i_vrf will be 0
+ * for a nat64 config. nat64_id is used when searching for a nat64
+ * collector, and i_vrf* when searching for a nat44 collector
+ */
+ /* Similarly for ds_lite, ds_lite_id is used and nat64_id,
+ * i_vrf_id shall be set to 0
+ */
+ u32 i_vrf_id; /* Inside VRF ID corresponding to this collector */
+ u16 i_vrf; /* Inside VRF (uidb_index) corresponding to this collector */
+ u16 nat64_id; /* NAT64 instance corresponding to this collector */
+ u16 ds_lite_id; /* DS Lite instance for this collector */
+
+ /*
+ * This field determines the maximum size of the Netflow V9 information
+ * that can be stored in a logging packet
+ */
+ u16 max_length_minus_max_record_size;
+
+ /*
+ * Indicates if the entry is already deleted
+ */
+ u16 deleted;
+
+ u16 pkt_length; /* Length of the current NFv9 information */
+ u16 record_length[MAX_RECORDS]; /* Length of each record type */
+ u16 total_record_count; /* Total number of records including templates */
+
+ u8 logging_policy;
+
+ /*
+ * Keep track of the time and packets since last template send
+ */
+ u32 last_template_sent_time;
+ u32 pkts_since_last_template;
+
+ /* Server info */
+ u32 server_index;
+
+ /*
+ * current logging context
+ */
+ vlib_buffer_t *current_logging_context;
+
+ /*
+ * Timestamp in UNIX seconds corresponding to when the current
+ * logging packet was created
+ */
+ u32 current_logging_context_timestamp;
+
+ /*
+ * Queued logging context waiting to be sent to the l3 infra node
+ */
+ vlib_buffer_t *queued_logging_context;
+
+ /*
+ * Headers corresponding to various records in this
+ * current nfv9 logging context
+ */
+ nfv9_header_t *nfv9_header;
+ cnat_nfv9_template_t *nfv9_template_header;
+ nfv9_dataflow_record_header_t *dataflow_header;
+ u8 *record[MAX_RECORDS];
+ u8 *next_data_ptr;
+ u8 last_record;
+ u32 nfv9_logging_next_index;
+ u32 ip4_input_node_index;
+ vlib_frame_t *f;
+ u32 *to_next;
+} cnat_nfv9_logging_info_t;
+
+
+/*
+ * Global structure for CGN APP configuration
+ */
+typedef struct {
+ /*
+ * Global NFv9 Logging Collector Index
+ */
+ u32 cnat_nfv9_global_collector_index;
+
+ /*
+ * Node index corresponding to the infra L3 output node
+ * to which the nfv9 logging node will send the packet
+ */
+ u16 cnat_nfv9_disp_node_index;
+
+ /*
+ * Whether we have initialized the NFv9 information
+ */
+ u8 cnat_nfv9_init_done;
+} cnat_nfv9_global_info_t;
+
+typedef enum {
+ cnat_nfv9_template_add_default,
+ cnat_nfv9_template_add_always
+} cnat_nfv9_template_add_flag_t;
+
+extern cnat_nfv9_template_t cnat_nfv9_template_info;
+
+extern cnat_nfv9_logging_info_t cnat_default_nfv9_logging_info;
+extern cnat_nfv9_logging_info_t *cnat_nfv9_logging_info_pool;
+
+extern cnat_nfv9_global_info_t cnat_nfv9_global_info;
+extern nfv9_server_info_t *nfv9_server_info_pool;
+
+/* #define DEBUG_NF_SERVER_CONFIG 1 */
+static inline void nfv9_delete_server_info(cnat_nfv9_logging_info_t *nfv9_info)
+{
+ nfv9_server_info_t *server;
+
+ /* Validate the index before doing any pointer arithmetic on it */
+ if(nfv9_info->server_index == EMPTY) {
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ PLATFORM_DEBUG_PRINT("Deleting empty server info\n");
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ return;
+ }
+ server = nfv9_server_info_pool + nfv9_info->server_index;
+
+ /* Check whether this server is still used by anyone; if not, delete it */
+ /* The caller no longer needs it, so decrement the ref count first */
+ server->ref_count--;
+ if(!(server->ref_count)) {
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ PLATFORM_DEBUG_PRINT("Deleting nfv9 server %x, %d at %d\n",
+ server->ipv4_address,
+ server->port,
+ nfv9_info->server_index);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ pool_put(nfv9_server_info_pool, server);
+ nfv9_info->server_index = EMPTY;
+ }
+#ifdef DEBUG_NF_SERVER_CONFIG
+ else {
+ if(my_instance_number == 1) {
+ PLATFORM_DEBUG_PRINT("Not Deleting nfv9 server %x, %d rc %d\n",
+ server->ipv4_address,
+ server->port,
+ server->ref_count);
+ }
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ return;
+}
+
+void handle_pending_nfv9_pkts(void);
+#endif /* __CNAT_LOGGING_H__ */
diff --git a/vnet/vnet/vcgn/cnat_pcp_server.h b/vnet/vnet/vcgn/cnat_pcp_server.h
new file mode 100644
index 00000000000..c77c6a875f8
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_pcp_server.h
@@ -0,0 +1,398 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_pcp_server.h
+ *
+ * Copyright (c) 2009-2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_PCP_SERVER_H__
+#define __CNAT_PCP_SERVER_H__
+
+#include "dslite_defs.h"
+
+/* Debug utils of PCP */
+/* Wrapped in do { } while(0) so the macro behaves as a single
+ * statement (safe inside if/else without braces) */
+#define PCP_DBG(debug, ...) \
+ do { \
+ if(PREDICT_FALSE(cnat_pcp_debug_flag >= debug)) { \
+ printf("%s:%s:%d - ", \
+ __FILE__, __FUNCTION__, __LINE__);\
+ printf(__VA_ARGS__);\
+ printf("\n"); \
+ } \
+ } while(0)
+
+#define PCP_DUMP_PDATA \
+ if(PREDICT_FALSE(cnat_pcp_debug_flag >= 100)) { \
+ printf("%s:%s:%d - \n", \
+ __FILE__, __FUNCTION__, __LINE__);\
+ printf("src - ip = %X, proto = %d, port = %d i_vrf = %d, o_vrf = %d\n", \
+ pcp_data.src_ip[3], pcp_data.proto, pcp_data.src_port, pcp_data.i_vrf, pcp_data.o_vrf); \
+ printf(" third party ip = %X\n", pcp_data.third_party_ip[3]); \
+ printf("map - ip = %X, port = %d \n", \
+ pcp_data.ext_ip[3], pcp_data.ext_port);\
+ printf("remote - ip = %X, port = %d \n", \
+ pcp_data.peer_ip[3], pcp_data.peer_port); \
+ printf("req life time = %d \n", pcp_data.req_lifetime); \
+ printf("drop = %d \n", pcp_data.drop);\
+ printf("udp_len = %d \n", pcp_data.udp_len); \
+ printf("pm = %p \n", pcp_data.pm); \
+ printf("cnat_proto = %X \n", pcp_data.cnat_proto); \
+ printf("inst_id = %X \n", pcp_data.inst_id); \
+ printf("======================================================\n"); \
+ }
+
+#define PCP_DUMP_PACKET(ip, len) pcp_hex_dump(ip, len)
+
+#ifdef TOBE_PORTED
+#define PCP_INCR(counter) pcp_counters.pcp_##counter++ ;
+#else
+#define PCP_INCR(counter)
+#endif
+
+typedef struct pcp_debug_counters {
+ u64 pcp_input;
+ u64 pcp_output;
+ u64 pcp_service_nat44;
+ u64 pcp_service_dslite;
+ /* below all are drops */
+ u64 pcp_drops;
+ u64 pcp_i2o_key_inuse;
+ u64 pcp_throttle_drops;
+ u64 pcp_udp_len;
+ u64 pcp_nrequest;
+ u64 pcp_min_udp_len;
+ u64 pcp_max_udp_len;
+ u64 pcp_mod4_len;
+ u64 pcp_invalid_3rd_len;
+ u64 pcp_invalid_option;
+ u64 pcp_version;
+ u64 pcp_invalid_opcode;
+ u64 pcp_invalid_client_ip;
+ u64 pcp_invalid_proto;
+ u64 pcp_invalid_port;
+ u64 pcp_invalid_vrfmap;
+ u64 pcp_invalid_ext_addr;
+ u64 pcp_out_addr_inuse;
+ u64 pcp_exact_match;
+ u64 pcp_exact_entry_created;
+ u64 pcp_exact_db_alloc_failed;
+ u64 pcp_udb_mismatch;
+ u64 pcp_noexact_db_allocated;
+ u64 pcp_static_entry_present;
+ u64 pcp_entry_deleted;
+ u64 pcp_3rd_party_option;
+
+ /* map counters */
+ u64 pcp_map_input;
+ u64 pcp_map_min_len;
+ u64 pcp_map_max_len;
+ u64 pcp_map_invalid_option;
+ u64 pcp_map_invalid_option_len;
+ u64 pcp_map_pref_fail_option;
+ u64 pcp_map_invalid_delete_req;
+ u64 pcp_map_delete_req;
+ u64 pcp_map_create_req;
+ u64 pcp_map_refresh;
+
+ /* peer counters */
+ u64 pcp_peer_input;
+ u64 pcp_peer_invalid_len;
+ u64 pcp_peer_delete_req;
+ u64 pcp_peer_create_req;
+ u64 pcp_peer_addr_mistmatch;
+ u64 pcp_peer_refresh;
+
+} pcp_debug_counters_t;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad[5];
+
+ /* Better to have a grouped structure rather than individual
+ variables; any change in the counters will automatically
+ be reflected here */
+ pcp_debug_counters_t counters;
+} pcp_show_counters_resp_t ;
+
+
+
+/* PCP opcodes */
+typedef enum pcp_opcode {
+ PCP_OPCODE_MAP = 1,
+ PCP_OPCODE_PEER = 2
+}pcp_opcode_t;
+
+
+/* PCP options */
+typedef enum pcp_options {
+ PCP_OPTION_3RD_PARTY = 1,
+ PCP_OPTION_PREF_FAIL = 2,
+ PCP_OPTION_FILTER = 3
+} pcp_options_t;
+
+/* PCP Result codes */
+typedef enum pcp_result_codes {
+ PCP_SUCCESS = 0,
+ PCP_ERR_UNSUPP_VERSION = 1,
+ PCP_ERR_NOT_AUTHORIZED = 2,
+ PCP_ERR_MALFORMED_REQUEST = 3,
+ PCP_ERR_UNSUPP_OPCODE = 4,
+ PCP_ERR_UNSUPP_OPTION = 5,
+ PCP_ERR_MALFORMED_OPTION = 6,
+ PCP_ERR_NETWORK_FAILURE = 7,
+ PCP_ERR_NO_RESOURCES = 8,
+ PCP_ERR_UNSUPP_PROTOCOL = 9,
+ PCP_ERR_USER_EX_QUOTA = 10,
+ PCP_ERR_CANNOT_PROVIDE_EXTERNAL = 11,
+ PCP_ERR_ADDRESS_MISMATCH = 12,
+ PCP_ERR_EXCESSIVE_REMOTE_PEERS = 13
+} pcp_result_codes_t;
+
+#define PCP_DISABLED 0
+#define PCP_ENABLED 1
+
+#define PCP_DROP 1
+
+#define PCP_STATIC_LIFETIME 0xFFFFFFFF
+#define PCP_MAX_LIFETIME 0x00015180 /* 24 hours = 86400 seconds*/
+
+#define PCP_VERSION_SUPPORTED 1
+
+#define PCP_NO_PREF_FAIL_OPTION 0
+#define PCP_PREF_FAIL_OPTION 1
+
+#define CNAT_DEF_PCP_PORT 5351
+
+#define PCP_REQ_RESP_BIT 0x80
+#define PCP_RESPONSE(r_opcode) (r_opcode & PCP_REQ_RESP_BIT)
+#define PCP_REQUEST(r_opcode) !(PCP_RESPONSE(r_opcode))
+
+#define PCP_REQ_OPCODE(r_opcode) (r_opcode & 0x7F)
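+
+/*
+ * Example: a MAP request carries r_opcode 0x01 and the matching
+ * response 0x81; PCP_REQ_OPCODE() strips the R bit so both map back
+ * to PCP_OPCODE_MAP
+ */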
+
+/* 24 bytes */
+#define PCP_COMMON_HDR_LEN sizeof(pcp_request_t)
+
+/* 8 bytes */
+#define UDP_HDR_LEN sizeof(udp_hdr_type_t)
+
+#define PCP_PREF_FAIL_OPTION_SIZE \
+ sizeof(pcp_prefer_fail_option_t)
+
+#define PCP_3RD_PARTY_OPTION_SIZE \
+ sizeof(pcp_3rd_party_option_t)
+
+#define PCP_MIN_LEN PCP_COMMON_HDR_LEN
+
+/* 24+8=32 bytes */
+#define PCP_MIN_UDP_LEN (PCP_MIN_LEN + UDP_HDR_LEN)
+
+#define PCP_MAX_LEN 1024
+
+/* 1024+8 = 1032 bytes */
+#define PCP_MAX_UDP_LEN (PCP_MAX_LEN + UDP_HDR_LEN)
+
+/* 24+ 24 = 48 bytes */
+#define PCP_MAP_OPCODE_MIN_LEN (PCP_COMMON_HDR_LEN + \
+ sizeof( pcp_map_option_specific_data_t))
+
+/* 24 + 44 = 68 bytes */
+#define PCP_PEER_OPCODE_MIN_LEN (PCP_COMMON_HDR_LEN + \
+ sizeof( pcp_peer_option_specific_data_t))
+
+/* 48 + 8 = 56 bytes */
+#define PCP_MAP_OPCODE_MIN_UDP_LEN (PCP_MAP_OPCODE_MIN_LEN + \
+ UDP_HDR_LEN )
+
+#define PCP_GET_MAP_OPTION_OFFSET(req) \
+ ((u8*)req + PCP_MAP_OPCODE_MIN_LEN)
+
+#define PCP_GET_PEER_OPTION_OFFSET(req) \
+ ((u8*)req + PCP_PEER_OPCODE_MIN_LEN)
+
+
+#define PCP_REQ_TOTAL_LEN(udp) (udp->udp_length - \
+ UDP_HDR_LEN)
+/* 56 + 4 = 60 bytes */
+#define PCP_MAP_OPCODE_PREF_FAIL_OPTION_LEN \
+ (PCP_MAP_OPCODE_MIN_UDP_LEN + \
+ sizeof(pcp_prefer_fail_option_t))
+
+
+/* 68 + 8 = 76 bytes */
+#define PCP_PEER_OPCODE_MIN_UDP_LEN (PCP_PEER_OPCODE_MIN_LEN + \
+ UDP_HDR_LEN)
+
+#define PCP_MUST_OPTION(option_code) (option_code & 0x80)
+
+
+
+/* 56 + 20 = 76*/
+#define PCP_DSLITE_MAP_OPCODE_MIN_UDP_LEN \
+ ( PCP_MAP_OPCODE_MIN_UDP_LEN + \
+ PCP_3RD_PARTY_OPTION_SIZE)
+
+/* 60 + 20 = 80 */
+#define PCP_DSLITE_MAP_OPCODE_MAX_UDP_LEN \
+ ( PCP_MAP_OPCODE_PREF_FAIL_OPTION_LEN + \
+ PCP_3RD_PARTY_OPTION_SIZE)
+
+/* 76 + 20 = 96 */
+#define PCP_DSLITE_PEER_OPCODE_MIN_UDP_LEN \
+ ( PCP_PEER_OPCODE_MIN_UDP_LEN + \
+ PCP_3RD_PARTY_OPTION_SIZE)
+
+
+#define PCP_SET_CNAT_PROTO(proto) \
+ pcp_data.cnat_proto = (proto == TCP_PROT) ? CNAT_TCP: \
+ (proto == UDP_PROT)? CNAT_UDP : CNAT_ICMP;
+
+#define PCP_SET_REQ_LIFETIME() \
+ if(pcp_data.db->flags & CNAT_DB_FLAG_STATIC_PORT) { \
+ pcp_data.db->proto_data.seq_pcp.pcp_lifetime = \
+ PCP_STATIC_LIFETIME; \
+ pcp_data.req_lifetime = PCP_STATIC_LIFETIME; \
+ } else { \
+ pcp_data.db->proto_data.seq_pcp.pcp_lifetime = \
+ pcp_data.req_lifetime + cnat_current_time ; \
+ }
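+
+/*
+ * Note: PCP_SET_REQ_LIFETIME expands to a bare if/else statement, so
+ * invoke it only where a full statement is legal, e.g.
+ *
+ *     if (lifetime_needs_update) {
+ *         PCP_SET_REQ_LIFETIME();
+ *     }
+ *
+ * Using it as the unbraced body of an if could misbind a trailing
+ * else.  (lifetime_needs_update is a hypothetical condition, for
+ * illustration only.)
+ */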
+
+
+/* No more than PCP_THROTTLE_LIMIT delete requests
+ * will be handled per second.  This excludes deletes of
+ * specific entries, where only one entry needs to be
+ * deleted.
+ */
+#define PCP_THROTTLE_LIMIT 2
+
+typedef struct pcp_request {
+ u8 ver;
+ u8 r_opcode;
+ u16 reserved;
+ u32 req_lifetime;
+ u32 ip[4]; /* ipv4 will be represented
+ by the ipv4 mapped ipv6 */
+} pcp_request_t;
+
+typedef struct pcp_response {
+ u8 ver;
+ u8 r_opcode;
+ u8 reserved;
+ u8 result_code;
+ u32 lifetime;
+ u32 epochtime;
+ u32 reserved1[3];
+} pcp_response_t;
+
+
+typedef struct pcp_options_hdr {
+ u8 code;
+ u8 reserved;
+ u16 len;
+ u8 data[0];
+} pcp_options_hdr_t;
+
+
+/* same for both request and response */
+typedef struct pcp_map_option_specific_data {
+ u8 protocol;
+ u8 reserved[3];
+ u16 int_port;
+ u16 ext_port;
+    u32 ext_ip[4]; /* ipv4 will be represented
+ by the ipv4 mapped ipv6 */
+} pcp_map_option_specific_data_t;
+
+/* same for both request and response */
+typedef struct pcp_peer_option_specific_data {
+ u8 protocol;
+ u8 reserved[3];
+ u16 int_port;
+ u16 ext_port;
+ u32 ext_ip[4]; /* ipv4 will be represented
+ by the ipv4 mapped ipv6 */
+ u16 peer_port;
+ u16 reserved1;
+ u32 peer_ip[4];
+} pcp_peer_option_specific_data_t;
+
+typedef struct pcp_prefer_fail_option {
+ u8 option;
+ u8 reserved;
+ u16 len;
+} pcp_prefer_fail_option_t;
+
+
+typedef struct pcp_3rd_party_option{
+ u8 option;
+ u8 reserved;
+ u16 len;
+ u32 ip[4];
+} pcp_3rd_party_option_t;
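+
+/*
+ * Illustrative sketches (hypothetical helpers, not used elsewhere):
+ * locating the first option header that follows the opcode-specific
+ * data, using the offset macros defined earlier.
+ */
+static inline pcp_options_hdr_t *
+pcp_first_map_option (pcp_request_t *req)
+{
+    return (pcp_options_hdr_t *) PCP_GET_MAP_OPTION_OFFSET(req);
+}
+
+static inline pcp_options_hdr_t *
+pcp_first_peer_option (pcp_request_t *req)
+{
+    return (pcp_options_hdr_t *) PCP_GET_PEER_OPTION_OFFSET(req);
+}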
+
+/* structure used as pipeline data */
+
+typedef struct pcp_pipeline_data {
+
+ union {
+
+ u8 *p;
+ ipv4_header *ip ;
+ ipv6_header_t *ipv6 ;
+
+ } l3addr;
+
+ udp_hdr_type_t *udp;
+ pcp_request_t *req;
+ pcp_response_t *resp;
+ pcp_opcode_t opcode;
+ u32 src_ip[4];
+ u16 src_port;
+ u8 proto;
+ u16 i_vrf;
+ u16 o_vrf;
+ u32 ext_ip[4];
+ u16 ext_port;
+ u32 third_party_ip[4];
+
+ /* valid for peer opcode */
+ u32 peer_ip[4];
+ u32 peer_port;
+ u32 req_lifetime;
+ u32 udp_len;
+ pcp_options_t pref_fail;
+ pcp_options_t third_party;
+ u8 *option_spec;
+ pcp_result_codes_t ret_code;
+ cnat_portmap_v2_t *pm;
+ cnat_main_db_entry_t *db;
+ cnat_vrfmap_t *vrfmap;
+ dslite_table_entry_t *inst_ptr;
+ u16 inst_id;
+ u32 flags;
+ u16 cnat_proto;
+
+  /* does the packet need to be dropped? */
+ u8 drop;
+ /* nat44, dslite, nat64 */
+#define PCP_SERVICE_NAT44 1
+#define PCP_SERVICE_DSLITE 2
+#define PCP_SERVICE_NAT64 3
+ u8 service_type;
+
+#define PCP_REQ_ENTRY_PRESENT 1
+#define PCP_REQ_EXT_MAP_PRESENT 1
+ u8 state;
+} pcp_pipeline_data_t;
+
+#endif /* __CNAT_PCP_sERVER_H__ */
diff --git a/vnet/vnet/vcgn/cnat_ports.c b/vnet/vnet/vcgn/cnat_ports.c
new file mode 100644
index 00000000000..4437865aaee
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ports.c
@@ -0,0 +1,1113 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_ports.c - port allocator
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+#include <vppinfra/bitmap.h>
+
+#include "cnat_db.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_logging.h"
+#include "spp_timers.h"
+#include "platform_common.h"
+#include "cgn_bitmap.h"
+#include "spp_platform_trace_log.h"
+#include "cnat_ports.h"
+
+#if 1 /* TOBE_PORTED */
+/* Following is defined elsewhere. */
+#define msg_spp_err(s) \
+do { \
+    fprintf(stderr, "%s", (i8 *)s); \
+    fputs("\n", stderr); \
+} while(0)
+#endif
+
+
+#define PM_90_PERCENT_USE 58980
+/*
+ * instance number provisioned from HW
+ */
+u8 my_instance_number = 0;
+
+typedef struct {
+ u32 cached_next_index;
+ /* $$$$ add data here */
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} cnat_ports_main_t;
+
+cnat_ports_main_t cnat_ports_main;
+
+static u32 rseed_port; /* random number generator seed */
+
+void
+cnat_db_dump_portmap_for_vrf (u32 vrfmap_index)
+{
+ u32 i, pm_len;
+ cnat_vrfmap_t *my_vrfmap = cnat_map_by_vrf + vrfmap_index;
+ cnat_portmap_v2_t *pm, *my_pm __attribute__((unused));
+
+ pm = my_vrfmap->portmap_list;
+ pm_len = vec_len(pm);
+
+ for (i = 0; i < pm_len; i++) {
+ my_pm = pm + i;
+
+ PLATFORM_DEBUG_PRINT("pm %d: IPv4 Addr 0x%x - in use %d private_ip_users_count %d\n",
+ i, my_pm->ipv4_address, my_pm->inuse,
+ my_pm->private_ip_users_count);
+
+ PLATFORM_DEBUG_PRINT("pm %d: IPv4 Addr 0x%x - in use %d "
+ "private_ip_users_count %d\n",
+ i, my_pm->ipv4_address, my_pm->inuse,
+ my_pm->private_ip_users_count);
+ }
+}
+
+void
+cnat_db_dump_portmaps ()
+{
+ u32 i, vrfmap_index;
+
+ for (i = 0; i < CNAT_MAX_VRFMAP_ENTRIES; i++) {
+ vrfmap_index = vrf_map_array[i];
+
+ if (vrfmap_index == VRF_MAP_ENTRY_EMPTY) {
+ continue;
+ }
+
+ PLATFORM_DEBUG_PRINT("\n\nDumping the port map for uidb_index %d\n", i);
+ cnat_db_dump_portmap_for_vrf(vrfmap_index);
+ }
+}
+
+#ifndef NO_BULK_LOGGING
+static int check_if_stat_alloc_ok_for_bulk(cnat_portmap_v2_t *pm,
+ u16 i_port, bulk_alloc_size_t bulk_size,
+ u16 static_port_range)
+{
+ uword bit_test_result;
+ if(BULK_ALLOC_SIZE_NONE == bulk_size) return 1; /* No issues */
+
+ if(i_port < static_port_range) return 1; /* we don't want bulk */
+
+ i_port = (i_port/bulk_size) * bulk_size;
+ bit_test_result = cgn_clib_bitmap_check_if_all(pm->bm, i_port, bulk_size);
+ return(bit_test_result);
+}
+#else /* dummy */
+inline static int check_if_stat_alloc_ok_for_bulk(cnat_portmap_v2_t *pm,
+ u16 i_port, bulk_alloc_size_t bulk_size,
+ u16 static_port_range)
+{
+ return 1;
+}
+#endif /* NO_BULK_LOGGING */
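+
+/*
+ * Worked example for the bulk check above (illustrative): with
+ * bulk_size = 64 and i_port = 2000, i_port is first rounded down to
+ * the bulk boundary (2000/64)*64 = 1984, and the static allocation
+ * is allowed only if all 64 bits 1984..2047 are free in pm->bm.
+ */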
+/*
+ * cnat_static_port_alloc_v2
+ * public ipv4 address/port allocator for Static Port commands;
+ * tries to allocate the same outside port as the inside port
+ */
+cnat_errno_t
+cnat_static_port_alloc_v2 (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 i_ipv4_address,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif
+ , u16 ip_n_to_1
+ )
+{
+ u32 i, hash_value, my_index, found, max_attempts;
+ u16 start_bit, new_port;
+ cnat_portmap_v2_t *my_pm = 0;
+ u32 pm_len = vec_len(pm);
+ uword bit_test_result;
+
+#ifndef NO_BULK_LOGGING
+ *nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+#endif
+
+ if (PREDICT_FALSE(pm_len == 0)) {
+ return (CNAT_NO_POOL_ANY);
+ }
+
+ switch (atype) {
+
+ case PORT_ALLOC_ANY:
+
+ found = 0;
+
+ /*
+ * Try to hash the IPv4 address to get an index value to select the pm
+ */
+ hash_value = (i_ipv4_address & 0xffff) ^
+                     ((i_ipv4_address >> 16) & 0xffff);
+
+ /*
+ * If pm_len <= 256, compact the hash to 8 bits
+ */
+ if (PREDICT_TRUE(pm_len <= 256)) {
+            hash_value = (hash_value & 0xff) ^ ((hash_value >> 8) & 0xff);
+ }
+
+ /*
+ * Ensure that the hash value is in the range 0 .. (pm_len-1)
+ */
+ my_index = hash_value % pm_len;
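+
+        /*
+         * Worked example (illustrative): for i_ipv4_address
+         * 0x0A000001 (10.0.0.1), hash_value = 0x0001 ^ 0x0A00 =
+         * 0x0A01; folded to 8 bits that is 0x01 ^ 0x0A = 0x0B,
+         * so my_index = 0x0B % pm_len.
+         */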
+
+ for (i = 0; i < PORT_PROBE_LIMIT; i++) {
+ my_pm = pm + my_index;
+ if(PREDICT_TRUE(ip_n_to_1)) {
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < ip_n_to_1)) {
+ /*
+                 * Try to find a PM with at least 33% free and i_port free
+ */
+ if (PREDICT_TRUE((my_pm->inuse < ((BITS_PER_INST*2)/3)) &&
+ clib_bitmap_get_no_check(my_pm->bm,
+ i_port) == 1)
+#ifndef NO_BULK_LOGGING
+ && check_if_stat_alloc_ok_for_bulk(my_pm, i_port,
+ bulk_size,
+ static_port_range)
+#endif
+ ) {
+ found = 1;
+ break;
+ }
+ }
+
+ } else {
+ /*
+                 * Try to find a PM with at least 33% free and i_port free
+ */
+ if (PREDICT_TRUE((my_pm->inuse < ((BITS_PER_INST*2)/3)) &&
+ clib_bitmap_get_no_check(my_pm->bm,
+ i_port) == 1)
+#ifndef NO_BULK_LOGGING
+ && check_if_stat_alloc_ok_for_bulk(my_pm, i_port,
+ bulk_size,
+ static_port_range)
+#endif
+ ) {
+ found = 1;
+ break;
+ }
+ }
+ my_index = (my_index + 1) % pm_len;
+ }
+
+ /*
+         * If not found, do it the "hard" way:
+         * scan every portmap for the best fit.
+ */
+ if (!found) {
+ u32 min_inuse_any, min_inuse_myport;
+ u32 min_index_any, min_index_myport;
+
+ min_inuse_any = min_inuse_myport = PORTS_PER_ADDR + 1;
+ min_index_any = min_index_myport = ~0;
+ for (i = 0; i < pm_len; i++) {
+ my_pm = pm + i;
+ if(PREDICT_TRUE(ip_n_to_1)) {
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < ip_n_to_1)) {
+ if (PREDICT_FALSE(my_pm->inuse < min_inuse_any)) {
+ min_inuse_any = my_pm->inuse;
+ min_index_any = my_pm - pm;
+ }
+ if (PREDICT_FALSE(my_pm->inuse < min_inuse_myport)) {
+ if (PREDICT_TRUE(clib_bitmap_get_no_check(
+ my_pm->bm,i_port) == 1)
+#ifndef NO_BULK_LOGGING
+ && check_if_stat_alloc_ok_for_bulk(my_pm,
+ i_port,bulk_size,static_port_range)
+#endif
+ ) {
+ min_inuse_myport = my_pm->inuse;
+ min_index_myport = my_pm - pm;
+ }
+ }
+
+ }
+
+ } else {
+ if (PREDICT_FALSE(my_pm->inuse < min_inuse_any)) {
+ min_inuse_any = my_pm->inuse;
+ min_index_any = my_pm - pm;
+ }
+ if (PREDICT_FALSE(my_pm->inuse < min_inuse_myport)) {
+ if (PREDICT_TRUE(clib_bitmap_get_no_check(
+ my_pm->bm, i_port) == 1)
+#ifndef NO_BULK_LOGGING
+ && check_if_stat_alloc_ok_for_bulk(my_pm, i_port,
+ bulk_size, static_port_range)
+#endif
+ ) {
+ min_inuse_myport = my_pm->inuse;
+ min_index_myport = my_pm - pm;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we have an exactly matching PM that has
+ * myport free. If so use it. If no such PM is
+ * available, use any PM
+ */
+ if (PREDICT_TRUE(min_inuse_myport < PORTS_PER_ADDR)) {
+ my_pm = pm + min_index_myport;
+ my_index = min_index_myport;
+ found = 1;
+ } else if (PREDICT_TRUE(min_inuse_any < PORTS_PER_ADDR)) {
+ my_pm = pm + min_index_any;
+ my_index = min_index_any;
+ found = 1;
+ }
+ }
+
+ if (!found) {
+ return (CNAT_NO_PORT_ANY);
+ }
+ break;
+
+ case PORT_ALLOC_DIRECTED:
+ my_index = *index;
+        if (PREDICT_FALSE(my_index >= pm_len)) {
+ return (CNAT_INV_PORT_DIRECT);
+ }
+ my_pm = pm + my_index;
+ break;
+
+ default:
+ return (CNAT_ERR_PARSER);
+ }
+
+ /* Allocate a matching port if possible */
+ start_bit = i_port;
+ found = 0;
+ max_attempts = BITS_PER_INST;
+#ifndef NO_BULK_LOGGING
+ if((BULK_ALLOC_SIZE_NONE != bulk_size) &&
+ (i_port >= static_port_range)) {
+ start_bit = (start_bit/bulk_size) * bulk_size;
+ max_attempts = BITS_PER_INST/bulk_size;
+ }
+#endif /* NO_BULK_LOGGING */
+
+ for (i = 0; i < max_attempts; i++) {
+#ifndef NO_BULK_LOGGING
+ if((BULK_ALLOC_SIZE_NONE != bulk_size) &&
+ (i_port >= static_port_range)) {
+ bit_test_result = cgn_clib_bitmap_check_if_all(my_pm->bm,
+ start_bit, bulk_size);
+ }
+ else
+#endif /* #ifndef NO_BULK_LOGGING */
+ bit_test_result = clib_bitmap_get_no_check(my_pm->bm, start_bit);
+
+ if (PREDICT_TRUE(bit_test_result)) {
+#ifndef NO_BULK_LOGGING
+ if((BULK_ALLOC_SIZE_NONE != bulk_size) &&
+ (i_port >= static_port_range)) {
+ *nfv9_log_req = start_bit;
+ if(i==0) new_port = i_port; /* First go */
+ else {
+ new_port = bit2port(start_bit);
+ if (pair_type == PORT_S_ODD && (new_port & 0x1) == 0)
+ new_port++;
+ }
+ found = 1;
+ break;
+ }
+ else {
+#endif /* NO_BULK_LOGGING */
+ new_port = bit2port(start_bit);
+ if (pair_type == PORT_S_ODD) {
+ if ((new_port & 0x1) == 1) {
+ found = 1;
+ break;
+ }
+ } else if (pair_type == PORT_S_EVEN) {
+ if ((new_port & 0x1) == 0) {
+ found = 1;
+ break;
+ }
+ } else {
+ found = 1;
+ break;
+ }
+#ifndef NO_BULK_LOGGING
+ }
+#endif
+ }
+#ifndef NO_BULK_LOGGING
+ if((BULK_ALLOC_SIZE_NONE != bulk_size) &&
+ (i_port >= static_port_range))
+ start_bit = (start_bit + bulk_size) % BITS_PER_INST;
+ else {
+#endif /* NO_BULK_LOGGING */
+ start_bit = (start_bit + 1) % BITS_PER_INST;
+ if(PREDICT_FALSE(start_bit == 0)) {
+ start_bit = 1; /* Port 0 is invalid, so start from 1 */
+ }
+#ifndef NO_BULK_LOGGING
+ }
+#endif
+ } /* End of for loop */
+
+ if (!found) {
+ /* Port allocation failure */
+ if (atype == PORT_ALLOC_DIRECTED) {
+ return (CNAT_NOT_FOUND_DIRECT);
+ } else {
+ return (CNAT_NOT_FOUND_ANY);
+ }
+ }
+
+ /* Accounting */
+ cgn_clib_bitmap_clear_no_check(my_pm->bm, new_port);
+ (my_pm->inuse)++;
+
+ *index = my_pm - pm;
+ *o_ipv4_address = my_pm->ipv4_address;
+
+ *o_port = new_port;
+
+ return (CNAT_SUCCESS);
+}
+
+/*
+ * Try to allocate a portmap structure based on atype field
+ */
+cnat_portmap_v2_t *
+cnat_dynamic_addr_alloc_from_pm (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ u32 *index,
+ cnat_errno_t *err,
+ u16 ip_n_to_1,
+ u32 *rseed_ip)
+{
+ u32 i, pm_len;
+ int my_index;
+ int min_inuse, min_index;
+
+ cnat_portmap_v2_t *my_pm = 0;
+ *err = CNAT_NO_POOL_ANY;
+
+ pm_len = vec_len(pm);
+
+ switch(atype) {
+ case PORT_ALLOC_ANY:
+ if (PREDICT_FALSE(pm_len == 0)) {
+ my_pm = 0;
+ *err = CNAT_NO_POOL_ANY;
+ goto done;
+ }
+
+ /* "Easy" way, first address with at least 200 free ports */
+ for (i = 0; i < PORT_PROBE_LIMIT; i++) {
+ *rseed_ip = randq1(*rseed_ip);
+ my_index = (*rseed_ip) % pm_len;
+ my_pm = pm + my_index;
+ if (PREDICT_FALSE(ip_n_to_1)) {
+ if(PREDICT_TRUE(ip_n_to_1 == 1)) {
+ if (PREDICT_FALSE(0 == my_pm->inuse)) {
+ goto done;
+ }
+ } else {
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < ip_n_to_1)) {
+ if (PREDICT_FALSE(my_pm->inuse < ((BITS_PER_INST*2)/3))) {
+ goto done;
+ }
+ }
+ }
+ } else {
+ if (PREDICT_FALSE(my_pm->inuse < ((BITS_PER_INST*2)/3))) {
+ goto done;
+ }
+ }
+ }
+
+ /* "hard" way, best-fit. $$$$ Throttle complaint */
+ min_inuse = PORTS_PER_ADDR + 1;
+ min_index = ~0;
+ for (i = 0; i < pm_len; i++) {
+ my_pm = pm + i;
+ if (PREDICT_FALSE(ip_n_to_1)) {
+ if(PREDICT_TRUE(ip_n_to_1 == 1)) {
+ if (PREDICT_FALSE(!my_pm->inuse)) {
+ min_inuse = my_pm->inuse;
+ min_index = my_pm - pm;
+ }
+ } else {
+ if(PREDICT_TRUE(my_pm->private_ip_users_count < ip_n_to_1)) {
+ if (PREDICT_TRUE(my_pm->inuse < min_inuse)) {
+ min_inuse = my_pm->inuse;
+ min_index = my_pm - pm;
+ }
+
+ }
+ }
+
+ } else {
+ if (PREDICT_TRUE(my_pm->inuse < min_inuse)) {
+ min_inuse = my_pm->inuse;
+ min_index = my_pm - pm;
+ }
+ }
+ }
+
+ if (PREDICT_TRUE(min_inuse < PORTS_PER_ADDR)) {
+ my_pm = pm + min_index;
+ my_index = min_index;
+ goto done;
+ }
+
+ /* Completely out of ports */
+#ifdef DEBUG_PRINTF_ENABLED
+ PLATFORM_DEBUG_PRINT("%s out of ports\n", __FUNCTION__);
+#endif
+
+ my_pm = 0;
+ *err = CNAT_NO_PORT_ANY;
+ break;
+
+
+ case PORT_ALLOC_DIRECTED:
+ //ASSERT(*index < pm_len);
+        if (PREDICT_FALSE(*index >= pm_len)) {
+ my_pm = 0;
+ *err = CNAT_INV_PORT_DIRECT;
+ goto done;
+ }
+ my_pm = pm + *index;
+ my_index = *index;
+ break;
+
+ default:
+ msg_spp_err("bad allocation type in cnat_port_alloc");
+ my_pm = 0;
+ *err = CNAT_ERR_PARSER;
+ break;
+ }
+
+ done:
+ if (PREDICT_FALSE(my_pm == NULL)) {
+ return (my_pm);
+ }
+
+ if (PREDICT_FALSE(my_pm->inuse >= BITS_PER_INST)) {
+ my_pm = 0;
+ if (atype == PORT_ALLOC_DIRECTED) {
+ *err = CNAT_BAD_INUSE_DIRECT;
+ } else {
+ *err = CNAT_BAD_INUSE_ANY;
+ }
+ }
+
+ return (my_pm);
+}
+
+
+/*
+ * cnat_dynamic_port_alloc_v2
+ * public ipv4 address/port allocator for dynamic ports
+ *
+ * 200K users / 20M translations means vec_len(cnat_portmap) will be
+ * around 300.
+ *
+ */
+cnat_errno_t
+cnat_dynamic_port_alloc_v2 (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif
+ , u16 ip_n_to_1,
+ u32 *rseed_ip
+ )
+{
+ int i;
+ cnat_errno_t my_err = CNAT_NO_POOL_ANY;
+ cnat_portmap_v2_t *my_pm = 0;
+ u16 start_bit;
+ u16 new_port;
+ uword bit_test_result;
+ uword max_trys_to_find_port;
+
+ ASSERT(index);
+ ASSERT(o_ipv4_address);
+ ASSERT(o_port);
+
+ my_pm = cnat_dynamic_addr_alloc_from_pm(pm, atype, index, &my_err, ip_n_to_1,
+ rseed_ip);
+
+ if (PREDICT_FALSE(my_pm == NULL)) {
+ return (my_err);
+ }
+ if(PREDICT_FALSE(my_pm->dyn_full == 1)) {
+ if (atype == PORT_ALLOC_DIRECTED) {
+ return (CNAT_NOT_FOUND_DIRECT);
+ } else {
+ return (CNAT_NOT_FOUND_ANY);
+ }
+ }
+
+#if DEBUG > 1
+ PLATFORM_DEBUG_PRINT("ALLOC_PORT_V2: My_Instance_Number %d: IP addr 0x%x, Inuse %d\n",
+ my_instance_number, my_pm->ipv4_address, my_pm->inuse);
+#endif
+
+ rseed_port = randq1(rseed_port);
+
+ /*
+ * Exclude the static port range for allocating dynamic ports
+ */
+ start_bit = (rseed_port) % (BITS_PER_INST - static_port_range);
+ start_bit = start_bit + static_port_range;
+
+#ifndef NO_BULK_LOGGING
+ *nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+ if(BULK_ALLOC_SIZE_NONE != bulk_size)
+ {
+        /* We need the start port of the range to be aligned on an
+         * integer multiple of bulk_size */
+ max_trys_to_find_port = BITS_PER_INST/bulk_size;
+ start_bit= ((start_bit + bulk_size -1)/bulk_size) * bulk_size;
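+        /* e.g. bulk_size = 64, start_bit = 1000 rounds up to
+         * ((1000 + 63)/64)*64 = 1024 (illustrative) */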
+ }
+ else
+#endif /* #ifndef NO_BULK_LOGGING */
+ max_trys_to_find_port = BITS_PER_INST;
+
+ /* Allocate a random port / port-pair */
+ for (i = 0; i < max_trys_to_find_port; i++) {
+
+        /* start_bit is only a u16, so it can roll over and become zero */
+ if (PREDICT_FALSE((start_bit >= BITS_PER_INST) ||
+ (start_bit < static_port_range))) {
+ start_bit = static_port_range;
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ start_bit= ((start_bit + bulk_size -1)/bulk_size) * bulk_size;
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ }
+ /* Scan forward from random position */
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ bit_test_result = cgn_clib_bitmap_check_if_all(my_pm->bm,
+ start_bit, bulk_size);
+ }
+ else
+#endif /* #ifndef NO_BULK_LOGGING */
+ bit_test_result = clib_bitmap_get_no_check(my_pm->bm, start_bit);
+
+ if (PREDICT_TRUE(bit_test_result)) {
+ new_port = bit2port(start_bit);
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size)
+ *nfv9_log_req = new_port;
+#endif
+ if ((pair_type == PORT_S_ODD) &&
+ (!(new_port & 0x1))) {
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ start_bit++; /* Just use the next one in the bulk range */
+ new_port++;
+ goto found2;
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ goto notfound;
+ } else if ((pair_type == PORT_S_EVEN) &&
+ (new_port & 0x1)) {
+ goto notfound;
+ }
+
+ /* OK we got one or two suitable ports */
+ goto found2;
+ }
+
+ notfound:
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size)
+ start_bit += bulk_size;
+ else
+#endif /* #ifndef NO_BULK_LOGGING */
+ start_bit++;
+
+ } /* end of for loop */
+
+ /* Completely out of ports */
+
+ /* Port allocation failure */
+    /* Set the dyn_full flag.  It is checked for subsequent
+     * dynamic sessions before searching for a port.
+     */
+ if (atype == PORT_ALLOC_DIRECTED) {
+ my_pm->dyn_full = 1;
+ return (CNAT_NOT_FOUND_DIRECT);
+ } else {
+ my_pm->dyn_full = 1;
+ return (CNAT_NOT_FOUND_ANY);
+ }
+
+
+ found2:
+
+ /* Accounting */
+ cgn_clib_bitmap_clear_no_check (my_pm->bm, start_bit);
+ (my_pm->inuse)++;
+
+ *index = my_pm - pm;
+ *o_ipv4_address = my_pm->ipv4_address;
+
+ *o_port = new_port;
+ return (CNAT_SUCCESS);
+}
+
+#ifdef TOBE_PORTED
+/*
+ * cnat_alloc_port_from_pm
+ * Given a portmap structure find port/port_pair that are free
+ *
+ * The assumption in this function is that a bit in bm corresponds
+ * directly to a port number, so the bit index is returned as-is
+ * and callers convert it with bit2port where needed.
+ *
+ */
+static u32
+cnat_alloc_port_from_pm (
+ u32 start_port,
+ u32 end_port,
+ cnat_portmap_v2_t *my_pm,
+ port_pair_t pair_type
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif /* #ifndef NO_BULK_LOGGING */
+ )
+{
+ u32 i;
+ u32 start_bit;
+ u32 total_ports = end_port - start_port + 1;
+ uword bit_test_result;
+ uword max_trys_to_find_port;
+
+ rseed_port = randq1(rseed_port);
+
+ start_bit = rseed_port % total_ports;
+ start_bit = start_bit + start_port;
+#ifndef NO_BULK_LOGGING
+ *nfv9_log_req = BULK_ALLOC_NOT_ATTEMPTED;
+ if(BULK_ALLOC_SIZE_NONE != bulk_size)
+ {
+        /* We need the start port of the range to be aligned on an
+         * integer multiple of bulk_size */
+ max_trys_to_find_port = total_ports/bulk_size;
+ start_bit= ((start_bit + bulk_size -1)/bulk_size) * bulk_size;
+ }
+ else
+#endif /* #ifndef NO_BULK_LOGGING */
+ max_trys_to_find_port = total_ports;
+
+ /* Allocate a random port / port-pair */
+ for (i = 0; i < max_trys_to_find_port; i++) {
+        /* wrap start_bit back into [start_port, end_port] if it ran
+         * past the range */
+ if (PREDICT_FALSE((start_bit >= end_port) ||
+ (start_bit < start_port))) {
+ start_bit = start_port;
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ start_bit= ((start_bit + bulk_size -1)/bulk_size) * bulk_size;
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ }
+
+ /* Scan forward from random position */
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ bit_test_result = cgn_clib_bitmap_check_if_all(my_pm->bm,
+ start_bit, bulk_size);
+ }
+ else
+#endif /* #ifndef NO_BULK_LOGGING */
+ bit_test_result = clib_bitmap_get_no_check(my_pm->bm, start_bit);
+
+ if (PREDICT_TRUE(bit_test_result)) {
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ /* Got the entire bulk range */
+ *nfv9_log_req = bit2port(start_bit);
+ return start_bit;
+ } else {
+#endif /* #ifndef NO_BULK_LOGGING */
+ /*
+             * For PORT_PAIR the first port has to be even, the
+             * subsequent port must be <= end_port, and the
+             * subsequent port must still be unallocated
+ */
+ if ((start_bit & 0x1) ||
+ ((start_bit + 1) > end_port) ||
+ (clib_bitmap_get_no_check(my_pm->bm,
+ (start_bit + 1)) == 0)) {
+ goto notfound;
+ }
+ return (start_bit);
+#ifndef NO_BULK_LOGGING
+ }
+#endif /* #ifndef NO_BULK_LOGGING */
+ } /* if( free port found ) */
+
+notfound:
+#ifndef NO_BULK_LOGGING
+ if(BULK_ALLOC_SIZE_NONE != bulk_size) {
+ start_bit += bulk_size;
+ } else
+#endif /* #ifndef NO_BULK_LOGGING */
+ start_bit++;
+
+ }
+ return (BITS_PER_INST);
+}
+
+/*
+ * cnat_dynamic_port_alloc_rtsp
+ * public ipv4 address/port allocator for dynamic ports
+ *
+ * 200K users / 20M translations means vec_len(cnat_portmap) will be
+ * around 300.
+ *
+ */
+
+cnat_errno_t
+cnat_dynamic_port_alloc_rtsp (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u16 start_range,
+ u16 end_range,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif
+ , u32 *rseed_ip
+ )
+{
+
+ u32 current_timestamp;
+ cnat_errno_t my_err = CNAT_NO_POOL_ANY;
+ cnat_portmap_v2_t *my_pm = 0;
+ u32 alloc_bit;
+
+ ASSERT(index);
+ ASSERT(o_ipv4_address);
+ ASSERT(o_port);
+
+ my_pm = cnat_dynamic_addr_alloc_from_pm(pm, atype, index, &my_err, 0,rseed_ip);
+
+ if (PREDICT_FALSE(my_pm == NULL)) {
+ return (my_err);
+ }
+
+#if DEBUG > 1
+ PLATFORM_DEBUG_PRINT("ALLOC_PORT_V2: My_Instance_Number %d: IP addr 0x%x, Inuse %d\n",
+ my_instance_number, my_pm->ipv4_address, my_pm->inuse);
+#endif
+
+ alloc_bit =
+ cnat_alloc_port_from_pm(start_range, end_range, my_pm, pair_type
+#ifndef NO_BULK_LOGGING
+ , bulk_size, nfv9_log_req
+#endif /* #ifndef NO_BULK_LOGGING */
+ );
+
+ if (alloc_bit < BITS_PER_INST) {
+ if (pair_type == PORT_PAIR) {
+ /* Accounting */
+ cgn_clib_bitmap_clear_no_check (my_pm->bm, alloc_bit);
+ cgn_clib_bitmap_clear_no_check (my_pm->bm, alloc_bit+1);
+ (my_pm->inuse) += 2;
+ } else {
+ /* Accounting */
+ cgn_clib_bitmap_clear_no_check (my_pm->bm, alloc_bit);
+ (my_pm->inuse)++;
+ }
+
+ *index = my_pm - pm;
+ *o_ipv4_address = my_pm->ipv4_address;
+
+        *o_port = bit2port(alloc_bit);
+
+ return (CNAT_SUCCESS);
+ }
+
+ /* Completely out of ports */
+ current_timestamp = spp_trace_log_get_unix_time_in_seconds();
+ if (PREDICT_FALSE((current_timestamp - my_pm->last_sent_timestamp) >
+ 1000)) {
+ spp_printf(CNAT_NO_EXT_PORT_AVAILABLE, 0, NULL);
+ my_pm->last_sent_timestamp = current_timestamp;
+ }
+
+
+ /* Port allocation failure */
+ if (atype == PORT_ALLOC_DIRECTED) {
+ return (CNAT_NOT_FOUND_DIRECT);
+ } else {
+ return (CNAT_NOT_FOUND_ANY);
+ }
+}
+#else
+cnat_errno_t
+cnat_dynamic_port_alloc_rtsp (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u16 start_range,
+ u16 end_range,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif
+ , u32 *rseed_ip
+ )
+{
+ return (CNAT_NOT_FOUND_ANY);
+}
+#endif
+
+
+/*
+ * cnat_mapped_static_port_alloc_v2
+ * Allocate a specific, already-mapped outside address/port,
+ * marking it in use in the portmap.
+ */
+cnat_errno_t
+cnat_mapped_static_port_alloc_v2 (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ u32 *index,
+ u32 ipv4_address,
+ u16 port
+#ifndef NO_BULK_LOGGING
+ , int *nfv9_log_req,
+ bulk_alloc_size_t bulk_size
+#endif
+ , u16 ip_n_to_1
+ )
+{
+ int i;
+ u32 pm_len;
+ u16 bm_bit;
+ cnat_portmap_v2_t *my_pm = 0;
+ u32 my_index;
+
+ ASSERT(index);
+
+    /*
+     * Map the port to the corresponding bit in the pm bitmap
+     * structure.  Bit and port values are now mapped 1-to-1
+     * (see port2bit).
+     */
+ bm_bit = port2bit(port);
+
+ pm_len = vec_len(pm);
+
+ switch(atype) {
+ case PORT_ALLOC_ANY:
+ if (PREDICT_FALSE(pm_len == 0)) {
+ return (CNAT_NO_POOL_ANY);
+ }
+
+ /*
+ * Find the pm that is allocated for this translated IP address
+ */
+ my_index = pm_len;
+
+ for (i = 0; i < pm_len; i++) {
+ my_pm = pm + i;
+ if (PREDICT_FALSE(my_pm->ipv4_address == ipv4_address)) {
+ my_index = i;
+ break;
+ }
+ }
+
+ if ((PREDICT_FALSE(my_index >= pm_len)) ||
+ ((PREDICT_FALSE(ip_n_to_1)) && (PREDICT_TRUE(my_pm->private_ip_users_count >= ip_n_to_1)))) {
+ return (CNAT_NO_POOL_ANY);
+ }
+
+ break;
+
+ case PORT_ALLOC_DIRECTED:
+        if (PREDICT_FALSE(*index >= pm_len)) {
+ return (CNAT_INV_PORT_DIRECT);
+ }
+
+ my_index = *index;
+ my_pm = pm + my_index;
+ if (PREDICT_FALSE(my_pm->ipv4_address != ipv4_address)) {
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ PLATFORM_DEBUG_PRINT("Delete all main db entry for that particular in ipv4 address\n");
+ }
+ return (CNAT_INV_PORT_DIRECT);
+ }
+
+ break;
+
+ default:
+ msg_spp_err("bad allocation type in cnat_port_alloc");
+ return (CNAT_ERR_PARSER);
+ }
+
+
+ if (PREDICT_FALSE(my_pm == NULL)) {
+ return (CNAT_NO_POOL_ANY);
+ }
+
+ /*
+ * Check if the port is already allocated to some other mapping
+ */
+ if (PREDICT_FALSE(clib_bitmap_get_no_check (my_pm->bm, bm_bit) == 0)) {
+ return (CNAT_NO_POOL_ANY);
+ }
+
+#if DEBUG > 1
+ PLATFORM_DEBUG_PRINT("ALLOC_PORT_V2: My_Instance_Number %d: IP addr 0x%x, Inuse %d\n",
+ my_instance_number, my_pm->ipv4_address, my_pm->inuse);
+#endif
+
+ /*
+ * Indicate that the port is already allocated
+ */
+ cgn_clib_bitmap_clear_no_check (my_pm->bm, bm_bit);
+ (my_pm->inuse)++;
+
+ *index = my_index;
+
+ return (CNAT_SUCCESS);
+}
+
+void cnat_port_free_v2 (
+ cnat_portmap_v2_t *pm,
+ int index,
+ port_pair_t pair_type,
+ u16 base_port,
+ u16 static_port_range)
+{
+ cnat_portmap_v2_t *my_pm;
+ uword bit;
+
+ /* check for valid portmap */
+    if (PREDICT_FALSE(index >= vec_len(pm))) {
+ spp_printf(CNAT_INVALID_INDEX_TO_FREE_PORT, 0, 0);
+ return;
+ }
+
+ my_pm = pm + index;
+ bit = port2bit(base_port);
+
+#if DEBUG > 0
+ if(clib_bitmap_get_no_check(my_pm->bm, bit))
+ ASSERT(clib_bitmap_get_no_check(my_pm->bm, bit) == 0);
+#endif
+
+ cgn_clib_bitmap_set_no_check(my_pm->bm, bit);
+
+ my_pm->inuse -= 1;
+ if(base_port >= static_port_range) {
+ /* Clear the full flag. we can have a new dynamic session now */
+ my_pm->dyn_full = 0;
+ }
+
+ return;
+}
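+
+/*
+ * Note: cnat_port_free_v2 is the inverse of the allocators above; it
+ * sets the port's bit back in the bitmap and decrements inuse, so an
+ * alloc/free pair leaves the portmap unchanged.
+ */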
+
+void cnat_portmap_dump_v2 (cnat_portmap_v2_t *pm, u16 print_limit)
+{
+ int i;
+ u32 inuse =0;
+
+ ASSERT(pm);
+
+ for (i = 0; i < BITS_PER_INST; i++) {
+ if (PREDICT_FALSE(clib_bitmap_get_no_check (pm->bm, i) == 0)) {
+ if (PREDICT_TRUE(inuse++ < print_limit))
+ PLATFORM_DEBUG_PRINT(" %d", bit2port(i));
+ }
+ }
+ if (PREDICT_FALSE(inuse >= print_limit)) {
+ PLATFORM_DEBUG_PRINT("%d printed, print limit is %d\n",
+ inuse, print_limit);
+ }
+ PLATFORM_DEBUG_PRINT("\n");
+}
+
+
+/*
+ * cnat_ports_init
+ */
+clib_error_t *cnat_ports_init(vlib_main_t *vm)
+{
+ cnat_ports_main_t *mp = &cnat_ports_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+
+ /* suppress crypto-random port numbering */
+#ifdef SOON
+ if (spp_get_int_prop("no_crypto_random_ports") == 0)
+ crypto_random32(&seed);
+#endif
+
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(cnat_ports_init);
+
diff --git a/vnet/vnet/vcgn/cnat_ports.h b/vnet/vnet/vcgn/cnat_ports.h
new file mode 100644
index 00000000000..bc1fb0d24a8
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_ports.h
@@ -0,0 +1,208 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_ports.h - port database definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_PORTS_H__
+#define __CNAT_PORTS_H__
+
+#include "platform_common.h"
+#include "cnat_bulk_port_defs.h"
+
+#define PORTS_PER_ADDR 65536
+
+#define CNAT_INSTS PLATFORM_CNAT_INSTS
+
+#define BITS_PER_INST (PORTS_PER_ADDR)
+
+/*
+ * Ensure that at least a few 4-digit ports are available for RTSP
+ * in case we want to map 4-digit inside ports to 4-digit outside ports
+ */
+#define MIN_STATIC_PORT_RANGE_FOR_RTSP (9900)
+
+extern u8 my_instance_number;
+
+/*
+ * Now it is a 1-to-1 mapping between bit and port values
+ */
+static inline u16 bit2port (u32 bit)
+{
+ return bit;
+}
+
+static inline uword port2bit (u16 port)
+{
+ return port;
+}
+
+/*
+ * Port bitmap structure
+ * THIS structure is not used and is to be REMOVED....
+ */
+
+
+typedef struct {
+ u32 ipv4_address; /* native bit order */
+ u16 vrf;
+ u16 pad;
+ u32 threshold_crossed;
+ uword bm[(BITS_PER_INST + BITS(uword)-1)/BITS(uword)];
+} cnat_portmap_t;
+
+//cnat_portmap_t *cnat_portmap;
+
+
+typedef struct {
+ u32 inuse;
+ u32 delete_time;
+ u32 ipv4_address; /* native bit order */
+ u32 last_sent_timestamp;
+ uword bm[(BITS_PER_INST + BITS(uword)-1)/BITS(uword)];
+ u32 dyn_full;
+    u32 private_ip_users_count; /* number of private IPs (subscribers)
+                                   sharing this public IP */
+} cnat_portmap_v2_t;
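+
+/*
+ * Sizing note (assuming a 64-bit uword): with BITS_PER_INST = 65536,
+ * bm[] holds (65536 + 63)/64 = 1024 uwords, i.e. 8 KB of bitmap per
+ * public address.
+ */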
+
+
+typedef enum {
+ PORT_SINGLE=0,
+ PORT_PAIR=1,
+ PORT_S_EVEN=2,
+ PORT_S_ODD=3,
+} port_pair_t;
+
+typedef enum {
+ PORT_TYPE_DYNAMIC=0,
+ PORT_TYPE_STATIC=1,
+ PORT_TYPE_RTSP=2,
+} port_type_t;
+
+
+typedef enum {
+ PORT_ALLOC_ANY=1,
+ PORT_ALLOC_DIRECTED=2,
+} port_alloc_t;
+
+#define PORT_PROBE_LIMIT 20
+
+
+/*
+ * randq1
+ * Linear congruential random number generator with
+ * extensively studied properties. See Numerical Recipes in C
+ * 2nd Ed. page 284. Known to behave according to the test vector
+ * supplied in the text, on X86 and Octeon.
+ */
+static inline u32 randq1 (u32 prev)
+{
+ return (1664525L*prev + 1013904223L);
+}
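+
+/*
+ * Illustrative sketch (hypothetical helper, mirroring the logic in
+ * cnat_dynamic_port_alloc_v2): each randq1 call advances the caller's
+ * seed and yields the next pseudo-random value, e.g. randq1(0) ==
+ * 1013904223.
+ */
+static inline u16
+cnat_pick_dynamic_start_bit (u32 *seed, u16 static_port_range)
+{
+    *seed = randq1(*seed);
+    /* exclude the static range; the result lies in
+     * [static_port_range, BITS_PER_INST) */
+    return (*seed % (BITS_PER_INST - static_port_range))
+           + static_port_range;
+}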
+
+cnat_errno_t
+cnat_static_port_alloc_v2(
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 i_ipv4_address,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif /* NO_BULK_LOGGING */
+ , u16 ip_n_to_1
+ );
+
+cnat_errno_t
+cnat_mapped_static_port_alloc_v2 (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ u32 *index,
+ u32 ipv4_address,
+ u16 port
+#ifndef NO_BULK_LOGGING
+ , int *nfv9_log_req,
+ bulk_alloc_size_t bulk_size
+#endif
+ , u16 ip_n_to_1
+ );
+
+cnat_errno_t
+cnat_dynamic_port_alloc_v2(
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port,
+ u16 static_port_range
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif
+ , u16 ip_n_to_1,
+ u32 *rseed_ip
+ );
+
+
+cnat_errno_t
+cnat_dynamic_port_alloc_rtsp (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u16 start_range,
+ u16 end_range,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port
+#ifndef NO_BULK_LOGGING
+ , bulk_alloc_size_t bulk_size,
+ int *nfv9_log_req
+#endif
+ , u32 *rseed_ip
+ );
+
+void cnat_port_free_v2(
+ cnat_portmap_v2_t *pm,
+ int index,
+ port_pair_t ptype,
+ u16 base_port,
+ u16 static_port_range);
+
+void cnat_portmap_dump_v2(cnat_portmap_v2_t *pm,
+ u16 print_limit);
+
+
+
+cnat_errno_t
+nat64_static_port_alloc (
+ cnat_portmap_v2_t *pm,
+ port_alloc_t atype,
+ port_pair_t pair_type,
+ u32 *i_ipv6_address,
+ u16 i_port,
+ u32 *index,
+ u32 *o_ipv4_address,
+ u16 *o_port);
+
+
+
+#endif /* __CNAT_PORTS_H__ */
diff --git a/vnet/vnet/vcgn/cnat_show.c b/vnet/vnet/vcgn/cnat_show.c
new file mode 100644
index 00000000000..70476193797
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_show.c
@@ -0,0 +1,807 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_show.c - translation database show routines
+ *
+ * Copyright (c) 2007-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+
+#include "cnat_db.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_logging.h"
+#include "spp_ctx.h"
+#include "spp_timers.h"
+#include "platform_common.h"
+#include "cnat_syslog.h"
+#include "cnat_v4_pptp_alg.h"
+#include "platform_common.h"
+
+#ifndef TOBE_PORTED
+/* The following variable is in cnat_config_msg_handler.c which
+ * is to be ported later.. if required
+ */
+u32 total_address_pool_allocated = 0;
+#endif
+
+#ifndef NO_BULK_LOGGING
+#define CNAT_MY_VRFMAP_PRINT \
+PLATFORM_DEBUG_PRINT("i-uidx 0x%x o-uidx 0x%x i-vrfid 0x%x o-vrfid 0x%x\n" \
+ "status %d del time 0x%x tcp mss 0x%x pm list 0x%x\n" \
+ "bulk size %d\n" \
+ "ip n:1 %d\n" \
+ "NFV9 template index 0x%x\n" \
+ "SYSLOG template index 0x%x\n" \
+ "Netflow Session Logging %d \n" \
+ "Syslog Session Logging %d \n" \
+ "PCP Server 0x%x, Port %u \n", \
+ my_vrfmap->i_vrf, my_vrfmap->o_vrf, my_vrfmap->i_vrf_id, \
+ my_vrfmap->o_vrf_id, my_vrfmap->status, my_vrfmap->delete_time, \
+ my_vrfmap->tcp_mss, my_vrfmap->portmap_list, \
+ BULKSIZE_FROM_VRFMAP(my_vrfmap), \
+ my_vrfmap->ip_n_to_1, \
+ my_vrfmap->nfv9_logging_index, \
+ my_vrfmap->syslog_logging_index,\
+ my_vrfmap->nf_logging_policy, \
+ my_vrfmap->syslog_logging_policy, \
+ my_vrfmap->pcp_server_addr, \
+ my_vrfmap->pcp_server_port);
+#else
+#define CNAT_MY_VRFMAP_PRINT \
+PLATFORM_DEBUG_PRINT("i-uidx 0x%x o-uidx 0x%x i-vrfid 0x%x o-vrfid 0x%x\n" \
+ "status %d del time 0x%x tcp mss 0x%x pm list 0x%x\n" \
+ "NFV9 template index 0x%x\n ip n:1 %d\n", \
+ my_vrfmap->i_vrf, my_vrfmap->o_vrf, my_vrfmap->i_vrf_id, \
+ my_vrfmap->o_vrf_id, my_vrfmap->status, my_vrfmap->delete_time, \
+ my_vrfmap->tcp_mss, my_vrfmap->portmap_list, \
+ my_vrfmap->nfv9_logging_index, my_vrfmap->ip_n_to_1);
+#endif /* NO_BULK_LOGGING */
+
+#define CNAT_MY_LOGGING_INFO_PRINT \
+do { \
+ cnat_syslog_logging_info_t *my_syslog_info = 0; \
+ PLATFORM_DEBUG_PRINT("SYSLOG config: \n"); \
+ pool_foreach (my_syslog_info, cnat_syslog_logging_info_pool, ({ \
+ if (my_syslog_info->i_vrf == my_vrfmap->i_vrf) { \
+ PLATFORM_DEBUG_PRINT(" \
+ ipv4[0x%x], port[%u], hostname[%s]\n", \
+ my_syslog_info->ipv4_address, my_syslog_info->port, \
+ my_syslog_info->header_hostname); \
+ break; \
+ } \
+ })); \
+}while (0) \
+;
+
+
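+/* Worked example (illustrative): printf_ipv4(0x0A000001) prints
+ * "10.0.0.1". */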
+void printf_ipv4(u32 ad)
+{
+ u8 a __attribute__((unused)), b __attribute__((unused)),
+ c __attribute__((unused)), d __attribute__((unused));
+
+ a = ad>>24;
+ b = (ad>>16) & 0xFF;
+ c = (ad>>8) & 0xFF;
+ d = (ad>>0) & 0xFF;
+
+ PLATFORM_DEBUG_PRINT("%d.%d.%d.%d", a, b, c, d);
+}
+void cnat_main_db_entry_dump (cnat_main_db_entry_t *db)
+{
+ PLATFORM_DEBUG_PRINT("Main DB entry at %p, index %ld dst_ip %x\n",
+ db, db - cnat_main_db, db->dst_ipv4);
+ /* only dump hash next index if it's non EMPTY */
+ if (db->out2in_hash.next != EMPTY || db->in2out_hash.next != EMPTY)
+ PLATFORM_DEBUG_PRINT("out2in hash %u, in2out hash %u\n",
+ db->out2in_hash.next,
+ db->in2out_hash.next);
+ PLATFORM_DEBUG_PRINT("out2in key ipv4 0x%08X, port 0x%04X (%5d), vrf %d, protocol %s\n",
+ db->out2in_key.k.ipv4,
+ db->out2in_key.k.port,
+ db->out2in_key.k.port,
+ db->out2in_key.k.vrf & CNAT_VRF_MASK,
+ (db->out2in_key.k.vrf & CNAT_PRO_MASK) == CNAT_UDP ? "UDP" :
+ ((db->in2out_key.k.vrf & CNAT_PRO_MASK) == CNAT_TCP ? "TCP" :
+ ((db->in2out_key.k.vrf & CNAT_PRO_MASK) == CNAT_ICMP ? "ICMP" : "PPTP ALG")));
+
+ PLATFORM_DEBUG_PRINT("in2out key ipv4 0x%08X, port 0x%04X (%5d), vrf %d, protocol %s\n",
+ db->in2out_key.k.ipv4,
+ db->in2out_key.k.port,
+ db->in2out_key.k.port,
+ db->in2out_key.k.vrf & CNAT_VRF_MASK,
+ (db->in2out_key.k.vrf & CNAT_PRO_MASK) == CNAT_UDP ? "UDP" :
+ ((db->in2out_key.k.vrf & CNAT_PRO_MASK) == CNAT_TCP ? "TCP" :
+ ((db->in2out_key.k.vrf & CNAT_PRO_MASK) == CNAT_ICMP ? "ICMP" : "UNKNOWN")));
+
+ PLATFORM_DEBUG_PRINT("user %d, user ports (nxt) %d (prev) %d, vrfmap_index 0x%x\n",
+ db->user_index, db->user_ports.next, db->user_ports.prev,
+ db->vrfmap_index);
+ PLATFORM_DEBUG_PRINT("timeout %d \n", db->timeout);
+ PLATFORM_DEBUG_PRINT("flags 0x%x ", db->flags);
+
+ if (db->flags & CNAT_DB_FLAG_TCP_ACTIVE) {
+ PLATFORM_DEBUG_PRINT(" TCP_ACTIVE ");
+ } else if (db->flags & CNAT_DB_FLAG_UDP_ACTIVE) {
+ PLATFORM_DEBUG_PRINT(" UDP_ACTIVE ");
+ } else if (db->flags & CNAT_DB_FLAG_STATIC_PORT) {
+ PLATFORM_DEBUG_PRINT(" STATIC_PORT ");
+ }
+
+ PLATFORM_DEBUG_PRINT(" ALG dlt0 0x%02X dlt1 0x%02X\n", db->alg.alg_dlt[0], db->alg.alg_dlt[1]);
+ PLATFORM_DEBUG_PRINT("\n");
+
+ PLATFORM_DEBUG_PRINT("out2in_pkts: %u\n", db->out2in_pkts);
+ PLATFORM_DEBUG_PRINT("in2out_pkts: %u\n", db->in2out_pkts);
+ PLATFORM_DEBUG_PRINT("entry_expires: %u current time: %u\n", db->entry_expires, cnat_current_time);
+ PLATFORM_DEBUG_PRINT("-------------------------\n");
+}
+
+void cnat_user_db_entry_dump (cnat_user_db_entry_t *up)
+{
+ u32 db_entry_index, first_db_entry_index;
+ cnat_main_db_entry_t *ep;
+
+ PLATFORM_DEBUG_PRINT("User DB entry at %p, index %ld\n",
+ up, up - cnat_user_db);
+ PLATFORM_DEBUG_PRINT("translation list head index %u, %u translations portmapindex 0x%x\n",
+ up->translation_list_head_index,
+ up->ntranslations, up->portmap_index);
+ PLATFORM_DEBUG_PRINT("source ipv4 0x%x, source port 0x%x, vrf %d\n",
+ up->key.k.ipv4,
+ up->key.k.port,
+ up->key.k.vrf);
+ first_db_entry_index = db_entry_index = up->translation_list_head_index;
+ if (first_db_entry_index != EMPTY) {
+ PLATFORM_DEBUG_PRINT("Port translation list:\n");
+ do {
+ PLATFORM_DEBUG_PRINT(" [%d]\n", db_entry_index);
+ ep = cnat_main_db + db_entry_index;
+ db_entry_index = ep->user_ports.next;
+ } while (first_db_entry_index != db_entry_index);
+ } else {
+ PLATFORM_DEBUG_PRINT("WARNING: empty translation list!\n");
+ }
+ PLATFORM_DEBUG_PRINT("-------------------------\n");
+}
+
+void cnat_user_db_entry_dump_summary (cnat_user_db_entry_t *up)
+{
+    u32 db_entry_index, first_db_entry_index;
+    u32 total_entries = 0;
+    cnat_main_db_entry_t *ep;
+
+ PLATFORM_DEBUG_PRINT("User DB entry at %p, index %ld\n",
+ up, up - cnat_user_db);
+ PLATFORM_DEBUG_PRINT("translation list head index %u, %u translations portmapindex 0x%x\n",
+ up->translation_list_head_index,
+ up->ntranslations, up->portmap_index);
+ PLATFORM_DEBUG_PRINT("source ipv4 0x%x, source port 0x%x, vrf %d\n",
+ up->key.k.ipv4,
+ up->key.k.port,
+ up->key.k.vrf);
+ first_db_entry_index = db_entry_index = up->translation_list_head_index;
+ if (first_db_entry_index != EMPTY) {
+ PLATFORM_DEBUG_PRINT("Port translation list:\n");
+        do {
+            total_entries++;
+            ep = cnat_main_db + db_entry_index;
+            db_entry_index = ep->user_ports.next;
+        } while (first_db_entry_index != db_entry_index);
+ PLATFORM_DEBUG_PRINT("TOTAL_ENTRIES: %d\n", total_entries);
+ } else {
+ PLATFORM_DEBUG_PRINT("WARNING: empty translation list!\n");
+ }
+ PLATFORM_DEBUG_PRINT("-------------------------\n");
+}
+
+/* for internal development and UT only */
+void cnat_db_dump_main_by_index (int argc, unsigned long *argv)
+{
+ u32 index, i, len;
+ u32 active_count, scan_count;
+
+ if (argc != 1) {
+ PLATFORM_DEBUG_PRINT("invalid input %d\n", argc);
+ return;
+ }
+
+ index = argv[0];
+
+ len = vec_len(cnat_main_db);
+
+ active_count = pool_elts(cnat_main_db);
+
+ if (index >= active_count) {
+ PLATFORM_DEBUG_PRINT("Index %u >= total active entries %u\n", index, active_count);
+ return;
+ }
+
+ scan_count = 0;
+ for (i=0; i< len; i++) {
+ if(pool_is_free_index(cnat_main_db, i)) continue;
+
+ if (index == scan_count) {
+ cnat_main_db_entry_dump(cnat_main_db + i);
+ break;
+ }
+ scan_count++;
+ }
+}
+
+void cnat_db_dump_main (int argc, unsigned long *argv)
+{
+ cnat_main_db_entry_t *db;
+
+ pool_foreach(db, cnat_main_db, ({
+ cnat_main_db_entry_dump(db);
+ }));
+}
+
+void cnat_db_dump_main_summary (int argc, unsigned long *argv)
+{
+ cnat_main_db_entry_t *db;
+ u32 num_entries = 0;
+
+ pool_foreach(db, cnat_main_db, ({
+ num_entries++;
+ }));
+
+ PLATFORM_DEBUG_PRINT("\nNum main entries %d\n", num_entries);
+}
+
+void cnat_db_dump_user (int argc, unsigned long *argv)
+{
+ cnat_user_db_entry_t *up;
+
+ pool_foreach(up, cnat_user_db, ({
+ cnat_user_db_entry_dump(up);
+ }));
+}
+
+void cnat_db_dump_user_summary (int argc, unsigned long *argv)
+{
+ cnat_user_db_entry_t *up;
+
+ pool_foreach(up, cnat_user_db, ({
+ cnat_user_db_entry_dump_summary(up);
+ }));
+}
+
+void cnat_db_dump_hashes (int argc, unsigned long *argv)
+{
+ int i;
+
+ PLATFORM_DEBUG_PRINT("Main DB out2in hash:\n");
+ for (i = 0; i < vec_len(cnat_out2in_hash); i++) {
+ if (cnat_out2in_hash[i].next != EMPTY) {
+ PLATFORM_DEBUG_PRINT("[%d]: %u\n", i, cnat_out2in_hash[i].next);
+ }
+ }
+ PLATFORM_DEBUG_PRINT("Main DB in2out hash:\n");
+ for (i = 0; i < vec_len(cnat_in2out_hash); i++) {
+ if (cnat_in2out_hash[i].next != EMPTY) {
+ PLATFORM_DEBUG_PRINT("[%d]: %u\n", i, cnat_in2out_hash[i].next);
+ }
+ }
+
+ PLATFORM_DEBUG_PRINT("User hash:\n");
+ for (i = 0; i < vec_len(cnat_user_hash); i++) {
+ if (cnat_user_hash[i].next != EMPTY) {
+ PLATFORM_DEBUG_PRINT("[%d]: %u\n", i, cnat_user_hash[i].next);
+ }
+ }
+ PLATFORM_DEBUG_PRINT("-------------------------\n");
+}
+
+
+#ifdef OLD_VRFMAP
+
+void cnat_db_dump_cdb (int argc, unsigned long *argv)
+{
+ int k;
+ int verbose=0;
+ int all = 0;
+
+ if (argc > 0) {
+ verbose = 1;
+ }
+
+ if (argc > 1) {
+ all = 1;
+ }
+
+ PLATFORM_DEBUG_PRINT ("%d vrfmap vectors \n", vec_len(cnat_portmap_by_vrf));
+
+ for (k = 0; k < vec_len(cnat_portmap_by_vrf); k++) {
+ PLATFORM_DEBUG_PRINT("index%d: status %d i_vrf 0x%x o_vrf 0x%x\n", k,
+ cnat_portmap_by_vrf[k].status,
+ cnat_portmap_by_vrf[k].i_vrf,
+ cnat_portmap_by_vrf[k].o_vrf);
+ cnat_db_dump_address_portmap(verbose, all,
+ cnat_portmaps[k],
+ cnat_portmaps_inuse[k]);
+ }
+}
+
+void cnat_db_dump_i_vrf (int argc, unsigned long *argv)
+{
+ u32 k;
+ u32 vrf =0;
+ int verbose=0;
+ int all = 0;
+
+ if (!argc) {
+ PLATFORM_DEBUG_PRINT("need vrf input ,return\n");
+ return;
+ }
+
+ if (argc > 0) {
+ vrf = argv[0];
+ }
+
+ if (argc > 1) {
+ verbose = 1;
+ }
+
+ if (argc > 2) {
+ all = 1;
+ }
+
+ PLATFORM_DEBUG_PRINT ("%d vrfmap vectors \n", vec_len(cnat_portmap_by_vrf));
+
+ for (k = 0; k < vec_len(cnat_portmap_by_vrf); k++) {
+ if (cnat_portmap_by_vrf[k].i_vrf == vrf) {
+ PLATFORM_DEBUG_PRINT("%d: i_vrf 0x%x o_vrf 0x%x\n", k,
+ cnat_portmap_by_vrf[k].i_vrf,
+ cnat_portmap_by_vrf[k].o_vrf);
+ cnat_db_dump_address_portmap(verbose, all,
+ cnat_portmaps[k],
+ cnat_portmaps_inuse[k]);
+ return;
+ }
+ }
+ PLATFORM_DEBUG_PRINT("not found\n");
+}
+
+void cnat_db_dump_o_vrf (int argc, unsigned long *argv)
+{
+ u32 k;
+ int verbose=0;
+ int all = 0;
+ u32 vrf =0;
+
+ if (!argc) {
+ PLATFORM_DEBUG_PRINT("need vrf input ,return\n");
+ return;
+ }
+
+ if (argc > 0) {
+ vrf = argv[0];
+ }
+
+ if (argc > 1) {
+ verbose = 1;
+ }
+
+ if (argc > 2) {
+ all = 1;
+ }
+
+ PLATFORM_DEBUG_PRINT ("%d vrfmap vectors \n", vec_len(cnat_portmap_by_vrf));
+
+ for (k = 0; k < vec_len(cnat_portmap_by_vrf); k++) {
+ if (cnat_portmap_by_vrf[k].o_vrf == vrf) {
+ PLATFORM_DEBUG_PRINT("index%d: status %d i_vrf 0x%x o_vrf 0x%x\n", k,
+ cnat_portmap_by_vrf[k].status,
+ cnat_portmap_by_vrf[k].i_vrf,
+ cnat_portmap_by_vrf[k].o_vrf);
+ cnat_db_dump_address_portmap(verbose, all,
+ cnat_portmaps[k],
+ cnat_portmaps_inuse[k]);
+ return;
+ }
+ }
+ PLATFORM_DEBUG_PRINT("not found\n");
+}
+#endif
+
+#ifdef TOBE_PORTED
+/* This does not seem to be used */
+void cnat_db_mem_usage_cmd (int argc, unsigned long *argv)
+{
+ pool_header_t * p;
+ _VEC *_v;
+ u32 bitmap_bytes=0, free_indices_bytes=0, vec_bytes=0, total_bytes=0;
+
+ if (cnat_main_db) {
+ p = pool_header(cnat_main_db);
+ if (p->free_bitmap) {
+ _v = _vec_find(p->free_bitmap);
+ bitmap_bytes = _v->alen;
+ } else {
+ bitmap_bytes = 0;
+ }
+ if (p->free_indices) {
+ _v = _vec_find(p->free_indices);
+ free_indices_bytes = _v->alen;
+ } else {
+ free_indices_bytes = 0;
+ }
+ _v = _vec_find(cnat_main_db);
+ vec_bytes = _v->alen;
+ } else {
+ vec_bytes = 0;
+ }
+
+ total_bytes = bitmap_bytes + free_indices_bytes + vec_bytes;
+
+ PLATFORM_DEBUG_PRINT ("Main DB: %d total bytes, %d bitmap, %d indices, %d vec\n",
+ total_bytes, bitmap_bytes, free_indices_bytes, vec_bytes);
+ PLATFORM_DEBUG_PRINT (" vector length %d\n", vec_len(cnat_main_db));
+
+ if (cnat_user_db) {
+ p = pool_header(cnat_user_db);
+ if (p->free_bitmap) {
+ _v = _vec_find(p->free_bitmap);
+ bitmap_bytes = _v->alen;
+ } else {
+ bitmap_bytes = 0;
+ }
+ if (p->free_indices) {
+ _v = _vec_find(p->free_indices);
+ free_indices_bytes = _v->alen;
+ } else {
+ free_indices_bytes = 0;
+ }
+ _v = _vec_find(cnat_user_db);
+ vec_bytes = _v->alen;
+ } else {
+ vec_bytes = 0;
+ }
+
+ total_bytes = bitmap_bytes + free_indices_bytes + vec_bytes;
+
+ PLATFORM_DEBUG_PRINT ("User DB: %d total bytes, %d bitmap, %d indices, %d vec\n",
+ total_bytes, bitmap_bytes, free_indices_bytes, vec_bytes);
+ PLATFORM_DEBUG_PRINT (" vector length %d\n", vec_len(cnat_user_db));
+
+ _v = _vec_find(cnat_out2in_hash);
+ PLATFORM_DEBUG_PRINT("out2in hash: %d total bytes\n", _v->alen);
+
+ _v = _vec_find(cnat_in2out_hash);
+ PLATFORM_DEBUG_PRINT("in2out hash: %d total bytes\n", _v->alen);
+}
+#endif
+
+static void print_server_ip_address (vlib_main_t *vm, u32 ip)
+{
+ unsigned char bytes[4];
+ bytes[0] = ip & 0xFF;
+ bytes[1] = (ip >> 8) & 0xFF;
+ bytes[2] = (ip >> 16) & 0xFF;
+ bytes[3] = (ip >> 24) & 0xFF;
+ vlib_cli_output(vm, "\tIP Address %d.%d.%d.%d\n", bytes[0], bytes[1], bytes[2], bytes[3]);
+}
+
+void cnat_nfv9_show_collector (vlib_main_t *vm, cnat_nfv9_logging_info_t *my_nfv9_logging_info)
+{
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ my_nfv9_logging_info->server_index;
+
+ vlib_cli_output(vm,"\tVRF - 0x%x - %s\n", my_nfv9_logging_info->i_vrf,
+ my_nfv9_logging_info->deleted?"DELETED":"ACTIVE");
+ print_server_ip_address(vm, clib_net_to_host_u32(server->ipv4_address));
+ vlib_cli_output(vm,"\tIP port %d\n", server->port);
+ vlib_cli_output(vm,"\tTimeout %d\n", server->timeout_rate);
+ vlib_cli_output(vm,"\tRefresh %d\n", server->refresh_rate);
+ vlib_cli_output(vm,"\tMax PkSz %d\n", my_nfv9_logging_info->max_length_minus_max_record_size);
+}
+
+void cnat_db_dump_policy (int argc, unsigned long *argv)
+{
+
+ PLATFORM_CNAT_DB_DUMP_POLICY_PRINT();
+
+ if (cnat_nfv9_global_info.cnat_nfv9_init_done) {
+ if (cnat_nfv9_global_info.cnat_nfv9_global_collector_index != EMPTY) {
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info;
+ nfv9_server_info_t *server __attribute__((unused));
+
+ my_nfv9_logging_info = cnat_nfv9_logging_info_pool +
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index;
+ server = nfv9_server_info_pool +
+ my_nfv9_logging_info->server_index;
+
+ PLATFORM_DEBUG_PRINT("NFv9 logging ip 0x%x port 0x%x refresh-rate %d timeout %d\n",
+ server->ipv4_address,
+ server->port,
+ server->refresh_rate,
+ server->timeout_rate);
+ PLATFORM_DEBUG_PRINT("NFv9 path_mtu = %d\n",
+ my_nfv9_logging_info->max_length_minus_max_record_size);
+ } else {
+ PLATFORM_DEBUG_PRINT("NFv9 global logging is not configured\n");
+ }
+ } else {
+ PLATFORM_DEBUG_PRINT("NFv9 LOGGING is not configured\n");
+ }
+
+}
+
+#ifdef OLD_VRFMAP
+void cnat_show_cdb (int verbose)
+{
+ int k, l, i;
+ for (i = 0; i < vec_len(cnat_portmap_by_vrf); i++) {
+ PLATFORM_DEBUG_PRINT("i_vrf %d : o_vrf %d\n",
+ cnat_portmap_by_vrf[i].i_vrf,
+ cnat_portmap_by_vrf[i].o_vrf);
+ }
+
+ PLATFORM_DEBUG_PRINT("port limit %d\n", cnat_main_db_max_ports_per_user);
+
+ PLATFORM_DEBUG_PRINT ("%d portmap vectors\n", vec_len(cnat_portmaps));
+
+ for (k = 0; k < vec_len(cnat_portmaps); k++) {
+ cnat_portmap_t *pm;
+ u16 *inuse;
+ pm = cnat_portmaps[k];
+ inuse = cnat_portmaps_inuse[k];
+ for (l = 0; l < vec_len(pm); l++) {
+ if (inuse[l] || verbose ) {
+ u32 net_address;
+ net_address =
+ spp_host_to_net_byte_order_32((pm+l)->ipv4_address);
+ printf_ipv4(net_address);
+ PLATFORM_DEBUG_PRINT (": %d inuse\n", inuse[l]);
+ if (verbose && inuse[l]) {
+ cnat_portmap_dump (pm+l, inuse+l);
+ }
+ }
+ }
+ }
+}
+#endif
+
+
+
+/* v2 show command */
+void cnat_show_address_portmap_sumary (cnat_portmap_v2_t *pm)
+{
+ cnat_portmap_v2_t *my_pm =0;
+ u32 first_address = 0;
+ u32 second_address = 0;
+ u32 last_address = 0;
+ u32 i, pm_len;
+
+ if ((pm_len = vec_len(pm))) {
+ PLATFORM_DEBUG_PRINT("%d portmap in this list 0x%lx\n",
+ pm_len, (u32)pm);
+ for (i = 0; i < pm_len; i++) {
+ my_pm = pm + i;
+ if (!first_address) {
+ first_address = my_pm->ipv4_address;
+ } else if (!second_address) {
+ second_address = my_pm->ipv4_address;
+ }
+ last_address = my_pm->ipv4_address;
+ }
+
+ if (first_address) {
+ PLATFORM_DEBUG_PRINT("1. 0x%08x", first_address);
+ }
+ if (second_address) {
+ PLATFORM_DEBUG_PRINT(", 2. 0x%08x", second_address);
+ }
+
+ if ((last_address != first_address) &&
+ (last_address != second_address)) {
+ PLATFORM_DEBUG_PRINT(", ....., %d. 0x%08x", pm_len, last_address);
+ }
+ PLATFORM_DEBUG_PRINT("\n");
+ } else {
+ PLATFORM_DEBUG_PRINT("ZERO POOL ADDRESSES in this list 0x%x \n", (u32)pm);
+ }
+}
+
+
+void cnat_show_address_portmap (int verbose, int all,
+ cnat_portmap_v2_t *pm, u16 port_limit)
+{
+ cnat_portmap_v2_t *my_pm =0;
+ u32 i, pm_len;
+
+ pm_len = vec_len(pm);
+ if (!all) {
+ cnat_show_address_portmap_sumary(pm);
+ } else {
+ PLATFORM_DEBUG_PRINT("%d portmap in this list 0x%x \n", pm_len, (u32)pm);
+ }
+
+ for (i = 0; i < pm_len; i++) {
+
+ my_pm = pm + i;
+ if (all) {
+ PLATFORM_DEBUG_PRINT("pm:0x%x ip address:0x%x del_time 0x%x inuse:%d\n",
+ (u32)my_pm, my_pm->ipv4_address, my_pm->delete_time, my_pm->inuse);
+ } else if (my_pm->inuse) {
+ PLATFORM_DEBUG_PRINT("pm:0x%x ip address:0x%x inuse:%d\n",
+ (u32)my_pm, my_pm->ipv4_address, my_pm->inuse);
+ }
+
+ if (verbose && (my_pm->inuse)) {
+ if(PREDICT_FALSE(!port_limit)) {
+ cnat_portmap_dump_v2 (my_pm, cnat_main_db_max_ports_per_user);
+ }
+ else {
+ cnat_portmap_dump_v2 (my_pm, port_limit);
+ }
+ }
+ }
+
+ PLATFORM_DEBUG_PRINT("\n");
+}
+
+
+void cnat_show_cdb_v2 (int verbose, int all)
+{
+ cnat_vrfmap_t *my_vrfmap = 0;
+ cnat_portmap_v2_t *pm =0;
+ PLATFORM_DEBUG_PRINT("port limit %d\n", cnat_main_db_max_ports_per_user);
+ PLATFORM_DEBUG_PRINT("total address pool allocated %d\n", total_address_pool_allocated);
+ PLATFORM_DEBUG_PRINT("icmp rate limit %d (per core %d)\n",
+ cnat_main_db_icmp_rate_limit, cnat_main_db_icmp_rate_limit_core);
+ PLATFORM_DEBUG_PRINT("dynamic port range start %d\n", cnat_static_port_range);
+ if (pptp_cfg.enable == PPTP_DISABLED) {
+ PLATFORM_DEBUG_PRINT("PPTP alg disabled \n");
+ } else {
+ PLATFORM_DEBUG_PRINT("PPTP alg enabled \n");
+ }
+
+ if (ftp_alg_enabled) {
+ PLATFORM_DEBUG_PRINT("FTP alg enabled\n");
+ } else {
+ PLATFORM_DEBUG_PRINT("FTP alg disabled\n");
+ }
+
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ CNAT_MY_VRFMAP_PRINT
+ CNAT_MY_LOGGING_INFO_PRINT
+ PLATFORM_DEBUG_PRINT("per vrf port limit %d\n", my_vrfmap->port_limit);
+ pm = my_vrfmap->portmap_list;
+ cnat_show_address_portmap(verbose, all, pm, my_vrfmap->port_limit);
+
+ }));
+}
+
+
+void cnat_show_cdb_command_v2(int argc, unsigned long *argv)
+{
+ int verbose=0;
+ int all = 0;
+
+ if (argc > 0) {
+ verbose = 1;
+ }
+
+ if (argc > 1) {
+ all = 1;
+ }
+
+ cnat_show_cdb_v2(verbose, all);
+}
+
+void cnat_show_ivrf_command_v2 (int argc, unsigned long *argv)
+{
+ u32 vrf =0;
+ int verbose=0;
+ int all = 0;
+ cnat_vrfmap_t *my_vrfmap = 0;
+ cnat_portmap_v2_t *pm =0;
+
+ if (!argc) {
+ PLATFORM_DEBUG_PRINT("need vrf input ,return\n");
+ return;
+ }
+ if (argc > 0) {
+ vrf = argv[0];
+ }
+ if (argc > 1) {
+ verbose = 1;
+ }
+ if (argc > 2) {
+ all = 1;
+ }
+ PLATFORM_DEBUG_PRINT ("%lld vrfmap vectors \n", pool_elts(cnat_map_by_vrf));
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if (my_vrfmap->i_vrf == vrf) {
+ CNAT_MY_VRFMAP_PRINT
+ pm = my_vrfmap->portmap_list;
+ cnat_show_address_portmap(verbose, all, pm,my_vrfmap->port_limit);
+ return;
+ }
+ }));
+ PLATFORM_DEBUG_PRINT("not found\n");
+}
+
+void cnat_show_ovrf_command_v2 (int argc, unsigned long *argv)
+{
+ u32 not_found =1;
+ u32 vrf =0;
+ int verbose=0;
+ int all = 0;
+ cnat_vrfmap_t *my_vrfmap = 0;
+ cnat_portmap_v2_t *pm =0;
+
+ if (!argc) {
+ PLATFORM_DEBUG_PRINT("need vrf input ,return\n");
+ return;
+ }
+ if (argc > 0) {
+ vrf = argv[0];
+ }
+ if (argc > 1) {
+ verbose = 1;
+ }
+ if (argc > 2) {
+ all = 1;
+ }
+ PLATFORM_DEBUG_PRINT("%d vrfmap vectors \n", pool_elts(cnat_map_by_vrf));
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if (my_vrfmap->o_vrf == vrf) {
+ CNAT_MY_VRFMAP_PRINT
+ pm = my_vrfmap->portmap_list;
+ cnat_show_address_portmap(verbose, all, pm,my_vrfmap->port_limit);
+ not_found = 0;
+ }
+ }));
+ if (not_found) {
+ PLATFORM_DEBUG_PRINT("not found\n");
+ }
+}
+
+void cnat_timeout_db_entry_dump (cnat_timeout_db_entry_t *up)
+{
+ u32 db_entry_index __attribute__((unused)),
+ first_db_entry_index __attribute__((unused));
+
+ PLATFORM_DEBUG_PRINT("Timeout DB entry at index %ld\n", up - cnat_timeout_db);
+ PLATFORM_DEBUG_PRINT("Desnt key 0x%16llx\n", up->t_key.timeout_key.key64);
+ PLATFORM_DEBUG_PRINT("Timeout value %d\n", up->t_key.timeout_value);
+ PLATFORM_DEBUG_PRINT("Hash Next 0x%x\n", up->t_hash.next);
+
+}
+
+void cnat_db_dump_timeout ()
+{
+ cnat_timeout_db_entry_t *up;
+ pool_header_t *h;
+ u32 used __attribute__((unused)), free __attribute__((unused));
+
+ h = pool_header(cnat_timeout_db);
+ free = vec_len(h->free_indices);
+ used = (vec_len(cnat_timeout_db) - free);
+
+ PLATFORM_DEBUG_PRINT("Timeout DB Free %d, Used %d\n",free, used);
+
+ pool_foreach(up, cnat_timeout_db, ({
+ cnat_timeout_db_entry_dump(up);
+ }));
+}
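+
+/*
+ * Note: used entries are computed as the pool's vector length minus
+ * the free-list length, the standard vppinfra pool accounting.
+ */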
+
diff --git a/vnet/vnet/vcgn/cnat_show_api.h b/vnet/vnet/vcgn/cnat_show_api.h
new file mode 100644
index 00000000000..5904c7e2dd6
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_show_api.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CNAT_SHOW_API_H__
+#define __CNAT_SHOW_API_H__
+
+typedef struct _spp_api_cnat_v4_show_inside_entry_req {
+ u16 _spp_msg_id;
+ u16 vrf_id;
+ u32 ipv4_addr;
+ u16 start_port;
+ u16 end_port;
+ u8 flags;
+ u8 all_entries;
+ u8 protocol;
+} spp_api_cnat_v4_show_inside_entry_req_t;
+
+typedef struct _spp_api_cnat_v4_show_outside_entry_req {
+ u16 _spp_msg_id;
+ u16 vrf_id;
+ u32 ipv4_addr;
+ u16 start_port;
+ u16 end_port;
+ u8 flags;
+ u8 protocol;
+} spp_api_cnat_v4_show_outside_entry_req_t;
+
+
+#endif
diff --git a/vnet/vnet/vcgn/cnat_show_response.h b/vnet/vnet/vcgn/cnat_show_response.h
new file mode 100644
index 00000000000..bec1bd97245
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_show_response.h
@@ -0,0 +1,580 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_show_response.h show command response structs
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_SHOW_RESPONSE_H__
+#define __CNAT_SHOW_RESPONSE_H__
+
+/*
+ * Flags indicating the type of translation entry
+ */
+#define CNAT_TRANSLATION_ENTRY_ALL 0x0
+#define CNAT_TRANSLATION_ENTRY_STATIC 0x1
+#define CNAT_TRANSLATION_ENTRY_ALG 0x2
+#define CNAT_TRANSLATION_ENTRY_DYNAMIC 0x4
+
+/* for PCP support */
+#define CNAT_TRANSLATION_ENTRY_PCPI_DYNAMIC 0x08
+#define CNAT_TRANSLATION_ENTRY_PCPE_DYNAMIC 0x10
+
+#define MAX_NODE_NAME_LEN 18
+#define MAX_CTR_NAME_LEN 10
+
+/*
+ * show translation entry response structures
+ */
+typedef struct {
+ u16 call_id;
+ u16 cnat_call_id; /* mapped call Id */
+ u16 dst_call_id; /* dest call id */
+} cnat_v4_show_gre_entry;
+
+typedef struct {
+ u16 msg_id;
+ u16 rc; /* o/p parameter. */
+ u16 num_entries; /* Number of entries sent as output */
+ u16 vrf_id; /* vrf id */
+ u32 pns_ip;
+ cnat_v4_show_gre_entry entries[0];
+} cnat_v4_show_gre_entry_resp;
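+/*
+ * Note: the entries[0] member above is the pre-C99 "flexible array
+ * member" idiom; each response is allocated with extra room so that
+ * num_entries records can be appended after the fixed header fields.
+ * The same convention is used by the other response structs below.
+ */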
+
+/*
+ * show translation entry response structures
+ */
+typedef struct {
+ u32 ipv4_addr;
+ u16 src_port;
+ u16 cnat_port; /* port which replaced the src port */
+ u8 protocol;
+ u8 pad;
+ u16 flags;
+ u16 nsessions;
+ u32 in2out_packets;
+ u32 out2in_packets;
+} cnat_v4_show_translation_entry;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc; /* o/p parameter. */
+ u8 num_entries; /* Number of entries sent as output */
+ u16 vrf_id; /* vrf id */
+ u16 pad;
+ cnat_v4_show_translation_entry entries[0];
+} cnat_v4_show_entry_resp;
+
+/*
+ * show free/used ipv4 address/port response structure
+ */
+typedef struct {
+ u32 ip_addr;
+ u32 free_used_ports;
+} cnat_v4_show_free_used_entry;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 count;
+ u32 max_ports;
+ cnat_v4_show_free_used_entry entry_list[0];
+} cnat_v4_show_free_used_entry_resp;
+
+/*
+ * Node name to id mapping
+ */
+typedef struct {
+ u8 node_id;
+ u8 pad;
+ char node_name[MAX_NODE_NAME_LEN];
+} cnat_statistics_node_name;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 num_nodes;
+ cnat_statistics_node_name node_name_array[0];
+} cnat_statistics_node_name_mapping_resp;
+
+/*
+ * Counter name to id mapping
+ */
+typedef struct {
+ u8 node_id;
+ u8 counter_id;
+ char counter_name[MAX_CTR_NAME_LEN];
+} cnat_statistics_counter_name;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 num_counters;
+ cnat_statistics_counter_name counter_name_array[0];
+} cnat_statistics_counter_name_mapping_resp;
+
+
+/*
+ * Node name to id mapping
+ */
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 num_nodes;
+ u32 pad;
+ u64 counters [0];
+} cnat_statistics_counter_values;
+
+/*
+ * Summary Stats
+ */
+typedef struct {
+ u32 eaddr;
+ u32 ports_used;
+} pool_address_usage_t;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad;
+ u16 max_pkt_size;
+ u16 pool_address_copied;
+ u32 active_translations;
+ u32 translation_create_rate;
+ u32 translation_delete_rate;
+ u32 in2out_forwarding_rate;
+ u32 out2in_forwarding_rate;
+ u32 dummy;
+ u64 in2out_drops_port_limit_exceeded;
+ u64 in2out_drops_system_limit_reached;
+ u64 in2out_drops_resource_depletion;
+ u64 no_translation_entry_drops;
+ u64 pptp_active_tunnels;
+ u64 pptp_active_channels;
+ u64 pptp_ctrlmsg_drops;
+ u32 no_sessions;
+
+ u32 pool_address_totally_free;
+ u32 pool_address_used; /* The following array size will be the lesser
+ of (pool_address_used, 200) */
+ u32 num_subscribers;
+ u64 drops_sessiondb_limit_exceeded;
+ u64 in2out_drops_src_ip_no_config; // for deterministic nat on brahmos
+ pool_address_usage_t pool_address_usage[0];
+} cnat_show_statistics_summary_resp;
+
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad;
+ u64 o2i_tcp_seq_mismatch_drop;
+ u64 o2i_tcp_seq_mismatch;
+ u64 o2i_sessions_created;
+ u64 o2i_end_point_filter_drop;
+} cnat_show_counters_summary_resp;
+
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad;
+
+ /*
+ * XLAT statistics
+ */
+ u64 v6_to_v4_tcp_input_count;
+ u64 v6_to_v4_tcp_non_translatable_drop_count;
+ u64 v6_to_v4_tcp_invalid_next_hdr_drop_count;
+ u64 v6_to_v4_tcp_no_db_drop_count;
+ u64 v6_to_v4_tcp_output_count;
+
+ u64 v4_to_v6_tcp_input_count;
+ u64 v4_to_v6_tcp_no_db_drop_count;
+ u64 v4_to_v6_tcp_output_count;
+
+ u64 v6_to_v4_udp_input_count;
+ u64 v6_to_v4_udp_non_translatable_drop_count;
+ u64 v6_to_v4_udp_invalid_next_hdr_drop_count;
+ u64 v6_to_v4_udp_no_db_drop_count;
+ u64 v6_to_v4_udp_output_count;
+
+ u64 v4_to_v6_udp_input_count;
+ u64 v4_to_v6_udp_no_db_drop_count;
+ u64 v4_to_v6_udp_output_count;
+ u64 v4_to_v6_udp_frag_crc_zero_drop_count;
+ u64 v4_to_v6_udp_crc_zero_recycle_sent_count;
+ u64 v4_to_v6_udp_crc_zero_recycle_drop_count;
+
+ u64 v6_to_v4_icmp_qry_input_count;
+ u64 v6_to_v4_icmp_no_db_drop_count;
+ u64 v6_to_v4_icmp_frag_drop_count;
+ u64 v6_to_v4_icmp_invalid_next_hdr_drop_count;
+ u64 v6_to_v4_icmp_non_translatable_drop_count;
+ u64 v6_to_v4_icmp_non_translatable_fwd_count;
+ u64 v6_to_v4_icmp_unsupported_type_drop_count;
+ u64 v6_to_v4_icmp_err_output_count;
+ u64 v6_to_v4_icmp_qry_output_count;
+
+ u64 v4_to_v6_icmp_qry_input_count;
+ u64 v4_to_v6_icmp_no_db_drop_count;
+ u64 v4_to_v6_icmp_frag_drop_count;
+ u64 v4_to_v6_icmp_unsupported_type_drop_count;
+ u64 v4_to_v6_icmp_err_output_count;
+ u64 v4_to_v6_icmp_qry_output_count;
+
+ u64 v6_to_v4_subsequent_frag_input_count;
+ u64 v6_to_v4_subsequent_frag_non_translatable_drop_count;
+ u64 v6_to_v4_subsequent_frag_invalid_next_hdr_drop_count;
+ u64 v6_to_v4_subsequent_frag_no_db_drop_count;
+ u64 v6_to_v4_subsequent_frag_output_count;
+
+ u64 v4_to_v6_subsequent_frag_input_count;
+ u64 v4_to_v6_subsequent_frag_no_db_drop_count;
+ u64 v4_to_v6_subsequent_frag_output_count;
+
+ u64 v4_to_v6_subsequent_frag_drop_count;
+ u64 v4_to_v6_subsequent_frag_throttled_count;
+ u64 v4_to_v6_subsequent_frag_timeout_drop_count;
+ u64 v4_to_v6_subsequent_frag_tcp_input_count;
+ u64 v4_to_v6_subsequent_frag_udp_input_count;
+ u64 v4_to_v6_subsequent_frag_icmp_input_count;
+
+ u64 v6_to_v4_options_input_count;
+ u64 v6_to_v4_options_drop_count;
+ u64 v6_to_v4_options_forward_count;
+ u64 v6_to_v4_options_no_db_drop_count;
+ u64 v6_to_v4_unsupp_proto_count;
+
+ u64 v4_to_v6_options_input_count;
+ u64 v4_to_v6_options_drop_count;
+ u64 v4_to_v6_options_forward_count;
+ u64 v4_to_v6_options_no_db_drop_count;
+ u64 v4_to_v6_unsupp_proto_count;
+
+ u64 v4_icmp_gen_count;
+ u64 v6_icmp_gen_count;
+} xlat_show_statistics_summary_resp;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad;
+ /* Total v4 packets to BR */
+ u64 v4_to_v6_input_total_count;
+ /* Total v4 tunneled packets to BR */
+ u64 v4_to_v6_41_input_total_count;
+ /* proto 41 packets without the minimum v6 payload of 40 bytes */
+ u64 v4_to_v6_41_insuff_v6payld_count;
+ /* total proto 41 packets being considered for decap */
+ u64 v4_to_v6_41_valid_count;
+ /* proto 41 packets that failed security check*/
+ u64 v4_to_v6_41_sec_check_fail_count;
+ /* packets with no active db entry */
+ u64 v4_to_v6_no_db_drop_count;
+ /* proto 41 packets actually getting decapped */
+ u64 v4_to_v6_41_decap_count;
+ /* total v4 packets which are neither icmp nor 41 */
+ u64 v4_to_v6_unsupported_protocol_count;
+ /* v4 tunneled packets with invalid v6 source address */
+ u64 v4_to_v6_41_invalid_v6_source;
+ /* total icmpv4 packets destined to BR */
+ u64 v4_forus_icmp_input_count;
+ /* total icmpv4 echo replies by BR */
+ u64 v4_icmp_reply_count;
+ /* total icmpv4 error messages translated to icmpv6 by BR */
+ u64 v4_to_v6_icmp_translation_count;
+ /* total packets with icmpv4 type/code which are not supported by BR */
+ u64 v4_icmp_unsupported_count;
+ /* total icmpv4 packets which are rate-limited by BR */
+ u64 v4_icmp_throttled_count;
+ /* total ICMPv4 error messages which could not be translated */
+ u64 v4_icmp_non_translatable_drop_count;
+
+ /* ipv4 defrag stats */
+ u64 v4_to_v6_frag_input_count;
+ u64 v4_to_v6_frag_sec_check_fail_count;
+ u64 v4_to_v6_frag_reassem_count;
+ u64 v4_to_v6_frag_timeout_drop_count;
+ u64 v4_to_v6_frag_icmp_input_count;
+ u64 v4_to_v6_frag_41_insuff_v6payld_count;
+ u64 v4_to_v6_frag_no_db_drop_count;
+ u64 v4_to_v6_frag_unsupported_protocol_count;
+ u64 v4_to_v6_frag_41_invalid_v6_source;
+ u64 v4_to_v6_frag_throttled_count;
+ u64 v4_to_v6_frag_dup_count;
+ u64 v4_to_v6_frag_reassem_frag_count;
+ u64 v4_to_v6_frag_disable_count;
+ u64 v4_to_v6_frag_drop_count;
+
+ /* total v6 packets input to BR */
+ u64 v6_to_v4_total_input_count;
+ /* v6 packets with no active db entry */
+ u64 v6_to_v4_no_db_drop_count;
+ /* forus v6 packets with next header other than icmpv6 */
+ u64 v6_to_v4_forus_unsupp_proto_count;
+ /* total v6 packets that got tunneled */
+ u64 v6_to_v4_encap_count;
+ /* total icmpv6 packets destined to BR */
+ u64 v6_forus_icmp_input_count;
+ /* total icmpv6 echo replies by BR */
+ u64 v6_icmp_reply_count;
+ /* total icmpv6 PTB messages generated by BR */
+ u64 v6_ptb_generated_count;
+ /* total ipv6 packets for which PTBv6 was NOT generated by BR */
+ u64 v6_ptb_not_generated_drop_count;
+ /* total icmpv6 Neighbor Advertisements generated by BR */
+ u64 v6_na_generated_count;
+ /* total icmpv6 TTL expiry messages generated by BR */
+ u64 v6_ttl_expiry_generated_count;
+ /* total ICMPv6 fragments, which are dropped by BR */
+ u64 v6_to_v4_frag_icmp_input_count;
+ /* total packets with icmpv6 type/code which are not supported by BR */
+ u64 v6_icmp_unsupported_count;
+ /* total icmpv6 packets which are rate-limited by BR */
+ u64 v6_icmp_throttled_count;
+} v6rd_show_statistics_summary_resp;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad;
+
+ /* Total Incoming Count */
+ u64 v4_input_count;
+ /* Total Drop Count */
+ u64 v4_drop_count;
+ /* Total Output Count */
+ u64 v4_to_v6_output_count;
+ /* TCP Incoming Count */
+ u64 v4_tcp_input_count;
+ /* TCP Output Count */
+ u64 v4_tcp_output_count;
+ /* UDP Incoming Count */
+ u64 v4_udp_input_count;
+ /* UDP Output Count */
+ u64 v4_udp_output_count;
+ /* ICMPv4 Incoming Count */
+ u64 v4_icmp_input_count;
+ /* ICMPv4 Output Count */
+ u64 v4_to_v6_icmp_output_count;
+ /* Invalid UIDB Drop Count */
+ u64 v4_invalid_uidb_drop_count;
+ /* NoDb Drop Count */
+ u64 v4_no_db_drop_count;
+ /* TTL Expire Drop Count */
+ u64 v4_ttl_expire_drop_count;
+ /* Invalid IP Destination Drop Count */
+ u64 v4_invalid_destination_prefix_drop_count;
+ /* Packet Exceeding Path MTU Drop Count */
+ u64 v4_path_mtu_exceed_count;
+ /* Unsupported Protocol Drop Count */
+ u64 v4_invalid_proto_type_drop_count;
+ /* ICMPv4 Generated for TTL Expire Count */
+ u64 v4_ttl_expiry_generated_count;
+ /* ICMPv4 Generated for Error Count */
+ u64 v4_icmp_error_gen_count;
+ /* ICMPv4 Packets Rate-Limited Count */
+ u64 v4_icmp_throttled_drop_count;
+ /* TCP MSS Changed Count */
+ u64 v4_tcp_mss_changed_count;
+
+ /* Total Incoming Count */
+ u64 v6_input_count;
+ /* Total Drop Count */
+ u64 v6_drop_count;
+ /* Total Output Count */
+ u64 v6_to_v4_output_count;
+ /* TCP Incoming Count */
+ u64 v6_tcp_input_count;
+ /* TCP Output Count */
+ u64 v6_tcp_output_count;
+ /* UDP Incoming Count */
+ u64 v6_udp_input_count;
+ /* UDP Output Count */
+ u64 v6_udp_output_count;
+ /* ICMPv4 Incoming Count */
+ u64 v6_icmpv4_input_count;
+ /* ICMPv4 Output Count */
+ u64 v6_icmpv4_output_count;
+ /* Invalid UIDB Drop Count */
+ u64 v6_invalid_uidb_drop_count;
+ /* NoDb Drop Count */
+ u64 v6_no_db_drop_count;
+ /* TTL Expire Drop Count */
+ u64 v6_ttl_expire_drop_count;
+ /* Invalid IPv6 Destination Drop Count */
+ u64 v6_invalid_destination_drop_count;
+ /* Invalid Source Prefix Drop Count */
+ u64 v6_invalid_source_prefix_drop_count;
+ /* Unsupported Protocol Drop Count */
+ u64 v6_invalid_proto_type_drop_count;
+ /* ICMPv6 Input Count */
+ u64 v6_icmp_input_count;
+ /* ICMPv6 Invalid UIDB Drop Count */
+ u64 v6_icmp_invalid_uidb_drop_count;
+ /* ICMPv6 NoDb Drop Count */
+ u64 v6_icmp_no_db_drop_count;
+ /* ICMPv6 TTL Expire Drop Count */
+ u64 v6_icmp_ttl_expire_drop_count;
+ /* ICMPv6 Invalid IPv6 Destination Drop Count */
+ u64 v6_icmp_invalid_destination_drop_count;
+ /* ICMPv6 Unsupported Type Drop Count */
+ u64 v6_icmp_unsupported_type_drop_count;
+ /* ICMPv6 Invalid NxtHdr Drop Count*/
+ u64 v6_icmp_unsupported_nxthdr_drop_count;
+ /* ICMPv6 Frag Drop Count */
+ u64 v6_icmp_frag_drop_count;
+ /* ICMPv6 Forus Count */
+ u64 v6_forus_icmp_input_count;
+ /* ICMPv6 Echo Response Received Count */
+ u64 v6_received_echo_response_count;
+ /* ICMPv6 Echo Replies Count */
+ u64 v6_echo_reply_count;
+ /* ICMPv6 Translated to ICMPV4 Output Count*/
+ u64 v6_to_v4_icmp_output_count;
+ /* ICMPv6 Generated for TTL Expire Count */
+ u64 v6_ttl_expiry_generated_count;
+ /* ICMPv6 Generated for Error Count */
+ u64 v6_icmp_error_gen_count;
+ /* ICMPv6 Packets Rate-Limited Count */
+ u64 v6_icmp_throttled_drop_count;
+ /* TCP MSS Changed Count */
+ u64 v6_tcp_mss_changed_count;
+
+ /*Total Input Count*/
+ u64 v4_to_v6_frag_input_count;
+ /*Total Drop Count*/
+ u64 v4_to_v6_frag_drop_count;
+ /*Reassembled Output Count*/
+ u64 v4_to_v6_frag_reassem_count;
+
+ /*TCP Input Count*/
+ u64 v4_to_v6_frag_tcp_input_count;
+ /*UDP Input Count*/
+ u64 v4_to_v6_frag_udp_input_count;
+ /*ICMPv4 Input Count*/
+ u64 v4_to_v6_frag_icmp_input_count;
+
+ /*Invalid UIDB Drop Count */
+ u64 v4_to_v6_frag_invalid_uidb_drop_count;
+ /*NoDb Drop Count*/
+ u64 v4_to_v6_frag_no_db_drop_count;
+ /*Unsupported Protocol Drop Count*/
+ u64 v4_to_v6_frag_invalid_proto_type_drop_count;
+ /*Throttled Count*/
+ u64 v4_to_v6_frag_throttled_count;
+ /*Timeout Drop Count*/
+ u64 v4_to_v6_frag_timeout_drop_count;
+ /*Duplicates Drop Count*/
+ u64 v4_to_v6_frag_dup_count;
+
+ /*Total Input Count*/
+ u64 v6_to_v4_inner_frag_input_count;
+ /*Total Drop Count*/
+ u64 v6_to_v4_inner_frag_drop_count;
+ /*Total Output Count*/
+ u64 v6_to_v4_inner_frag_output_count;
+
+ /*TCP Input Count*/
+ u64 v6_to_v4_inner_frag_tcp_input_count;
+ /*UDP Input Count*/
+ u64 v6_to_v4_inner_frag_udp_input_count;
+ /*ICMPv4 Input Count*/
+ u64 v6_to_v4_inner_frag_icmp_input_count;
+
+ /*Invalid Source Prefix Drop Count*/
+ u64 v6_to_v4_inner_frag_invalid_source_prefix_drop_count;
+ /*Unsupported Protocol Drop Count*/
+ u64 v6_to_v4_inner_frag_invalid_proto_type_drop_count;
+ /*Throttled Count*/
+ u64 v6_to_v4_inner_frag_throttled_count;
+ /*Timeout Drop Count*/
+ u64 v6_to_v4_inner_frag_timeout_drop_count;
+ /*Duplicates Drop Count*/
+ u64 v6_to_v4_inner_frag_dup_count;
+
+ /*ICMPv6 Generated for Error Count */
+ u64 v6_to_v4_inner_frag_icmp_error_gen_count;
+ /*ICMPv6 Packets Rate-Limited Count */
+ u64 v6_to_v4_inner_frag_icmp_throttled_drop_count;
+
+ /*TCP MSS Changed Count */
+ u64 v6_to_v4_inner_frag_tcp_mss_changed_count;
+
+} mape_show_statistics_summary_resp;
+
+/*
+ * The following are the command types for Generic Command cases
+ */
+#define CNAT_DEBUG_GENERIC_COMMAND_READ_MEM 1
+#define CNAT_DEBUG_GENERIC_COMMAND_WRITE_MEM 2
+#define CNAT_DEBUG_GENERIC_COMMAND_DB_SUMMARY 3
+#define CNAT_DEBUG_GENERIC_COMMAND_USER_DB_PM 4
+#define CNAT_DEBUG_GET_CGN_DB_SUMMARY 5
+
+typedef enum {
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_POLICY,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_MAIN_DB,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_USER_DB,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_HASHES_DB,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_VRF_MAP,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_SUMMARY_DB,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_STATS,
+ CNAT_DEBUG_GENERIC_COMMAND_CLEAR_STATS,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_NODE_COUNTER,
+ CNAT_DEBUG_GENERIC_COMMAND_CLEAR_NODE_COUNTER,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_CNAT_COUNTER,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_VA,
+ CNAT_DEBUG_GENERIC_COMMAND_SHOW_CONFIG,
+ CNAT_DEBUG_GENERIC_COMMAND_SHOW_NFV9,
+ CNAT_DEBUG_GENERIC_COMMAND_SHOW_IVRF,
+ CNAT_DEBUG_GENERIC_COMMAND_SHOW_OVRF,
+ CNAT_DEBUG_SPP_LOG,
+ CNAT_DEBUG_GENERIC_COMMAND_DEBUG_OPTIONS,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_DEBUG_LEVELS,
+ CNAT_DEBUG_GENERIC_COMMAND_DEBUG_FLAGS,
+ CNAT_READ_TEMP_SENSORS,
+ CNAT_BLOCK_OCTEON_SENSOR_READ,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_MAIN_DB_SUMMARY,
+ CNAT_DEBUG_GENERIC_COMMAND_DUMP_USER_DB_SUMMARY,
+ CNAT_DEBUG_DUMP_6RD_STATS,
+ CNAT_DEBUG_TIMEOUT_DB_SUMMARY,
+ CNAT_NAT64_STFUL_DEBUG_COMMAND,
+ CNAT_DEBUG_SET_BULK_SIZE,
+ CNAT_DEBUG_SHOW_BULK_STAT,
+ CNAT_DEBUG_CLEAR_BULK_STAT,
+ CNAT_DEBUG_SHOW_BULK_ALLOC,
+ CNAT_DEBUG_NAT64,
+ CNAT_DEBUG_NAT44_IN2OUT_FRAG_STATS,
+} cnat_debug_dump_type_t;
+
+typedef enum {
+ CNAT_DEBUG_FLAG_UDP_INSIDE_CHECKSUM_MODIFY,
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_CHECKSUM_MODIFY,
+ CNAT_DEBUG_FLAG_UDP_INSIDE_PACKET_DUMP,
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_PACKET_DUMP,
+} cnat_debug_flag_type_t;
+
+typedef struct {
+ u16 spp_msg_id;
+ u8 rc;
+ u8 core;
+ u32 num_bytes;
+ u8 raw_data[0];
+} cnat_generic_command_resp;
+
+extern u32 db_free_entry (void * p);
+#endif /*__CNAT_SHOW_RESPONSE_H__*/
diff --git a/vnet/vnet/vcgn/cnat_syslog.c b/vnet/vnet/vcgn/cnat_syslog.c
new file mode 100644
index 00000000000..9c69d4a260d
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_syslog.c
@@ -0,0 +1,1787 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_syslog.c
+ *
+ * Copyright (c) 2011-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <arpa/inet.h>
+#include "cnat_syslog.h"
+#include "platform_common.h"
+#include "cnat_db.h"
+#include "cnat_log_common.h"
+#include <vppinfra/pool.h>
+
+#define SYSLOG_DELIMITER ' '
+#define SYSLOG_FIELD_ABSENT '-'
+/* #define SHOW_SYSLOG_TIMESTAMP 1 TODO: Remove this later */
+/*
+ * Define the below macro here for now. The assumption is that syslog
+ * packets are sent out via the same channel as NFV9.
+ * Override this if the assumption does not hold.
+ */
+#define PLATFORM_SYSLOG_DISP_NODE_IDX PLATFORM_NFV9_DISP_NODE_IDX
+
+cnat_syslog_global_info_t cnat_syslog_global_info;
+cnat_syslog_logging_info_t *cnat_syslog_logging_info_pool;
+cnat_syslog_global_counters_t cnat_syslog_global_counter;
+extern u32 syslog_debug_flag;
+
+#define CNAT_SYSLOG_DEBUG_CODE 2
+
+#if CNAT_SYSLOG_DEBUG_CODE > 3
+#define SYSLOG_COND if(my_instance_number == 0)
+
+#define SYSLOG_DEBUG_PRINTF1(a) SYSLOG_COND printf(a);
+#define SYSLOG_DEBUG_PRINTF2(a, b) SYSLOG_COND printf(a, b);
+#define SYSLOG_DEBUG_PRINTF3(a, b, c) SYSLOG_COND printf(a, b, c);
+#define SYSLOG_DEBUG_PRINTF4(a, b, c, d) SYSLOG_COND printf(a, b, c, d);
+
+#else
+
+#define SYSLOG_DEBUG_PRINTF1(a)
+#define SYSLOG_DEBUG_PRINTF2(a, b)
+#define SYSLOG_DEBUG_PRINTF3(a, b, c)
+#define SYSLOG_DEBUG_PRINTF4(a, b, c, d)
+
+#endif
+
+
+void syslog_params_show(u32 logging_index)
+{
+ cnat_syslog_logging_info_t *log_info __attribute__((unused));
+ if(logging_index == EMPTY) {
+ PLATFORM_DEBUG_PRINT("\nSyslog logging not configured\n");
+ return;
+ }
+
+ log_info = cnat_syslog_logging_info_pool + logging_index;
+
+ PLATFORM_DEBUG_PRINT("\nSyslog parameters --\n");
+ PLATFORM_DEBUG_PRINT("IPV4 address: %x, port %d, max log size %d\n",
+ log_info->ipv4_address,
+ log_info->port, log_info->max_length_minus_max_record_size);
+ PLATFORM_DEBUG_PRINT("Host name: %s, priority %d",
+ log_info->header_hostname, log_info->header_priority);
+
+}
+
+/* Util function to copy a number as ASCII into a buf in a
+ * faster way (should be faster than sprintf)
+ */
+
+const unsigned char ascii_numbers[][3] =
+ { {'0', '0', '0'},
+ {'1', '0', '0'},
+ {'2', '0', '0'},
+ {'3', '0', '0'},
+ {'4', '0', '0'},
+ {'5', '0', '0'},
+ {'6', '0', '0'},
+ {'7', '0', '0'},
+ {'8', '0', '0'},
+ {'9', '0', '0'},
+ {'1', '0', '0'},
+ {'1', '1', '0'},
+ {'1', '2', '0'},
+ {'1', '3', '0'},
+ {'1', '4', '0'},
+ {'1', '5', '0'},
+ {'1', '6', '0'},
+ {'1', '7', '0'},
+ {'1', '8', '0'},
+ {'1', '9', '0'},
+ {'2', '0', '0'},
+ {'2', '1', '0'},
+ {'2', '2', '0'},
+ {'2', '3', '0'},
+ {'2', '4', '0'},
+ {'2', '5', '0'},
+ {'2', '6', '0'},
+ {'2', '7', '0'},
+ {'2', '8', '0'},
+ {'2', '9', '0'},
+ {'3', '0', '0'},
+ {'3', '1', '0'},
+ {'3', '2', '0'},
+ {'3', '3', '0'},
+ {'3', '4', '0'},
+ {'3', '5', '0'},
+ {'3', '6', '0'},
+ {'3', '7', '0'},
+ {'3', '8', '0'},
+ {'3', '9', '0'},
+ {'4', '0', '0'},
+ {'4', '1', '0'},
+ {'4', '2', '0'},
+ {'4', '3', '0'},
+ {'4', '4', '0'},
+ {'4', '5', '0'},
+ {'4', '6', '0'},
+ {'4', '7', '0'},
+ {'4', '8', '0'},
+ {'4', '9', '0'},
+ {'5', '0', '0'},
+ {'5', '1', '0'},
+ {'5', '2', '0'},
+ {'5', '3', '0'},
+ {'5', '4', '0'},
+ {'5', '5', '0'},
+ {'5', '6', '0'},
+ {'5', '7', '0'},
+ {'5', '8', '0'},
+ {'5', '9', '0'},
+ {'6', '0', '0'},
+ {'6', '1', '0'},
+ {'6', '2', '0'},
+ {'6', '3', '0'},
+ {'6', '4', '0'},
+ {'6', '5', '0'},
+ {'6', '6', '0'},
+ {'6', '7', '0'},
+ {'6', '8', '0'},
+ {'6', '9', '0'},
+ {'7', '0', '0'},
+ {'7', '1', '0'},
+ {'7', '2', '0'},
+ {'7', '3', '0'},
+ {'7', '4', '0'},
+ {'7', '5', '0'},
+ {'7', '6', '0'},
+ {'7', '7', '0'},
+ {'7', '8', '0'},
+ {'7', '9', '0'},
+ {'8', '0', '0'},
+ {'8', '1', '0'},
+ {'8', '2', '0'},
+ {'8', '3', '0'},
+ {'8', '4', '0'},
+ {'8', '5', '0'},
+ {'8', '6', '0'},
+ {'8', '7', '0'},
+ {'8', '8', '0'},
+ {'8', '9', '0'},
+ {'9', '0', '0'},
+ {'9', '1', '0'},
+ {'9', '2', '0'},
+ {'9', '3', '0'},
+ {'9', '4', '0'},
+ {'9', '5', '0'},
+ {'9', '6', '0'},
+ {'9', '7', '0'},
+ {'9', '8', '0'},
+ {'9', '9', '0'},
+ {'1', '0', '0'},
+ {'1', '0', '1'},
+ {'1', '0', '2'},
+ {'1', '0', '3'},
+ {'1', '0', '4'},
+ {'1', '0', '5'},
+ {'1', '0', '6'},
+ {'1', '0', '7'},
+ {'1', '0', '8'},
+ {'1', '0', '9'},
+ {'1', '1', '0'},
+ {'1', '1', '1'},
+ {'1', '1', '2'},
+ {'1', '1', '3'},
+ {'1', '1', '4'},
+ {'1', '1', '5'},
+ {'1', '1', '6'},
+ {'1', '1', '7'},
+ {'1', '1', '8'},
+ {'1', '1', '9'},
+ {'1', '2', '0'},
+ {'1', '2', '1'},
+ {'1', '2', '2'},
+ {'1', '2', '3'},
+ {'1', '2', '4'},
+ {'1', '2', '5'},
+ {'1', '2', '6'},
+ {'1', '2', '7'},
+ {'1', '2', '8'},
+ {'1', '2', '9'},
+ {'1', '3', '0'},
+ {'1', '3', '1'},
+ {'1', '3', '2'},
+ {'1', '3', '3'},
+ {'1', '3', '4'},
+ {'1', '3', '5'},
+ {'1', '3', '6'},
+ {'1', '3', '7'},
+ {'1', '3', '8'},
+ {'1', '3', '9'},
+ {'1', '4', '0'},
+ {'1', '4', '1'},
+ {'1', '4', '2'},
+ {'1', '4', '3'},
+ {'1', '4', '4'},
+ {'1', '4', '5'},
+ {'1', '4', '6'},
+ {'1', '4', '7'},
+ {'1', '4', '8'},
+ {'1', '4', '9'},
+ {'1', '5', '0'},
+ {'1', '5', '1'},
+ {'1', '5', '2'},
+ {'1', '5', '3'},
+ {'1', '5', '4'},
+ {'1', '5', '5'},
+ {'1', '5', '6'},
+ {'1', '5', '7'},
+ {'1', '5', '8'},
+ {'1', '5', '9'},
+ {'1', '6', '0'},
+ {'1', '6', '1'},
+ {'1', '6', '2'},
+ {'1', '6', '3'},
+ {'1', '6', '4'},
+ {'1', '6', '5'},
+ {'1', '6', '6'},
+ {'1', '6', '7'},
+ {'1', '6', '8'},
+ {'1', '6', '9'},
+ {'1', '7', '0'},
+ {'1', '7', '1'},
+ {'1', '7', '2'},
+ {'1', '7', '3'},
+ {'1', '7', '4'},
+ {'1', '7', '5'},
+ {'1', '7', '6'},
+ {'1', '7', '7'},
+ {'1', '7', '8'},
+ {'1', '7', '9'},
+ {'1', '8', '0'},
+ {'1', '8', '1'},
+ {'1', '8', '2'},
+ {'1', '8', '3'},
+ {'1', '8', '4'},
+ {'1', '8', '5'},
+ {'1', '8', '6'},
+ {'1', '8', '7'},
+ {'1', '8', '8'},
+ {'1', '8', '9'},
+ {'1', '9', '0'},
+ {'1', '9', '1'},
+ {'1', '9', '2'},
+ {'1', '9', '3'},
+ {'1', '9', '4'},
+ {'1', '9', '5'},
+ {'1', '9', '6'},
+ {'1', '9', '7'},
+ {'1', '9', '8'},
+ {'1', '9', '9'},
+ {'2', '0', '0'},
+ {'2', '0', '1'},
+ {'2', '0', '2'},
+ {'2', '0', '3'},
+ {'2', '0', '4'},
+ {'2', '0', '5'},
+ {'2', '0', '6'},
+ {'2', '0', '7'},
+ {'2', '0', '8'},
+ {'2', '0', '9'},
+ {'2', '1', '0'},
+ {'2', '1', '1'},
+ {'2', '1', '2'},
+ {'2', '1', '3'},
+ {'2', '1', '4'},
+ {'2', '1', '5'},
+ {'2', '1', '6'},
+ {'2', '1', '7'},
+ {'2', '1', '8'},
+ {'2', '1', '9'},
+ {'2', '2', '0'},
+ {'2', '2', '1'},
+ {'2', '2', '2'},
+ {'2', '2', '3'},
+ {'2', '2', '4'},
+ {'2', '2', '5'},
+ {'2', '2', '6'},
+ {'2', '2', '7'},
+ {'2', '2', '8'},
+ {'2', '2', '9'},
+ {'2', '3', '0'},
+ {'2', '3', '1'},
+ {'2', '3', '2'},
+ {'2', '3', '3'},
+ {'2', '3', '4'},
+ {'2', '3', '5'},
+ {'2', '3', '6'},
+ {'2', '3', '7'},
+ {'2', '3', '8'},
+ {'2', '3', '9'},
+ {'2', '4', '0'},
+ {'2', '4', '1'},
+ {'2', '4', '2'},
+ {'2', '4', '3'},
+ {'2', '4', '4'},
+ {'2', '4', '5'},
+ {'2', '4', '6'},
+ {'2', '4', '7'},
+ {'2', '4', '8'},
+ {'2', '4', '9'},
+ {'2', '5', '0'},
+ {'2', '5', '1'},
+ {'2', '5', '2'},
+ {'2', '5', '3'},
+ {'2', '5', '4'},
+ {'2', '5', '5'}
+ };
+
+inline static int
+byte_to_ascii_decimal_unaligned(
+ unsigned char *ptr, unsigned char num)
+{
+ *ptr++ = ascii_numbers[num][0];
+ if(PREDICT_FALSE(num < 10)) {
+ return 1;
+ }
+ *ptr++ = ascii_numbers[num][1];
+ if(PREDICT_FALSE(num < 100)) {
+ return 2;
+ }
+ *ptr++ = ascii_numbers[num][2];
+ return 3;
+}
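+
+/*
+ * Illustrative examples: byte_to_ascii_decimal_unaligned(ptr, 7)
+ * writes '7' and returns 1; for 215 it writes '2','1','5' and
+ * returns 3. The 256-entry lookup table above trades ~768 bytes of
+ * read-only data for avoiding division/modulo on the logging path.
+ * Note that the output is not NUL-terminated.
+ */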
+
+/* Copies the dotted decimal format of ipv4
+ * in to the space provided and
+ * returns the number of bytes copied
+ */
+inline static int
+copy_ipv4_addr(unsigned char *ptr, u32 ipv4)
+{
+ unsigned char *temp = ptr;
+ temp += byte_to_ascii_decimal_unaligned(temp, (ipv4 >> 24));
+ *temp++ = '.';
+ temp += byte_to_ascii_decimal_unaligned(temp, ((ipv4 >> 16) & 0xFF));
+ *temp++ = '.';
+ temp += byte_to_ascii_decimal_unaligned(temp, ((ipv4 >> 8) & 0xFF));
+ *temp++ = '.';
+ temp += byte_to_ascii_decimal_unaligned(temp, (ipv4 & 0xFF));
+
+ return (temp - ptr);
+}
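+
+/*
+ * Illustrative example: for ipv4 = 0xC0A80101, copy_ipv4_addr()
+ * emits "192.168.1.1" (11 bytes, no NUL terminator) and returns 11.
+ */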
+
+#ifdef TOBE_PORTED
+/*
+ * edt: * * cnat_syslog_fill_ip_header
+ *
+ * Tries to fill the fields of the IP header before it
+ * is sent to the L3 infra node.
+ *
+ * Argument: cnat_syslog_logging_info_t *logging_info
+ * structure that contains the packet context
+ */
+inline
+void cnat_syslog_fill_ip_header (cnat_syslog_logging_info_t *logging_info)
+{
+ spp_ctx_t *ctx;
+
+ /*
+ * Fill in the IP header and port number of the syslog collector
+ * The L3 Infra node will fill in the rest of the fields
+ */
+ ctx = logging_info->current_logging_context;
+ fill_ip_n_udp_hdr(ctx, logging_info->ipv4_address,
+ logging_info->port, logging_info->pkt_length);
+
+}
+#else
+inline
+void cnat_syslog_fill_ip_header (cnat_syslog_logging_info_t *logging_info)
+{
+ return;
+}
+#endif
+
+#ifndef TOBE_PORTED
+void cnat_syslog_logging_init()
+{
+ return;
+}
+
+void cnat_syslog_log_mapping_create(cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap)
+{
+ return;
+}
+
+void cnat_syslog_log_mapping_delete(cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap)
+{
+ return;
+}
+
+void cnat_syslog_ds_lite_port_limit_exceeded(
+ dslite_key_t * key,
+ dslite_table_entry_t *dslite_entry)
+{
+ return;
+}
+
+void cnat_syslog_nat44_mapping_create(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap, cnat_session_entry_t * sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ return;
+}
+
+/* The following are in cnat_util.c, which is not yet ported */
+/* This function is defined in cnat_util.c and needs to be ported */
+cnat_icmp_msg_t icmp_msg_gen_allowed ()
+{
+ return 1;
+}
+
+void cnat_syslog_nat44_mapping_delete(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap, cnat_session_entry_t *sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ return;
+}
+
+u32
+cnat_get_unix_time_in_seconds (void)
+{
+ return 0;
+}
+#else /* TOBE_PORTED */
+void
+cnat_syslog_dump_logging_context (u32 value1,
+ cnat_syslog_logging_info_t *logging_info,
+ u32 value2)
+{
+ u8 *pkt_ptr;
+ u32 i;
+
+ if (PREDICT_TRUE(syslog_debug_flag == 0)) {
+ return;
+ }
+ /*
+ * Restrict the logging to a few cores, to enable easier debugging
+ */
+ if ((my_instance_number & 0x7) != 0) {
+ return;
+ }
+ printf("\nDumping %s packet at locn %d: time 0x%x",
+ (value2 == 1) ? "CURRENT" : "QUEUED",
+ value1,
+ cnat_get_unix_time_in_seconds());
+
+ printf("\ni_vrf 0x%x, ip_address 0x%x, port %d, pkt len %d",
+ 0 /* TO DP Add vrf like nfv9_logging_info->i_vrf */,
+ logging_info->ipv4_address,
+ logging_info->port,
+ logging_info->pkt_length);
+ printf("\n");
+
+ if (value2 == 1) {
+ pkt_ptr = logging_info->current_logging_context->packet_data;
+ } else {
+ pkt_ptr = logging_info->queued_logging_context->packet_data;
+ }
+
+ /*
+ * Dump along with 8 bytes of SHIM header
+ */
+ for (i = 0; i <
+ (logging_info->pkt_length + CNAT_NFV9_IP_HDR_OFFSET);
+ i = i + 1) {
+ u8 c1, c2, c3;
+
+ if (i == 0) {
+ printf("\nL2_HEADER + SHIM_HEADER: \n");
+ } else if (i == CNAT_NFV9_IP_HDR_OFFSET) {
+ printf("\nIP_HEADER: \n");
+ } else if (i == CNAT_NFV9_UDP_HDR_OFFSET) {
+ printf("\nUDP_HEADER: \n");
+ } else if (i == CNAT_NFV9_HDR_OFFSET) {
+ printf("\nSyslog content..\n");
+ while(i <
+ (logging_info->pkt_length + CNAT_NFV9_HDR_OFFSET)) {
+ printf("%c", (u8)(*(pkt_ptr + i)));
+ i++;
+ if((u8)(*(pkt_ptr + i)) == '[') /* new record begins */
+ printf("\n");
+ }
+ return;
+ }
+
+ c3 = *(pkt_ptr + i);
+ c2 = c3 & 0xf;
+ c1 = (c3 >> 4) & 0xf;
+
+ printf("%c%c ",
+ ((c1 <= 9) ? (c1 + '0') : (c1 - 10 + 'a')),
+ ((c2 <= 9) ? (c2 + '0') : (c2 - 10 + 'a')));
+
+ }
+
+ printf("\n");
+}
+
+
+/*
+ * edt: * * cnat_syslog_send_pkt
+ *
+ * Tries to send a logging pkt. If the packet cannot be sent
+ * because the rewrite_output node cannot process it, queue
+ * it temporarily and try to send it later.
+ *
+ * Argument: cnat_syslog_logging_info_t *logging_info
+ * structure that contains the packet context
+ */
+inline
+void cnat_syslog_send_pkt (cnat_syslog_logging_info_t *logging_info)
+{
+ spp_node_t *output_node;
+
+ cnat_syslog_fill_ip_header(logging_info);
+
+ output_node = spp_get_nodes() +
+ cnat_syslog_global_info.cnat_syslog_disp_node_index;
+
+ cnat_syslog_dump_logging_context (2, logging_info, 1);
+
+ if (PREDICT_TRUE(output_node->sf.nused < SPP_MAXDISPATCH)) {
+ /*
+ * Move the logging context to output node
+ */
+ logging_info->current_logging_context->current_length =
+ logging_info->pkt_length;
+ PLATFORM_SET_CTX_RU_TX_FROM_NODE(logging_info->current_logging_context, \
+ NODE_LOGGING);
+ spp_dispatch_make_node_runnable(output_node);
+ output_node->sf.ctxs[output_node->sf.nused++] =
+ logging_info->current_logging_context;
+
+ if(PREDICT_FALSE(syslog_debug_flag > 10))
+ printf("\nSyslog: 2. Sending Current packet\n");
+ } else {
+ /*
+ * Queue the context in the logging_info structure;
+ * we will try to send it later. Currently, we
+ * restrict ourselves to a single queued context.
+ */
+ cnat_syslog_global_counter.downstream_constipation_count++;
+ if(PREDICT_FALSE(syslog_debug_flag > 10))
+ printf("\nSyslog: 2. Downstream congestion \n");
+
+ /*
+ * Attach the current logging context which is full to the
+ * queued context list in logging_info structure
+ */
+ logging_info->queued_logging_context =
+ logging_info->current_logging_context;
+ }
+
+ /*
+ * Whether the context is queued or not, set the current context
+ * to NULL, as the earlier context can no longer be used to send
+ * more logging records.
+ */
+ logging_info->current_logging_context = NULL;
+}
+
+
+/*
+ * edt: * * cnat_syslog_send_queued_pkt
+ *
+ * Tries to send a logging pkt that has been queued earlier
+ * because it could not be sent due to downstream constipation
+ *
+ * Argument: cnat_syslog_logging_info_t *logging_info
+ * structure that contains the packet context
+ */
+inline
+void cnat_syslog_send_queued_pkt (cnat_syslog_logging_info_t *logging_info)
+{
+ spp_node_t *output_node;
+
+ output_node = spp_get_nodes() +
+ cnat_syslog_global_info.cnat_syslog_disp_node_index;
+
+ cnat_syslog_dump_logging_context(1, logging_info, 2);
+
+ if(PREDICT_TRUE(output_node->sf.nused < SPP_MAXDISPATCH)) {
+ /*
+ * Move the logging context to output node
+ */
+ /** This looks like a bug to me .. need to confirm *****
+ logging_info->queued_logging_context->current_length =
+ nfv9_logging_info->pkt_length; ***/
+ PLATFORM_SET_CTX_RU_TX_FROM_NODE(logging_info->queued_logging_context,
+ NODE_LOGGING)
+ spp_dispatch_make_node_runnable(output_node);
+ output_node->sf.ctxs[output_node->sf.nused++] =
+ logging_info->queued_logging_context;
+
+ SYSLOG_DEBUG_PRINTF1("\nSYSLOG: 1. Sending Queued packet\n")
+
+ /*
+ * Context has been queued, it will be freed after the pkt
+ * is sent. Clear this from the logging_context_info structure
+ */
+ logging_info->queued_logging_context = NULL;
+
+ } else {
+ cnat_syslog_global_counter.downstream_constipation_count++;
+ }
+}
+
+/*
+ * edt: * * handle_pending_syslog_pkts
+ *
+ * Timer handler for sending any pending syslog record
+ *
+ */
+inline
+void handle_pending_syslog_pkts()
+{
+ spp_node_t *output_node;
+ cnat_syslog_logging_info_t *my_logging_info = 0;
+ u32 current_timestamp = cnat_get_sys_up_time_in_ms();
+ i16 sf_nused;
+
+ output_node = spp_get_nodes() +
+ cnat_syslog_global_info.cnat_syslog_disp_node_index;
+
+ sf_nused = output_node->sf.nused;
+
+ pool_foreach (my_logging_info, cnat_syslog_logging_info_pool, ({
+ /*
+ * Check if no more logging contexts can be queued
+ */
+ if (PREDICT_FALSE(sf_nused >= SPP_MAXDISPATCH)) {
+ break;
+ }
+ if (my_logging_info->queued_logging_context)
+ cnat_syslog_send_queued_pkt (my_logging_info);
+
+ if(my_logging_info->current_logging_context &&
+ ((current_timestamp -
+ my_logging_info->current_logging_context_timestamp)
+ > 1000)) {
+ /*
+ * If there is a current logging context and timestamp
+ * indicates it is pending for long, send it out
+ * Also if there is a queued context send it out as well
+ */
+ SYSLOG_DEBUG_PRINTF4("\nLOG_TIMER: queued %p, curr %p, sf_nused %d",
+ my_logging_info->queued_logging_context,
+ my_logging_info->current_logging_context,
+ sf_nused);
+ cnat_syslog_send_pkt(my_logging_info);
+ }
+ }));
+}
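+
+/*
+ * Note: the timer path above first retries any queued context, then
+ * flushes a current context that has been pending for more than a
+ * second (1000 ms), bounded by the SPP_MAXDISPATCH slots available
+ * in the output node.
+ */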
+
+const unsigned char hex_numbers_single_digit[] =
+ { '0', '1', '2', '3', '4', '5', '6', '7', '8',
+ '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+inline static int u16_to_ascii_decimal_aligned(
+ unsigned char *ptr, u16 num, u16 min_digits)
+{
+ /* The logic below is replicated in
+ * function u16_to_ascii_decimal_unaligned
+ * except for the use of min_digits.
+ * Replication is done to optimize run time.
+ * If you fix a bug here, check u16_to_ascii_decimal_unaligned
+ * as well (and vice versa).
+ */
+ unsigned char *temp = ptr;
+ int no_leading_zeros = 0;
+
+ if(num > 9999 || min_digits == 5) {
+ *temp++ = hex_numbers_single_digit[num/10000];
+ num = num%10000;
+ no_leading_zeros = 1;
+ }
+
+ if(no_leading_zeros || num > 999 || min_digits == 4) {
+ *temp++ = hex_numbers_single_digit[num/1000];
+ num = num%1000;
+ no_leading_zeros = 1;
+ }
+
+ if(no_leading_zeros || num > 99 || min_digits == 3) {
+ *temp++ = hex_numbers_single_digit[num/100];
+ num = num%100;
+ no_leading_zeros = 1;
+ }
+
+ if(no_leading_zeros || num > 9 || min_digits == 2) {
+ *temp++ = hex_numbers_single_digit[num/10];
+ num = num%10;
+ }
+
+ *temp++ = hex_numbers_single_digit[num];
+
+ return temp-ptr;
+}
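+
+/*
+ * Illustrative examples: u16_to_ascii_decimal_aligned(ptr, 7, 2)
+ * emits "07" (zero-padded up to min_digits), while
+ * u16_to_ascii_decimal_aligned(ptr, 215, 2) emits "215".
+ */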
+
+inline static int u16_to_ascii_decimal_unaligned(
+ unsigned char *ptr, u16 num)
+{
+ /*
+ * return u16_to_ascii_decimal_aligned(ptr, num, 0);
+ * should do the job.. however, to optimize the run time
+ * the code of u16_to_ascii_decimal_aligned is being
+ * repeated here without the use of min_digits.
+ * If you fix a bug here, please check
+ * u16_to_ascii_decimal_aligned as well (and vice versa)
+ */
+ unsigned char *temp = ptr;
+ int no_leading_zeros = 0;
+
+ if(num > 9999) {
+ *temp++ = hex_numbers_single_digit[num/10000];
+ num = num%10000;
+ no_leading_zeros = 1;
+ }
+
+ if(no_leading_zeros || num > 999) {
+ *temp++ = hex_numbers_single_digit[num/1000];
+ num = num%1000;
+ no_leading_zeros = 1;
+ }
+
+ if(no_leading_zeros || num > 99) {
+ *temp++ = hex_numbers_single_digit[num/100];
+ num = num%100;
+ no_leading_zeros = 1;
+ }
+
+ if(no_leading_zeros || num > 9) {
+ *temp++ = hex_numbers_single_digit[num/10];
+ num = num%10;
+ }
+
+ *temp++ = hex_numbers_single_digit[num];
+
+ return temp-ptr;
+}
+
+static int syslog_get_timestamp(unsigned char *ts)
+{
+ static const char *months[] = {"Jan ", "Feb ", "Mar ", "Apr ", "May ",
+ "Jun ", "Jul ", "Aug ", "Sep ", "Oct ", "Nov ", "Dec " };
+
+ unsigned char *temp = ts;
+ /* Inserts the time stamp in syslog format and returns the length;
+ * assumes that ts has sufficient space
+ */
+ /* China Telecom requires the time stamp to be
+ * in the format '2011 Jun 7 12:34:08'
+ */
+ time_t time = (time_t)cnat_get_unix_time_in_seconds();
+ struct tm tm1;
+
+ gmtime_r(&time, &tm1);
+ /* Now put the pieces together */
+ /* Year */
+ ts += u16_to_ascii_decimal_unaligned(ts, (tm1.tm_year + 1900));
+ *ts++ = SYSLOG_DELIMITER;
+ /* Month */
+ memcpy(ts, months[tm1.tm_mon], 4);
+ ts += 4; /* DELIMITER taken care */
+ /* day */
+ ts += u16_to_ascii_decimal_unaligned(ts, tm1.tm_mday);
+ *ts++ = SYSLOG_DELIMITER;
+ /* hours */
+ ts += u16_to_ascii_decimal_aligned(ts, tm1.tm_hour, 2);
+ *ts++ = ':';
+ /* minutes */
+ ts += u16_to_ascii_decimal_aligned(ts, tm1.tm_min, 2);
+ *ts++ = ':';
+ /* seconds */
+ ts += u16_to_ascii_decimal_aligned(ts, tm1.tm_sec, 2);
+ return ts - temp;
+}
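+
+/*
+ * Illustrative output: "2011 Jun 7 12:34:08" -- year and day are not
+ * zero-padded, hours/minutes/seconds are padded to two digits, so the
+ * returned length varies accordingly (19 or 20 bytes).
+ */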
+
+/* Ensure that the order of the below array matches with
+ * syslog_service_type enum
+ */
+static char *syslog_service_string[] = { "NAT44", "DSLITE" };
+
+/* Ensure that the order of below array matches with
+ * syslog_event_type_t enum
+ */
+typedef struct {
+ char *event_name;
+ int name_length;
+} syslog_event_description_type;
+
+const static syslog_event_description_type sys_log_event[] = {
+ { "UserbasedA", 10 }, /* yes, 10 is strlen of "UserbasedA" */
+ { "UserbasedW", 10 },
+ { "SessionbasedA", 13 },
+ { "SessionbasedW", 13 },
+ { "SessionbasedAD", 14 },
+ { "SessionbasedWD", 14 },
+ { "Portblockrunout", 15 },
+ { "TCPseqmismatch", 14},
+ { "Invalid", 7 }
+};
+
+inline static int syslog_fill_header(const cnat_syslog_logging_info_t *log_info,
+ syslog_service_type_t s_type)
+{
+ /* Forms the syslog header and returns the length
+ * Assumes that header has sufficient space
+ */
+
+ /* Sample header (as agreed for China Telecom requirements --
+ * <134> 1 2011 May 31 10:30:45 192.168.2.3 - - NAT44 -
+ */
+
+ unsigned char *temp, *header;
+ int count;
+ temp = header = (unsigned char *)
+ &(log_info->current_logging_context->packet_data[CNAT_NFV9_HDR_OFFSET]);
+ *temp++ = '<';
+ temp += byte_to_ascii_decimal_unaligned(temp,
+ log_info->header_priority);
+ *temp++ = '>';
+ *temp++ = SYSLOG_DELIMITER;
+ *temp++ = '1'; /* Syslog version -- always set to 1 */
+ *temp++ = SYSLOG_DELIMITER;
+ temp += syslog_get_timestamp(temp);
+ *temp++ = SYSLOG_DELIMITER;
+ count = strlen(log_info->header_hostname);
+ memcpy(temp, log_info->header_hostname, count);
+ temp += count;
+ *temp++ = SYSLOG_DELIMITER;
+ *temp++ = SYSLOG_FIELD_ABSENT; /* App name - nil value */
+ *temp++ = SYSLOG_DELIMITER;
+ *temp++ = SYSLOG_FIELD_ABSENT; /* Proc ID - nil value for now */
+ *temp++ = SYSLOG_DELIMITER;
+ /* Now the msg id */
+ count = strlen(syslog_service_string[s_type]);
+ memcpy(temp, syslog_service_string[s_type], count);
+ temp += count;
+ *temp++ = SYSLOG_DELIMITER;
+ *temp++ = SYSLOG_FIELD_ABSENT; /* No structured elements */
+ *temp++ = SYSLOG_DELIMITER;
+#ifdef SHOW_SYSLOG_TIMESTAMP
+ printf("\nSysLog TS: %s : Length %d", header, temp - header);
+#endif /* SHOW_SYSLOG_TIMESTAMP */
+ return temp-header;
+}
+
+extern void cnat_logging_init();
+
+/* one time call at the beginning */
+void cnat_syslog_logging_init()
+{
+ if(PREDICT_TRUE(cnat_syslog_global_info.cnat_syslog_init_done))
+ return; /* Already done */
+
+ cnat_logging_init();
+ cnat_syslog_global_info.cnat_syslog_disp_node_index =
+ spp_lookup_node_index(PLATFORM_SYSLOG_DISP_NODE_IDX);
+ ASSERT(cnat_syslog_global_info.cnat_syslog_disp_node_index != (u16)~0);
+
+ cnat_syslog_global_info.cnat_syslog_init_done = 1;
+}
+
+/*
+ * edt: * * cnat_syslog_create_logging_context
+ *
+ * Tries to create a logging context with packet buffer
+ * to send a new logging packet
+ *
+ * Argument: cnat_syslog_logging_info_t *logging_info
+ * structure that contains the logging info and will store
+ * the packet context as well.
+ */
+inline
+void cnat_syslog_create_logging_context (
+ cnat_syslog_logging_info_t *logging_info,
+ syslog_service_type_t s_type)
+{
+ spp_ctx_t *ctx;
+
+ /*
+ * If queued_logging_context is non-NULL, we already have a logging
+ * packet queued to be sent. First try sending this before allocating
+ * a new context. We can have only one active packet context per
+ * logging_info structure
+ */
+
+ if (PREDICT_FALSE(logging_info->queued_logging_context != NULL)) {
+ cnat_syslog_send_queued_pkt(logging_info);
+ /*
+ * If we cannot still send the queued pkt, just return
+ * Downstream Constipation count would have increased anyway
+ */
+ if (logging_info->queued_logging_context != NULL) {
+ cnat_syslog_global_counter.logging_context_creation_deferred_count++;
+ return;
+ }
+ }
+
+ /*
+ * If no context can be allocated, return silently;
+ * the calling routine will handle updating the error counters
+ */
+ if (spp_ctx_alloc(&ctx, 1) < 1) {
+ cnat_syslog_global_counter.logging_context_creation_fail_count++;
+ SYSLOG_DEBUG_PRINTF1("\nCould not allocate ctx for syslog");
+ return;
+ }
+
+ // Allocate packet buffer (used for AVSM currently)
+ PLATFORM_ALLOC_NFV9_PKT_BUFFER(ctx, 0);
+
+ logging_info->current_logging_context = ctx;
+
+ PLATFORM_SET_CTX_RU_TX_FROM_NODE(ctx, NODE_LOGGING);
+
+ ctx->flags = SPP_CTX_END_OF_PACKET;
+ ctx->next_ctx_this_packet = (spp_ctx_t*) SPP_CTX_NO_NEXT_CTX;
+ ctx->current_header = &ctx->packet_data[CNAT_NFV9_HDR_OFFSET];
+
+ logging_info->pkt_length = syslog_fill_header(logging_info, s_type);
+ logging_info->pkt_length += (CNAT_NFV9_HDR_OFFSET -
+ CNAT_NFV9_IP_HDR_OFFSET);
+ logging_info->current_logging_context_timestamp =
+ cnat_get_sys_up_time_in_ms();
+
+}
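+
+/*
+ * Note: pkt_length at this point covers the syslog header plus the
+ * IP/UDP header room (CNAT_NFV9_HDR_OFFSET - CNAT_NFV9_IP_HDR_OFFSET);
+ * the record-insert routines below append their records at
+ * packet_data[CNAT_NFV9_IP_HDR_OFFSET + pkt_length].
+ */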
+
+inline static int u16_to_ascii_hex_unaligned(
+ unsigned char *ptr, u16 num)
+{
+ unsigned char nibble, *temp;
+ int no_leading_zeros = 0;
+ temp = ptr;
+ nibble = (num >> 12);
+ if(nibble) {
+ *temp++ = hex_numbers_single_digit[nibble];
+ no_leading_zeros = 1;
+ }
+
+ nibble = (num >> 8) & 0xF;
+ if(nibble || no_leading_zeros) {
+ *temp++ = hex_numbers_single_digit[nibble];
+ no_leading_zeros = 1;
+ }
+
+ nibble = (num >> 4) & 0xF;
+ if(nibble || no_leading_zeros) {
+ *temp++ = hex_numbers_single_digit[nibble];
+ }
+
+ *temp++ = hex_numbers_single_digit[num & 0xF];
+
+ return temp-ptr;
+}
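+
+/*
+ * Illustrative examples: 0x0db8 emits "db8" (the leading zero nibble
+ * is suppressed) and 0 emits "0"; as with the decimal helpers above,
+ * the output is not NUL-terminated.
+ */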
+
+inline static int ipv6_int_2_str(u32 ipv6[], unsigned char *ipv6_str)
+{
+/* DC stands for Double Colon.
+ * Refer http://tools.ietf.org/html/rfc5952 for
+ * more details on the text representation of
+ * IPv6 addresses
+ */
+#define DC_NOT_USED_YET 0
+#define DC_IN_USE 1 /* Zeros are skipped */
+#define DC_ALREADY_USED 2 /* Cannot skip zeros anymore */
+ int i;
+ u16 *ipv6_temp = (u16 *)ipv6;
+ unsigned char *temp = ipv6_str;
+ int double_colon = DC_NOT_USED_YET;
+ for(i = 0; i < 7; i++) {
+ if(ipv6_temp[i]) {
+ ipv6_str += u16_to_ascii_hex_unaligned(ipv6_str, ipv6_temp[i]);
+ *ipv6_str++ = ':';
+ if(double_colon == DC_IN_USE) { /* Cannot use DC anymore */
+ double_colon = DC_ALREADY_USED;
+ }
+ } else {
+ if(double_colon == DC_IN_USE) {
+ /* Skip this zero as well */
+ continue;
+ } else if((ipv6_temp[i+1])
+ /* DC makes sense if there is more than one contiguous zero */
+ || (double_colon != DC_NOT_USED_YET)) {
+ ipv6_str += u16_to_ascii_hex_unaligned(ipv6_str,
+ ipv6_temp[i]);
+ *ipv6_str++ = ':';
+ } else { /* Start using DC */
+ *ipv6_str++ = ':'; /* The 2nd colon */
+ double_colon = DC_IN_USE;
+ }
+ }
+ }
+ if(ipv6_temp[7]) {
+ ipv6_str += u16_to_ascii_hex_unaligned(ipv6_str, ipv6_temp[7]);
+ } else if(double_colon != DC_IN_USE) {
+ *ipv6_str++ = '0';
+ }
+ *ipv6_str = 0;
+
+ return ipv6_str - temp;
+}
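+
+/*
+ * Illustrative example: for the address 2001:db8::1 (u16 words
+ * {0x2001, 0x0db8, 0, 0, 0, 0, 0, 1}) the function emits
+ * "2001:db8::1"; unlike the helpers above, it NUL-terminates the
+ * string and returns the length excluding the NUL.
+ */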
+
+/* insert syslog record for nat44 */
+
+void cnat_syslog_insert_nat44_record(
+ cnat_syslog_logging_info_t *log_info,
+ cnat_main_db_entry_t *db, cnat_vrfmap_t *vrfmap,
+ cnat_session_entry_t *sdb, int bulk_alloc, syslog_event_type_t e_type)
+{
+ /* This record should look like this -
+ * [EventName <L4> <Original Source IP> <Inside VRF Name>
+ * <Original Source IPv6> <Translated Source IP> <Original Port>
+ * <Translated First Source Port> <Translated Last Source Port>
+ * <Destination ip address> <destination port>]
+ */
+ u32 original_source = db->in2out_key.k.ipv4;
+ u32 translated_ip = db->out2in_key.k.ipv4;
+ cnat_user_db_entry_t *udb = cnat_user_db + db->user_index;
+ unsigned char *temp, *record;
+ u32 network_order_ipv6[4];
+
+ SYSLOG_CONFIG_DEBUG_PRINTF(4,"In Function %s\n", __func__);
+ temp = record = &(log_info->current_logging_context->packet_data[
+ CNAT_NFV9_IP_HDR_OFFSET + log_info->pkt_length]);
+
+ if (PREDICT_FALSE(!udb)) {
+ SYSLOG_DEBUG_PRINTF1("\nnull udb!");
+ return;
+ }
+
+ /* Now we point to the location where record needs to be inserted */
+ *record++ = '['; /* Open the record */
+
+ /* Copy the record type */
+ memcpy(record, sys_log_event[e_type].event_name,
+ sys_log_event[e_type].name_length);
+ record += sys_log_event[e_type].name_length;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Protocol type */
+ if(PREDICT_FALSE(
+ e_type == sessionbased_assign || e_type == sessionbased_withdraw ||
+ e_type == sessionbased_assignD || e_type == sessionbased_withdrawD)) {
+ u16 my_proto_mask;
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+ if(PREDICT_TRUE(my_proto_mask == CNAT_TCP)) {
+ *record++ = '6';
+ } else if(PREDICT_TRUE(my_proto_mask == CNAT_UDP)) {
+ *record++ = '1';
+ *record++ = '7';
+ } else if(PREDICT_TRUE(my_proto_mask == CNAT_ICMP)) {
+ *record++ = '1';
+ } else { /* Default, assume GRE (for PPTP) */
+ *record++ = '4';
+ *record++ = '7';
+ }
+ } else {
+ *record++ = SYSLOG_FIELD_ABSENT;
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original Source IP */
+ record += copy_ipv4_addr(record, original_source);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* copy configured VRF NAME */
+ memcpy(record, log_info->vrf_name, log_info->vrf_name_len);
+ record += log_info->vrf_name_len;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* No IPV6 source address for nat44 */
+ *record++ = SYSLOG_FIELD_ABSENT;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the translated IP address */
+ record += copy_ipv4_addr(record, translated_ip);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original port */
+ if(e_type == sessionbased_assign || e_type == sessionbased_withdraw ||
+ e_type == sessionbased_assignD || e_type == sessionbased_withdrawD) {
+ record += u16_to_ascii_decimal_unaligned(
+ record, db->in2out_key.k.port);
+ } else {
+ *record++ = SYSLOG_FIELD_ABSENT;
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the start outside port */
+ record += u16_to_ascii_decimal_unaligned(record, bulk_alloc);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the last outside port */
+ if(e_type == userbased_assign || e_type == userbased_withdraw) {
+ record += u16_to_ascii_decimal_unaligned(record,
+ (bulk_alloc + BULKSIZE_FROM_VRFMAP(vrfmap) - 1));
+ } else {
+ *record++ = SYSLOG_FIELD_ABSENT;
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy destination ip and port in case for DBL*/
+ if(PREDICT_FALSE(e_type == sessionbased_assignD || e_type == sessionbased_withdrawD)) {
+ if(PREDICT_TRUE(sdb == NULL)) {
+ record += copy_ipv4_addr(record,db->dst_ipv4);
+ *record++ = SYSLOG_DELIMITER;
+ record += u16_to_ascii_decimal_unaligned(record, db->dst_port);
+ } else {
+ record += copy_ipv4_addr(record, sdb->v4_dest_key.k.ipv4);
+ *record++ = SYSLOG_DELIMITER;
+ record += u16_to_ascii_decimal_unaligned(record, sdb->v4_dest_key.k.port);
+ }
+ } else {
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+ *record++ = '-';
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = ']'; /* End of the record */
+
+ log_info->pkt_length += record - temp;
+}
+
+void cnat_syslog_insert_record(
+ cnat_syslog_logging_info_t *log_info,
+ cnat_main_db_entry_t *db, dslite_table_entry_t *dslite_entry,
+ cnat_session_entry_t *sdb, int bulk_alloc, syslog_event_type_t e_type)
+{
+ /* This record should look like this -
+ * [EventName <L4> <Original Source IP> <Inside VRF Name>
+ * <Original Source IPv6> <Translated Source IP> <Original Port>
+ * <Translated First Source Port> <Translated Last Source Port>
+ * <Destination ip address> <destination port>]
+ */
+ u32 original_source = db->in2out_key.k.ipv4;
+ u32 translated_ip = db->out2in_key.k.ipv4;
+ cnat_user_db_entry_t *udb = cnat_user_db + db->user_index;
+ unsigned char *temp, *record;
+ u32 network_order_ipv6[4];
+
+ temp = record = &(log_info->current_logging_context->packet_data[
+ CNAT_NFV9_IP_HDR_OFFSET + log_info->pkt_length]);
+
+ if (PREDICT_FALSE(!udb)) {
+ SYSLOG_DEBUG_PRINTF1("\nnull udb!");
+ return;
+ }
+
+ /* Now we point to the location where record needs to be inserted */
+ *record++ = '['; /* Open the record */
+
+ /* Copy the record type */
+ memcpy(record, sys_log_event[e_type].event_name,
+ sys_log_event[e_type].name_length);
+ record += sys_log_event[e_type].name_length;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Protocol type */
+ if(PREDICT_FALSE(
+ e_type == sessionbased_assign || e_type == sessionbased_withdraw ||
+ e_type == sessionbased_assignD || e_type == sessionbased_withdrawD)) {
+ u16 my_proto_mask;
+ my_proto_mask = db->in2out_key.k.vrf & CNAT_PRO_MASK;
+ if(PREDICT_TRUE(my_proto_mask == CNAT_TCP)) {
+ *record++ = '6';
+ } else if(PREDICT_TRUE(my_proto_mask == CNAT_UDP)) {
+ *record++ = '1';
+ *record++ = '7';
+ } else {
+ *record++ = '1';
+ }
+ } else {
+ *record++ = SYSLOG_FIELD_ABSENT;
+ }
+
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original Source IP */
+#ifdef DSLITE_USER_IPV4
+ record += copy_ipv4_addr(record, original_source);
+#else
+ /*
+ * Do not include inside ipv4 address for B4 element level port limiting
+ */
+ *record++ = SYSLOG_FIELD_ABSENT;
+#endif
+ *record++ = SYSLOG_DELIMITER;
+
+ /* copy configured VRF NAME */
+ memcpy(record, log_info->vrf_name, log_info->vrf_name_len);
+ record += log_info->vrf_name_len;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the IPV6 source address */
+ /* CSCtt16960 Fix. */
+ network_order_ipv6[0] = htonl(udb->ipv6[0]);
+ network_order_ipv6[1] = htonl(udb->ipv6[1]);
+ network_order_ipv6[2] = htonl(udb->ipv6[2]);
+ network_order_ipv6[3] = htonl(udb->ipv6[3]);
+
+ inet_ntop(AF_INET6,network_order_ipv6,record,INET6_ADDRSTRLEN);
+ record += strlen(record);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the translated IP address */
+ record += copy_ipv4_addr(record, translated_ip);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original port */
+ if(e_type == sessionbased_assign || e_type == sessionbased_withdraw ||
+ e_type == sessionbased_assignD || e_type == sessionbased_withdrawD) {
+ record += u16_to_ascii_decimal_unaligned(
+ record, db->in2out_key.k.port);
+ } else {
+ *record++ = SYSLOG_FIELD_ABSENT;
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the start outside port */
+ record += u16_to_ascii_decimal_unaligned(record, bulk_alloc);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the last outside port */
+ if(e_type == userbased_assign || e_type == userbased_withdraw) {
+ record += u16_to_ascii_decimal_unaligned(record,
+ (bulk_alloc + BULKSIZE_FROM_VRFMAP(dslite_entry) - 1));
+ } else {
+ *record++ = SYSLOG_FIELD_ABSENT;
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ if(PREDICT_FALSE(e_type == sessionbased_assignD || e_type == sessionbased_withdrawD)) {
+ if(sdb == NULL) {
+ record += copy_ipv4_addr(record, db->dst_ipv4);
+ *record++ = SYSLOG_DELIMITER;
+ record += u16_to_ascii_decimal_unaligned(record, db->dst_port);
+ } else {
+ record += copy_ipv4_addr(record, sdb->v4_dest_key.k.ipv4);
+ *record++ = SYSLOG_DELIMITER;
+ record += u16_to_ascii_decimal_unaligned(record, sdb->v4_dest_key.k.port);
+ }
+ } else {
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+ *record++ = '-';
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = ']'; /* End of the record */
+
+ log_info->pkt_length += record - temp;
+}
+
+#define SYSLOG_PRECHECK(entry, s_type) \
+ if(PREDICT_FALSE((entry)->syslog_logging_index == EMPTY)) { \
+ SYSLOG_DEBUG_PRINTF1("\n1. Log Mapping failed") \
+ return; \
+ } \
+ logging_info = \
+ cnat_syslog_logging_info_pool + (entry)->syslog_logging_index; \
+ if(PREDICT_FALSE(logging_info->current_logging_context == NULL)) { \
+ cnat_syslog_create_logging_context(logging_info, s_type); \
+ if(PREDICT_FALSE(logging_info->current_logging_context == NULL)) { \
+ SYSLOG_DEBUG_PRINTF1("\n2. Log Mapping failed") \
+ return; \
+ } \
+ }
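+
+/*
+ * Note: SYSLOG_PRECHECK returns early when no syslog destination is
+ * configured for the entry, and lazily allocates a packet context on
+ * first use, so "logging_info" is valid once the macro completes.
+ */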
+
+void cnat_syslog_nat44_mapping_create(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap, cnat_session_entry_t * sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+ syslog_event_type_t e_type;
+ int start_port;
+
+ SYSLOG_CONFIG_DEBUG_PRINTF(4,"In Function %s\n", __func__);
+ SYSLOG_PRECHECK(vrfmap, NAT44)
+
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ e_type = userbased_assign;
+ start_port = bulk_alloc;
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED) {
+ return; /* No logging required.. bulk port usage */
+ }
+ else { /* Individual logging .. fall back to old method */
+#endif
+ if(vrfmap->syslog_logging_policy == SESSION_LOG_ENABLE) {
+ e_type = sessionbased_assignD;
+ } else {
+ e_type = sessionbased_assign;
+ }
+ start_port = db->out2in_key.k.port;
+#ifndef NO_BULK_LOGGING
+ }
+#endif
+
+ cnat_syslog_insert_nat44_record(logging_info, db, vrfmap, sdb,
+ start_port, e_type);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+
+void cnat_syslog_ds_lite_mapping_create(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry, cnat_session_entry_t *sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+ syslog_event_type_t e_type;
+ int start_port;
+
+ SYSLOG_PRECHECK(dslite_entry, DSLite)
+
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ e_type = userbased_assign;
+ start_port = bulk_alloc;
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED) {
+ return; /* No logging required.. bulk port usage */
+ }
+ else { /* Individual logging .. fall back to old method */
+#endif
+ if(PREDICT_FALSE(dslite_entry->syslog_logging_policy == SESSION_LOG_ENABLE)) {
+ e_type = sessionbased_assignD;
+ } else {
+ e_type = sessionbased_assign;
+ }
+ start_port = db->out2in_key.k.port;
+#ifndef NO_BULK_LOGGING
+ }
+#endif
+
+ cnat_syslog_insert_record(logging_info, db, dslite_entry, sdb,
+ start_port, e_type);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+     * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+
+void cnat_syslog_nat44_mapping_delete(cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap, cnat_session_entry_t *sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+ syslog_event_type_t e_type;
+ int start_port;
+
+ SYSLOG_CONFIG_DEBUG_PRINTF(4,"In Function %s\n", __func__);
+ SYSLOG_PRECHECK(vrfmap, NAT44)
+
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ e_type = userbased_withdraw;
+ start_port = bulk_alloc;
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED) {
+ return; /* No logging required.. bulk port usage */
+ }
+ else { /* Individual logging .. fall back to old method */
+#endif
+ if(vrfmap->syslog_logging_policy == SESSION_LOG_ENABLE) {
+ e_type = sessionbased_withdrawD;
+ } else {
+ e_type = sessionbased_withdraw;
+ }
+ start_port = db->out2in_key.k.port;
+#ifndef NO_BULK_LOGGING
+ }
+#endif
+ cnat_syslog_insert_nat44_record(logging_info, db, vrfmap, sdb,
+ start_port, e_type);
+ /*
+ * If we have exceeded the packet length, let us send the
+     * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+
+void cnat_syslog_ds_lite_mapping_delete(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry, cnat_session_entry_t *sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ )
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+ syslog_event_type_t e_type;
+ int start_port;
+
+ SYSLOG_PRECHECK(dslite_entry, DSLite)
+
+#ifndef NO_BULK_LOGGING
+ if(bulk_alloc > 0) { /* new bulk alloc - use bulk add template */
+ e_type = userbased_withdraw;
+ start_port = bulk_alloc;
+ } else if(bulk_alloc == CACHE_ALLOC_NO_LOG_REQUIRED) {
+ return; /* No logging required.. bulk port usage */
+ }
+ else { /* Individual logging .. fall back to old method */
+#endif
+ if(PREDICT_FALSE(dslite_entry->syslog_logging_policy == SESSION_LOG_ENABLE)) {
+ e_type = sessionbased_withdrawD;
+ } else {
+ e_type = sessionbased_withdraw;
+ }
+ start_port = db->out2in_key.k.port;
+#ifndef NO_BULK_LOGGING
+ }
+#endif
+ cnat_syslog_insert_record(logging_info, db, dslite_entry, sdb,
+ start_port, e_type);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+     * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+
+void cnat_syslog_dslite_insert_port_exceeded(
+ cnat_syslog_logging_info_t *log_info,
+ dslite_key_t * key)
+{
+    /* This record should look like this -
+ * [Portblockrunout <L4> <Original Source IP> <Inside VRF Name>
+ * <Original Source IPv6> - <Original Port> - - - -]
+ */
+ u32 network_order_ipv6[4];
+ unsigned char *temp, *record;
+
+ temp = record = &(log_info->current_logging_context->packet_data[
+ CNAT_NFV9_IP_HDR_OFFSET + log_info->pkt_length]);
+
+ /* Now we point to the location where record needs to be inserted */
+ *record++ = '['; /* Open the record */
+
+ /* Copy the record type */
+ memcpy(record, sys_log_event[port_block_runout].event_name,
+ sys_log_event[port_block_runout].name_length);
+ record += sys_log_event[port_block_runout].name_length;
+ *record++ = SYSLOG_DELIMITER;
+
+ u16 my_proto_mask;
+ my_proto_mask = key->ipv4_key.k.vrf & CNAT_PRO_MASK;
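+    /* IANA protocol number written in ASCII: 6 = TCP, 17 = UDP, 1 = ICMP */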
+ if(PREDICT_TRUE(my_proto_mask == CNAT_TCP)) {
+ *record++ = '6';
+ } else if(PREDICT_TRUE(my_proto_mask == CNAT_UDP)) {
+ *record++ = '1';
+ *record++ = '7';
+ } else {
+ *record++ = '1';
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original Source IP */
+ record += copy_ipv4_addr(record, key->ipv4_key.k.ipv4);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* copy configured VRF NAME */
+ memcpy(record, log_info->vrf_name, log_info->vrf_name_len);
+ record += log_info->vrf_name_len;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the IPV6 source address */
+ network_order_ipv6[0] = htonl(key->ipv6[0]);
+ network_order_ipv6[1] = htonl(key->ipv6[1]);
+ network_order_ipv6[2] = htonl(key->ipv6[2]);
+ network_order_ipv6[3] = htonl(key->ipv6[3]);
+
+    inet_ntop(AF_INET6, network_order_ipv6, (char *) record, INET6_ADDRSTRLEN);
+    record += strlen((char *) record);
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = SYSLOG_FIELD_ABSENT; /* No translated source ip */
+ *record++ = SYSLOG_DELIMITER;
+
+ record += u16_to_ascii_decimal_unaligned(
+ record, key->ipv4_key.k.port);
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = SYSLOG_FIELD_ABSENT; /* No translated start port */
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = SYSLOG_FIELD_ABSENT; /* No translated end port */
+ *record++ = SYSLOG_DELIMITER;
+
+ /*No Destination Info*/
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+
+    *record++ = ']'; /* End of the record */
+
+ log_info->pkt_length += record - temp;
+}
+
+void cnat_syslog_ds_lite_port_limit_exceeded(
+ dslite_key_t * key,
+ dslite_table_entry_t *dslite_entry)
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+
+ SYSLOG_PRECHECK(dslite_entry, DSLite)
+
+ cnat_syslog_dslite_insert_port_exceeded(logging_info, key);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+     * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+
+void cnat_syslog_nat44_insert_port_exceeded(
+ cnat_syslog_logging_info_t *log_info,
+ cnat_key_t * key)
+{
+    /* This record should look like this -
+ * [Portblockrunout <L4> <Original Source IP> <Inside VRF Name>
+ * - - <Original Port> - - - -]
+ */
+ unsigned char *temp, *record;
+
+ temp = record = &(log_info->current_logging_context->packet_data[
+ CNAT_NFV9_IP_HDR_OFFSET + log_info->pkt_length]);
+
+ /* Now we point to the location where record needs to be inserted */
+ *record++ = '['; /* Open the record */
+
+ /* Copy the record type */
+ memcpy(record, sys_log_event[port_block_runout].event_name,
+ sys_log_event[port_block_runout].name_length);
+ record += sys_log_event[port_block_runout].name_length;
+ *record++ = SYSLOG_DELIMITER;
+
+ u16 my_proto_mask;
+ my_proto_mask = key->k.vrf & CNAT_PRO_MASK;
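+    /* IANA protocol number written in ASCII: 6 = TCP, 17 = UDP, 1 = ICMP */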
+ if(PREDICT_TRUE(my_proto_mask == CNAT_TCP)) {
+ *record++ = '6';
+ } else if(PREDICT_TRUE(my_proto_mask == CNAT_UDP)) {
+ *record++ = '1';
+ *record++ = '7';
+ } else {
+ *record++ = '1';
+ }
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original Source IP */
+ record += copy_ipv4_addr(record, key->k.ipv4);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* copy configured VRF NAME */
+ memcpy(record, log_info->vrf_name, log_info->vrf_name_len);
+ record += log_info->vrf_name_len;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* No IPV6 source address for nat44 */
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = '-'; /* No translated source ip */
+ *record++ = SYSLOG_DELIMITER;
+
+ record += u16_to_ascii_decimal_unaligned(
+ record, key->k.port);
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = '-'; /* No translated start port */
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = '-'; /* No translated end port */
+ *record++ = SYSLOG_DELIMITER;
+
+ /*No Destination Info*/
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+
+    *record++ = ']'; /* End of the record */
+
+ log_info->pkt_length += record - temp;
+}
+
+void cnat_syslog_nat44_port_limit_exceeded(
+ cnat_key_t * key,
+ cnat_vrfmap_t *vrfmap)
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+
+ SYSLOG_PRECHECK(vrfmap, NAT44)
+
+ cnat_syslog_nat44_insert_port_exceeded(logging_info, key);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+     * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+
+void cnat_syslog_nat44_insert_tcp_seq_mismatch(
+ cnat_syslog_logging_info_t *log_info,
+ cnat_main_db_entry_t *db)
+{
+    /* This record should look like this -
+ * [TCPseqmismatch <L4> <Original Source IP> <Inside VRF Name>
+ * - <Translated Source IP> <Original Port> <Translated Source Port> - - -]
+ */
+ unsigned char *temp, *record;
+
+ temp = record = &(log_info->current_logging_context->packet_data[
+ CNAT_NFV9_IP_HDR_OFFSET + log_info->pkt_length]);
+
+ /* Now we point to the location where record needs to be inserted */
+ *record++ = '['; /* Open the record */
+
+ /* Copy the record type */
+ memcpy(record, sys_log_event[tcp_seq_mismatch].event_name,
+ sys_log_event[tcp_seq_mismatch].name_length);
+ record += sys_log_event[tcp_seq_mismatch].name_length;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Next field is TCP */
+ *record++ = '6';
+ *record++ = SYSLOG_DELIMITER;
+
+ /* Copy the Original Source IP */
+ record += copy_ipv4_addr(record, db->in2out_key.k.ipv4);
+ *record++ = SYSLOG_DELIMITER;
+
+ /* copy configured VRF NAME */
+ memcpy(record, log_info->vrf_name, log_info->vrf_name_len);
+ record += log_info->vrf_name_len;
+ *record++ = SYSLOG_DELIMITER;
+
+ /* No IPV6 source address for nat44 */
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+
+ record += copy_ipv4_addr(record, db->out2in_key.k.ipv4);
+ *record++ = SYSLOG_DELIMITER;
+
+ record += u16_to_ascii_decimal_unaligned(
+ record, db->in2out_key.k.port);
+ *record++ = SYSLOG_DELIMITER;
+
+ record += u16_to_ascii_decimal_unaligned(
+ record, db->out2in_key.k.port);
+ *record++ = SYSLOG_DELIMITER;
+
+ *record++ = '-'; /* No translated end port */
+ *record++ = SYSLOG_DELIMITER;
+
+ /*No Destination Info*/
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+ *record++ = '-';
+ *record++ = SYSLOG_DELIMITER;
+
+    *record++ = ']'; /* End of the record */
+
+ log_info->pkt_length += record - temp;
+}
+
+void cnat_syslog_nat44_tcp_seq_mismatch(
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap)
+{
+ cnat_syslog_logging_info_t *logging_info = 0;
+
+ SYSLOG_PRECHECK(vrfmap, NAT44)
+
+ cnat_syslog_nat44_insert_tcp_seq_mismatch(logging_info, db);
+
+ /*
+ * If we have exceeded the packet length, let us send the
+     * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+
+ if (PREDICT_FALSE(logging_info->pkt_length >
+ logging_info->max_length_minus_max_record_size)) {
+ cnat_syslog_send_pkt(logging_info);
+ }
+}
+#endif
diff --git a/vnet/vnet/vcgn/cnat_syslog.h b/vnet/vnet/vcgn/cnat_syslog.h
new file mode 100644
index 00000000000..931f4b9cd22
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_syslog.h
@@ -0,0 +1,190 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_syslog.h
+ *
+ * Copyright (c) 2011-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_SYSLOG_H__
+#define __CNAT_SYSLOG_H__
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+
+#include "cnat_db.h"
+#include "nat64_db.h"
+#include "cnat_log_common.h"
+#include "dslite_defs.h"
+
+#define SYSLOG_CONFIG_DEBUG_PRINTF(level, ...) \
+    do { \
+        if (config_debug_level > (level)) \
+            PLATFORM_DEBUG_PRINT(__VA_ARGS__); \
+    } while (0)
+
+
+/* one-time call at initialization */
+void cnat_syslog_logging_init(void);
+
+/*
+ * unconditional call
+ * will check logging config inside
+ */
+void cnat_syslog_log_mapping_create(cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap);
+
+/*
+ * unconditional call
+ * will check logging config inside
+ */
+void cnat_syslog_log_mapping_delete(cnat_main_db_entry_t * db,
+ cnat_vrfmap_t *vrfmap);
+
+void cnat_syslog_ds_lite_mapping_create(cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry, cnat_session_entry_t *sdb
+#ifndef NO_BULK_LOGGING
+ , int bulk_alloc
+#endif
+ );
+
+void cnat_syslog_ds_lite_port_limit_exceeded(
+ dslite_key_t * key,
+ dslite_table_entry_t *dslite_entry);
+
+#define SYSLOG_TIMESTAMP_LENGTH 20
+
+#define CNAT_SYSLOG_VERSION_NUMBER 1
+#define CNAT_SYSLOG_PRIORITY (16*8+6)
+/* facility = local0 (16), severity = info (6) */
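+/*
+ * Worked example: PRI = facility * 8 + severity = 16 * 8 + 6 = 134,
+ * so a conforming syslog header for these packets begins with "<134>".
+ */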
+
+#define MAX_SYSLOG_HOSTNAME_LEN 32
+
+/* 6 for priority + space
+ * 2 for version + space
+ * 21 YYYY MMM DD HH:MM:SS + space
+ * 33 for hostname + space
+ * 4 for App Name (-) + space + Proc ID (-) + space
+ * 7 for Msg ID (DSLite is the longest Msg ID so far) + space
+ * 2 for Structured data (-) + space
+ */
+#define MAX_SYSLOG_HEADER_LEN 75
+
+/* 18 for Event Name (Portblockrunout is the longest as of now)
+ * 3 for L4 (including space)
+ * 16 for original source IP + space
+ * 33 for inside vrf name + space
+ * 40 for original source IPV6 + space
+ * 16 for translated source IP + space
+ * 6 for original port + space
+ * 6 for translated first source port + space
+ * 5 for translated last source port
+ * 2 for [] enclosure
+ */
+#define MAX_SYSLOG_RECORD_LEN 145
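+
+/*
+ * For illustration only (assuming SYSLOG_DELIMITER renders as a space,
+ * with hypothetical addresses and VRF name), a NAT44 port_block_runout
+ * record that fits within this limit looks like:
+ *
+ *   [Portblockrunout 6 10.1.1.2 inside-vrf - - 1024 - - - -]
+ */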
+
+typedef enum {
+ NAT44,
+ DSLite
+} syslog_service_type_t;
+
+typedef enum {
+ userbased_assign,
+ userbased_withdraw,
+ sessionbased_assign,
+ sessionbased_withdraw,
+ sessionbased_assignD,
+ sessionbased_withdrawD,
+ port_block_runout,
+ tcp_seq_mismatch,
+ max_syslog_event_type
+} syslog_event_type_t;
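+
+/*
+ * userbased_* events log a whole allocated port block (bulk logging),
+ * sessionbased_* events log a single translation, and the D-suffixed
+ * variants additionally carry the destination IP/port (session logging).
+ */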
+
+/*
+ * This structure stores the Syslog Logging information on a per
+ * collector basis. It is allocated from a pool, and the index
+ * to this structure is stored in the VRF MAP structures
+ */
+typedef struct {
+ /*
+     * nat64_id will be 0 for nat44 config, and i_vrf_id/i_vrf will be 0
+     * for nat64 config. nat64_id is used when searching for a nat64
+     * collector, and i_vrf* when searching for a nat44 collector.
+     */
+    /* Similarly for ds_lite, ds_lite_id is used while nat64_id and
+     * i_vrf_id shall be set to 0
+ */
+ u32 i_vrf_id; /* Inside VRF ID corresponding to this collector */
+ u16 i_vrf; /* Inside VRF (uidb_index) corresponding to this collector */
+ u16 ds_lite_id; /* DS Lite instance for this collector */
+ u16 port; /* Destination port number of the collector */
+
+ /*
+ * This field determines the maximum size of the Syslog information
+ * that can be stored in a logging packet
+ */
+ u16 max_length_minus_max_record_size;
+ u32 ipv4_address; /* Destination IP address of the collector */
+ /*
+ * Timestamp in UNIX seconds corresponding to when the current
+ * logging packet was created
+ */
+ u32 current_logging_context_timestamp;
+
+ /*
+ * Indicates if the entry is already deleted
+ */
+ u8 deleted;
+
+ u8 header_priority;
+ u16 pkt_length;
+
+ char header_hostname[MAX_SYSLOG_HOSTNAME_LEN];
+ char vrf_name[VRF_NAME_LEN_STORED];
+ u16 vrf_name_len;
+ u8 logging_policy;
+ /*
+ * current logging context
+ */
+ spp_ctx_t *current_logging_context;
+ spp_ctx_t *queued_logging_context;
+
+} cnat_syslog_logging_info_t;
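+
+/*
+ * A sketch of how the flush threshold above is used (the exact header
+ * accounting, including IP/UDP headers and configured MTU, lives
+ * elsewhere and is assumed here in its simplest form):
+ *
+ *   cnat_syslog_logging_info_t *info = ...;
+ *   info->max_length_minus_max_record_size =
+ *       SYSLOG_DEF_PATH_MTU - MAX_SYSLOG_HEADER_LEN - MAX_SYSLOG_RECORD_LEN;
+ *   if (info->pkt_length > info->max_length_minus_max_record_size)
+ *       cnat_syslog_send_pkt(info); /* flush before the next record */
+ */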
+
+
+/*
+ * Global structure for CGN APP configuration
+ */
+typedef struct {
+
+ u16 cnat_syslog_disp_node_index;
+
+ /*
+ * Whether we have initialized the Syslog information
+ */
+ u8 cnat_syslog_init_done;
+
+} cnat_syslog_global_info_t;
+
+typedef struct {
+ u64 logging_context_creation_fail_count;
+ u64 downstream_constipation_count;
+ u64 logging_context_creation_deferred_count;
+} cnat_syslog_global_counters_t;
+
+extern cnat_syslog_logging_info_t *cnat_syslog_logging_info_pool;
+extern cnat_syslog_global_info_t cnat_syslog_global_info;
+
+#define SYSLOG_DEF_PATH_MTU 1500
+
+#endif /* __CNAT_SYSLOG_H__ */
diff --git a/vnet/vnet/vcgn/cnat_util.c b/vnet/vnet/vcgn/cnat_util.c
new file mode 100644
index 00000000000..2415c5437fd
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_util.c
@@ -0,0 +1,2257 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_util.c - cnat helpers
+ *
+ * Copyright (c) 2009-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+#include <vppinfra/error.h>
+
+#include "tcp_header_definitions.h"
+
+#if 0
+void spp_api_cnat_v4_config_dummy_t_handler
+(spp_api_cnat_v4_config_dummy_t *mp);
+
+void spp_api_cnat_v4_config_dummy_max_t_handler
+(spp_api_cnat_v4_config_dummy_max_t *mp);
+
+void spp_api_cnat_v4_config_icmp_timeout_t_handler
+(spp_api_cnat_v4_config_icmp_timeout_t *mp);
+
+void spp_api_cnat_clear_db_request_t_handler
+(spp_api_cnat_clear_db_request_t *mp);
+
+void spp_api_cnat_v4_debug_global_t_handler
+(spp_api_cnat_v4_debug_global_t *mp);
+
+void spp_api_cnat_v4_show_outside_entry_req_t_handler
+(spp_api_cnat_v4_show_outside_entry_req_t *mp);
+
+void spp_api_cnat_v4_show_inside_entry_req_t_handler
+(spp_api_cnat_v4_show_inside_entry_req_t *mp);
+
+void spp_api_cnat_show_statistics_summary_req_t_handler
+(spp_api_cnat_show_statistics_summary_req_t *mp);
+
+void cnat_db_create_db_entries_cmd (int argc, unsigned long *argv)
+{
+ int i, j ;
+ int nusers = 3000;
+
+ cnat_db_key_bucket_t key_info;
+ cnat_main_db_entry_t *db;
+ cnat_gen_icmp_info info;
+ cnat_key_t dest_info_dummy;
+
+ memset(&dest_info_dummy, 0, sizeof(cnat_key_t));
+ printf ("Create %d users, 100 translations each...\n", nusers);
+
+ for (i = 0; i < nusers; i++) {
+ for (j = 0; j < 100; j++) {
+
+ key_info.k.k.vrf = CNAT_TCP | (1 & CNAT_VRF_MASK);
+ key_info.k.k.ipv4 = 0x0c000001+i;
+ key_info.k.k.port = 1024+j;
+
+ db = cnat_get_main_db_entry_v2(&key_info, PORT_SINGLE,
+ PORT_TYPE_DYNAMIC, &info, &dest_info_dummy);
+
+ if (db == 0) {
+ printf ("OOPS: cnat_main_db_create failed users %d trans %d\n", i, j);
+ return; /*jli*/
+ }
+
+ db->entry_expires = cnat_current_time;
+
+ }
+ }
+}
+
+void db_test_clear (int argc, unsigned long *argv)
+{
+ spp_api_cnat_clear_db_request_t mp;
+
+ mp.wildcard = argv[0];
+ mp.protocol = argv[1];
+ mp.port_num = argv[2];
+ mp.inside_vrf = argv[3];
+ mp.ip_addr = argv[4];
+ spp_api_cnat_clear_db_request_t_handler(&mp);
+}
+
+/* test code*/
+void cnat_db_test_show (int argc, unsigned long *argv)
+{
+
+ spp_api_cnat_v4_show_inside_entry_req_t mp1[2000];
+ spp_api_cnat_v4_show_outside_entry_req_t mp2[30000];
+
+ if (argc > 1) {
+ if (argc != 7) {
+ printf("Usage: db test show dec((which)) dec((vrf)) dec((proto)) dec((ip)) dec((start_port)) dec((end_port)) dec((flags))\n");
+ return;
+ }
+
+ mp1[0].vrf_id = argv[1];
+        mp1[0].protocol = argv[2];
+ mp1[0].ipv4_addr = argv[3];
+ mp1[0].start_port = argv[4];
+ mp1[0].end_port = argv[5];
+ mp1[0].flags = argv[6];
+ mp1[0].all_entries = 0;
+
+ if (argv[0] == 1) {
+ spp_api_cnat_v4_show_inside_entry_req_t_handler (&(mp1[0]));
+ } else {
+ spp_api_cnat_v4_show_outside_entry_req_t_handler (&(mp2[0]));
+ }
+
+ return;
+ } else {
+ printf("inside entries \n");
+ mp1[0].ipv4_addr = 0x016994CA;
+ mp1[0].vrf_id = 1;
+ mp1[0].all_entries = 0;
+ mp1[0].start_port = 32765;
+ mp1[0].end_port = 65535;
+ mp1[0].protocol = 2;
+ mp1[0].flags = 3;
+
+ spp_api_cnat_v4_show_inside_entry_req_t_handler (&(mp1[0]));
+
+ mp2[0].ipv4_addr = 0x640200c1;
+ mp2[0].vrf_id = 0;
+ mp2[0].start_port = 1025;
+ mp2[0].end_port = 62235;
+ mp2[0].protocol = 2;
+ mp2[0].flags = 3;
+
+ spp_api_cnat_v4_show_outside_entry_req_t_handler (&(mp2[0]));
+ }
+
+#if 1
+ {
+ spp_api_cnat_stats_node_mapping_t mp3[20000];
+ spp_api_cnat_stats_counter_mapping_t mp4[20000];
+ spp_api_cnat_stats_counter_values_t mp5[23000];
+
+ mp3[0].rc = 0;
+     spp_api_cnat_stats_node_mapping_t_handler (&mp3[0]);
+     mp4[0].rc = 0;
+     spp_api_cnat_stats_counter_mapping_t_handler (&mp4[0]);
+
+     mp5[0].flag = 1;
+     spp_api_cnat_stats_counter_values_t_handler(&mp5[0]);
+ }
+#endif
+
+#if 0
+ mp1.ipv4_addr = 0x0A010102;
+ mp1.vrf_id = 1;
+ mp1.all_entries = 1;
+ mp1.protocol = 1;
+
+ spp_api_cnat_v4_show_inside_entry_req_t_handler (&mp1);
+
+
+ mp1.ipv4_addr = 0x0A010103;
+ mp1.vrf_id = 1;
+ mp1.all_entries = 1;
+ mp1.protocol = 2;
+
+ spp_api_cnat_v4_show_inside_entry_req_t_handler (&mp1);
+
+ mp6[0].inside_vrf_id = 1;
+ mp6[0].start_ipv4_address = 0x64020001;
+ mp6[0].end_ipv4_address = 0x64020101;
+ mp6[0].free_addr = 0;
+ mp6[0].flags = CNAT_TRANSLATION_ENTRY_STATIC;
+
+ spp_api_cnat_v4_show_freeUsed_entry_req_t_handler(&mp6);
+
+#endif
+ printf("returned here");
+
+ return;
+}
+
+
+
+void cnat_db_clear_all_entries (int argc, unsigned long *argv)
+{
+ cnat_main_db_entry_t * db;
+ u32 index;
+
+ pool_header_t * p = vec_header(cnat_main_db, sizeof(pool_header_t) );
+
+ for(index = 0; index < vec_len(cnat_main_db); index++) {
+
+ if ( !clib_bitmap_get(p->free_bitmap, index)) {
+
+ db = cnat_main_db + index;
+ cnat_delete_main_db_entry_v2(db);
+
+ }
+ }
+
+}
+
+
+void spp_log_cmd (int argc, unsigned long *argv)
+{
+ u16 num_traces;
+ u16 error_code;
+ u16 num_args;
+ u32 arg[7];
+ u8 i;
+
+ num_traces = argv[0];
+
+ for (i = 0; i < num_traces; i++) {
+ error_code = argv[1 + 4*i];
+ num_args = argv[2 + 4*i];
+ arg[0] = argv[3 + 4*i];
+ arg[1] = argv[4 + 4*i];
+
+ spp_printf(error_code, num_args, arg);
+ }
+}
+
+
+void cnat_db_create_random_entries (int argc, unsigned long *argv)
+{
+
+ platform_cnat_db_create_random_entries();
+}
+
+#define MAX_DEPTH 10
+
+void show_user_db_hash_chain_len() {
+
+ u32 max_len, len, n, i, max_idx, index, used;
+ cnat_user_db_entry_t * udb;
+ u32 hash_depth[MAX_DEPTH];
+
+ memset(hash_depth, 0, sizeof(u32)*MAX_DEPTH);
+
+ n = vec_len(cnat_user_hash);
+
+ used = 0;
+ max_len = 0;
+ for(i=0;i<n;i++) {
+
+ index = cnat_user_hash[i].next;
+
+ len = 0;
+ if (index != EMPTY) used++;
+
+ while (index != EMPTY) {
+ len++ ;
+ udb = cnat_user_db + index;
+ index = udb->user_hash.next;
+ }
+
+ if(len < (MAX_DEPTH-1) ) {
+ hash_depth[len]++;
+ } else {
+ hash_depth[MAX_DEPTH-1]++;
+ }
+
+ if (max_len < len) {
+ max_len = len;
+ max_idx = cnat_user_hash[i].next;
+ }
+ }
+
+ printf("Max user db hash length %u, total buckets %u used %u\n",
+ max_len, n, used);
+
+ for( i=1; i<(MAX_DEPTH - 1); i++) {
+ printf("Hash chain len %02d, entries count %d\n", i, hash_depth[i]);
+ }
+
+ printf("Hash chain len >%02d, entries count %d\n",
+ MAX_DEPTH-1, hash_depth[MAX_DEPTH-1]);
+
+}
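+
+/*
+ * Reading the histogram: hash_depth[i] counts buckets whose chain is
+ * exactly i entries long, with the last bin aggregating every chain of
+ * MAX_DEPTH-1 entries or more.
+ */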
+
+void show_main_db_hash_chain_len() {
+
+ u32 max_len, len, n, i, max_idx, index, used;
+ cnat_main_db_entry_t * db;
+ u32 hash_depth[MAX_DEPTH];
+
+ memset(hash_depth, 0, sizeof(u32)*MAX_DEPTH);
+
+ n = vec_len(cnat_in2out_hash);
+
+ used = 0;
+ max_len = 0;
+ for(i=0;i<n;i++) {
+
+ index = cnat_in2out_hash[i].next;
+
+ len = 0;
+ if (index != EMPTY) used++;
+
+ while (index != EMPTY) {
+ len++ ;
+ db = cnat_main_db + index;
+ index = db->in2out_hash.next;
+ }
+
+ if(len < (MAX_DEPTH-1) ) {
+ hash_depth[len]++;
+ } else {
+ hash_depth[MAX_DEPTH-1]++;
+ }
+
+ if (max_len < len) {
+ max_len = len;
+ max_idx = cnat_in2out_hash[i].next;
+ }
+ }
+
+ printf("Max main db I2O hash length %u, total buckets %u used %u\n",
+ max_len, n, used);
+
+ for( i=1; i<(MAX_DEPTH - 1); i++) {
+ printf("Hash chain len %02d, entries count %d\n", i, hash_depth[i]);
+ }
+
+ printf("Hash chain len >%02d, entries count %d\n",
+ MAX_DEPTH-1, hash_depth[MAX_DEPTH-1]);
+
+
+ memset(hash_depth, 0, sizeof(u32)*MAX_DEPTH);
+
+ n = vec_len(cnat_out2in_hash);
+ used = 0;
+ max_len = 0;
+
+ for(i=0;i<n;i++) {
+
+ index = cnat_out2in_hash[i].next;
+ len = 0;
+
+ if (index != EMPTY) used++;
+
+ while (index != EMPTY) {
+ len++ ;
+ db = cnat_main_db + index;
+ index = db->out2in_hash.next;
+ }
+
+ if(len < (MAX_DEPTH-1) ) {
+ hash_depth[len]++;
+ } else {
+ hash_depth[MAX_DEPTH-1]++;
+ }
+
+ if (max_len < len) {
+ max_len = len;
+            max_idx = cnat_out2in_hash[i].next;
+ }
+ }
+
+ printf("Max main db O2I hash length %u, total buckets %u used %u\n",
+ max_len, n, used);
+
+ for( i=1; i<(MAX_DEPTH - 1); i++) {
+ printf("Hash chain len %02d, entries count %d\n", i, hash_depth[i]);
+ }
+
+ printf("Hash chain len >%02d, entries count %d\n",
+ MAX_DEPTH-1, hash_depth[MAX_DEPTH-1]);
+
+
+}
+
+u32 db_free_entry (void * p) {
+
+ pool_header_t * h;
+ u32 free;
+
+ h = pool_header(p);
+
+    free = (p == 0) ? 0 : vec_len(h->free_indices);
+
+ return free;
+}
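+
+/*
+ * Usage sketch: for a vppinfra pool, vec_len() gives the total number
+ * of slots and db_free_entry() the unused ones, so
+ *
+ *     active = vec_len(pool) - db_free_entry(pool);
+ *
+ * which is how cnat_db_summary() below derives its counts.
+ */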
+
+void cnat_db_summary (int argc, unsigned long *argv) {
+
+ PLATFORM_DEBUG_PRINT("\n-----------------------------------------");
+ PLATFORM_DEBUG_PRINT("\nSummary DB");
+ PLATFORM_DEBUG_PRINT("\n-----------------------------------------\n");
+ u32 count1, count2, i;
+#ifndef NO_NAT64_DEF
+ extern void nat64_session_db_summary();
+#endif
+ /* main db active entry count*/
+ count1 = vec_len(cnat_main_db);
+ count2 = db_free_entry(cnat_main_db);
+
+ PLATFORM_DEBUG_PRINT("main db entries: total %u, active %u, free %u\n", count1, count1 - count2, count2);
+
+ /* user db active entry count */
+ count1 = vec_len(cnat_user_db);
+ count2 = db_free_entry(cnat_user_db);
+
+ PLATFORM_DEBUG_PRINT("user db entries: total %u, active %u, free %u\n", count1, count1 - count2, count2);
+
+
+ /* user db active entry count */
+#ifndef NO_NAT64_DEF
+ nat64_session_db_summary();
+#endif
+
+ /* main db hash i2o o2i usage count */
+ count1 = 0;
+ count2 = 0;
+
+ for (i=0; i< CNAT_MAIN_HASH_SIZE; i++) {
+
+ if(cnat_in2out_hash[i].next != ~0) count1++;
+ if(cnat_out2in_hash[i].next != ~0) count2++;
+
+ }
+
+ PLATFORM_DEBUG_PRINT("main hash in2out: total %6u, used %u (%.2f%%)\n",
+ CNAT_MAIN_HASH_SIZE, count1,
+ (100.0*count1)/CNAT_MAIN_HASH_SIZE);
+
+ PLATFORM_DEBUG_PRINT("main hash out2in: total %6u, used %u (%.2f%%)\n",
+ CNAT_MAIN_HASH_SIZE, count2,
+                         (100.0 * count2)/CNAT_MAIN_HASH_SIZE);
+
+ /* use db hashing usage count */
+
+ count1 = 0;
+
+ for (i=0; i< CNAT_USER_HASH_SIZE; i++) {
+ if(cnat_user_hash[i].next != ~0) count1++;
+ }
+
+ PLATFORM_DEBUG_PRINT("user db hash: total %6u, used %u (%.2f%%)\n",
+ CNAT_USER_HASH_SIZE, count1,
+ (100.0*count1)/CNAT_USER_HASH_SIZE);
+
+ PLATFORM_DEBUG_PRINT("\nNull pointer exceptions:\n");
+ PLATFORM_DEBUG_PRINT("packet_pool: null enq : %10u, null deq : %10u\n",null_enq_pkt, null_deq_pkt);
+ PLATFORM_DEBUG_PRINT("ctx_pool : null enq : %10u, null deq : %10u\n",null_enq_ctx, null_deq_ctx);
+ PLATFORM_DEBUG_PRINT("wqe_pool : null enq : %10u, null deq : %10u\n",null_enq_wqe, null_deq_wqe);
+
+ PLATFORM_DEBUG_PRINT("\nReceived Packet Errors on SPI:\n");
+ PLATFORM_DEBUG_PRINT("rcv_pkt_errs: %10u\n",rcv_pkt_errs);
+
+ PLATFORM_DEBUG_PRINT("\nctx/sf allocation failure errors: \n");
+#ifndef CGN_PERF_SCALE_DEBUG
+ PLATFORM_DEBUG_PRINT("Warning: collection of error counts <with timestamp> is disabled.\n");
+ PLATFORM_DEBUG_PRINT("sf alloc errors: %10u, ctx alloc errors: %10u\n",sf_alloc_errs,ctx_alloc_errs);
+#else
+ for(i=0;i<COUNTER_BUFFER_SIZE;i++)
+ PLATFORM_DEBUG_PRINT("<%2d>Timestamp <sec>: %10u, sf errors: %10u, ctx errors: %10u\n",\
+ i,err_cnt_arr[i].timestamp,\
+ err_cnt_arr[i].sf_error_counter, \
+ err_cnt_arr[i].ctx_error_counter);
+#endif
+}
+
+void cnat_db_hash_summary (int argc, unsigned long *argv) {
+
+ show_main_db_hash_chain_len();
+
+ show_user_db_hash_chain_len();
+}
+
+/*
+ * cnat_port_alloc
+ * This function is now deprecated...
+ *
+ */
+#ifdef LB_PORT
+int cnat_port_alloc (cnat_portmap_t *cnat_portmap, u16 *portmap_inuse,
+ int cnat_instance,
+ port_alloc_t atype, port_pair_t ptype,
+ int *index, u32 *ipv4_address, u16 *base_port)
+#else
+int cnat_port_alloc (cnat_portmap_t *cnat_portmap, u16 *portmap_inuse,
+ port_alloc_t atype, port_pair_t ptype,
+ int *index, u32 *ipv4_address, u16 *base_port)
+#endif
+{
+
+ return (0);
+}
+
+/*
+ * cnat_port_free
+ * This function is now deprecated...
+ *
+ */
+#ifdef LB_PORT
+void cnat_port_free (cnat_portmap_t *cnat_portmap, u16 *portmap_inuse,
+ int instance, int index, port_pair_t ptype, u16 base_port)
+#else
+void cnat_port_free (cnat_portmap_t *cnat_portmap, u16 *portmap_inuse,
+ int index, port_pair_t ptype, u16 base_port)
+#endif
+{
+}
+
+void spp_api_cnat_port_allocate_t_handler(spp_api_cnat_port_allocate_t *mp)
+{
+ int i, j, k1, k2;
+ int pm_index;
+ u32 ipv4_address;
+ u16 aport;
+ int rv;
+ char *out1, *out2, *out_f;
+ port_alloc_t pt1, pt2;
+ cnat_portmap_t *pm = 0;
+ u16 *pm_inuse = 0;
+ u32 *firstp =0;
+ u32 nr_ports =0;
+ u32 nodd_ports = 0;
+ u32 neven_ports = 0;
+#ifdef LB_PORT
+ u32 my_instance = 1;
+#endif
+ char out_r[12] = "allocated-r";
+ char out_o[12] = "allocated-o";
+ char out_e[12] = "allocated-e";
+
+
+ /*
+ * this command is run after db create portmap
+     * vrf is hardcoded to 1
+ */
+
+ /* Already have a portmap vector for this VRF? */
+ for (i = 0; i < vec_len(cnat_portmap_indices_by_vrf); i++) {
+ if (cnat_portmap_indices_by_vrf[i] == mp->vrf) {
+ pm = cnat_portmaps[i];
+ pm_inuse = cnat_portmaps_inuse[i];
+ goto found_portmaps;
+ }
+ }
+
+ printf("need to run db create portmaps first 0x%d\n",
+ vec_len(cnat_portmap_indices_by_vrf));
+ return;
+
+found_portmaps:
+ nr_ports = mp->nr_ports;
+ nodd_ports = mp->nodd_ports;
+ neven_ports = mp->neven_ports;
+
+ if ((nr_ports + nodd_ports + neven_ports ) > (PORTS_PER_ADDR)) {
+ printf("invalid port# nr_ports %d + odd %d + even %d "
+ "should be less than 200 \n", nr_ports, nodd_ports, neven_ports);
+ return;
+ }
+
+ /*
+ * first port
+ */
+ firstp = nr_ports ? (&nr_ports) : (nodd_ports ? (&nodd_ports) : (&neven_ports));
+ if (!(*firstp)) {
+ printf("invalid port# nr_ports %d odd %d even %d ",
+ nr_ports, nodd_ports, neven_ports);
+ }
+ out_f = nr_ports ? out_r : (nodd_ports ? out_o : out_e);
+
+#ifdef LB_PORT
+ rv = cnat_port_alloc (pm, pm_inuse, my_instance,
+ PORT_ALLOC_ANY, PORT_S_ODD,
+ &pm_index, &ipv4_address, &aport);
+#else
+ rv = cnat_port_alloc (pm, pm_inuse,
+ PORT_ALLOC_ANY, PORT_S_ODD,
+ &pm_index, &ipv4_address, &aport);
+#endif
+
+ if (!rv) {
+ printf("failed-o\n");
+ return;
+ }
+ printf("%s %8d %10x %8d\n", out_f,
+ pm_index, ipv4_address, aport);
+
+ (*firstp)--;
+
+ for (i=0; i < nr_ports; i++) {
+#ifdef LB_PORT
+ rv = cnat_port_alloc (pm, pm_inuse, my_instance,
+ PORT_ALLOC_DIRECTED, PORT_SINGLE,
+ &pm_index, &ipv4_address, &aport);
+#else
+ rv = cnat_port_alloc (pm, pm_inuse,
+ PORT_ALLOC_DIRECTED, PORT_SINGLE,
+ &pm_index, &ipv4_address, &aport);
+#endif
+ if (rv) {
+ printf("%s %8d %10x %8d\n", out_r,
+ pm_index, ipv4_address, aport);
+ } else {
+ printf("%s failed\n", out_r);
+ return;
+ }
+ }
+
+ if (nodd_ports > neven_ports) {
+ k1 = nodd_ports;
+ k2 = neven_ports;
+ pt1 = PORT_S_ODD;
+ pt2 = PORT_S_EVEN;
+ out1 = out_o;
+ out2 = out_e;
+ } else {
+        k1 = neven_ports;
+ pt1 = PORT_S_EVEN;
+ k2 = nodd_ports;
+ pt2 = PORT_S_ODD;
+ out1 = out_e;
+ out2 = out_o;
+ }
+
+ j = 0;
+ for (i=0; i < k1; i++) {
+#ifdef LB_PORT
+ rv = cnat_port_alloc (pm, pm_inuse, my_instance,
+ PORT_ALLOC_DIRECTED, pt1,
+ &pm_index, &ipv4_address, &aport);
+#else
+ rv = cnat_port_alloc (pm, pm_inuse,
+ PORT_ALLOC_DIRECTED, pt1,
+ &pm_index, &ipv4_address, &aport);
+#endif
+ if (rv) {
+ printf("%s %8d %10x %8d\n", out1,
+ pm_index, ipv4_address, aport);
+ } else {
+ printf("%s failed\n", out1);
+ return;
+ }
+
+ if (j < k2) {
+#ifdef LB_PORT
+ rv = cnat_port_alloc (pm, pm_inuse, my_instance,
+ PORT_ALLOC_DIRECTED, pt2,
+ &pm_index, &ipv4_address, &aport);
+#else
+ rv = cnat_port_alloc (pm, pm_inuse,
+ PORT_ALLOC_DIRECTED, pt2,
+ &pm_index, &ipv4_address, &aport);
+#endif
+
+ if (rv) {
+ printf("%s %8d %10x %8d\n", out2,
+ pm_index, ipv4_address, aport);
+ j++;
+ } else {
+ printf("%s failed\n", __FUNCTION__);
+ return;
+ }
+ }
+ }
+}
+
+void cnat_db_summary_stats (int argc, unsigned long *argv)
+{
+ spp_api_cnat_show_statistics_summary_req_t mp[50000];
+
+ spp_api_cnat_show_statistics_summary_req_t_handler(&(mp[0]));
+}
+
+void cnat_debug_global_test (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_debug_global_t *mp;
+ spp_api_cnat_v4_config_dummy_t mp1;
+ spp_api_cnat_v4_config_icmp_timeout_t mp2[10];
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEBUG_GLOBAL;
+ mp->debug_flag = argv[0];
+
+ platform_send_msg(mp);
+
+ mp2[0].default_value = 3;
+
+ spp_api_cnat_v4_config_dummy_t_handler(&mp1);
+ spp_api_cnat_v4_config_icmp_timeout_t_handler(&(mp2[0]));
+}
+
+void cnat_debug_inside_test (int argc, unsigned long *argv)
+{
+
+ spp_api_cnat_v4_debug_in2out_private_addr_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEBUG_IN2OUT_PRIVATE_ADDR;
+
+ mp->start_addr = spp_host_to_net_byte_order_32(argv[0]);
+ mp->end_addr = spp_host_to_net_byte_order_32(argv[1]);
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[2]);
+ mp->debug_flag = spp_host_to_net_byte_order_32(argv[3]);
+
+ platform_send_msg(mp);
+}
+
+void cnat_config_ports_user (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_config_port_limit_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_CONFIG_PORT_LIMIT;
+
+ mp->port_limit = spp_host_to_net_byte_order_16(argv[0]);
+
+ platform_send_msg(mp);
+
+}
+
+void cnat_debug_outside_test (int argc, unsigned long *argv)
+{
+
+ spp_api_cnat_v4_debug_out2in_public_addr_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEBUG_OUT2IN_PUBLIC_ADDR;
+
+ mp->start_addr = spp_host_to_net_byte_order_32(argv[0]);
+ mp->end_addr = spp_host_to_net_byte_order_32(argv[1]);
+ mp->o_vrf = spp_host_to_net_byte_order_16(argv[2]);
+ mp->debug_flag = spp_host_to_net_byte_order_32(argv[3]);
+
+ platform_send_msg(mp);
+}
+
+void cnat_debug_udp_dump (int argc, unsigned long *argv)
+{
+
+ spp_api_cnat_p2mp_debug_request_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_P2MP_DEBUG_REQUEST;
+ mp->dump_type =
+ spp_host_to_net_byte_order_16(CNAT_DEBUG_GENERIC_COMMAND_DEBUG_FLAGS);
+
+ if (spp_host_to_net_byte_order_32(argv[0]) == 1) {
+ mp->param[0] = spp_host_to_net_byte_order_32(
+ CNAT_DEBUG_FLAG_UDP_INSIDE_PACKET_DUMP);
+ } else {
+ mp->param[0] = spp_host_to_net_byte_order_32(
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_PACKET_DUMP);
+ }
+ mp->param[1] = spp_host_to_net_byte_order_32(argv[1]);
+
+ platform_send_msg(mp);
+
+
+
+}
+
+void cnat_debug_udp_crc (int argc, unsigned long *argv)
+{
+ spp_api_cnat_p2mp_debug_request_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_P2MP_DEBUG_REQUEST;
+ mp->dump_type =
+ spp_host_to_net_byte_order_16(CNAT_DEBUG_GENERIC_COMMAND_DEBUG_FLAGS);
+
+ if (spp_host_to_net_byte_order_32(argv[0]) == 1) {
+ mp->param[0] = spp_host_to_net_byte_order_32(
+ CNAT_DEBUG_FLAG_UDP_INSIDE_CHECKSUM_MODIFY);
+ } else {
+ mp->param[0] = spp_host_to_net_byte_order_32(
+ CNAT_DEBUG_FLAG_UDP_OUTSIDE_CHECKSUM_MODIFY);
+ }
+ mp->param[1] = spp_host_to_net_byte_order_32(argv[1]);
+
+ platform_send_msg(mp);
+
+}
+
+void cnat_db_allocate_port_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_port_allocate_t *mp;
+
+
+ if (!argc) {
+ printf("no port# defined\n");
+ return;
+ }
+
+ if ( argc < 3) {
+ printf("no port# defined\n");
+ return;
+ }
+
+ if ((argc == 3) && (argv[0] == 0) && (argv[1] == 0) && (argv[2] == 0)) {
+ printf("no port# defined\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_PORT_ALLOCATE;
+ mp->nr_ports = argv[0];
+ mp->nodd_ports = argv[1];
+ mp->neven_ports = argv[2];
+ mp->vrf = 1;
+
+ platform_send_msg(mp);
+}
+
+
+void spp_api_cnat_port_clear_t_handler(spp_api_cnat_port_clear_t *mp)
+{
+ u32 i;
+ cnat_portmap_t *pm = 0;
+ u16 *pm_inuse = 0;
+#ifdef LB_PORT
+ u32 my_instance = 1;
+#endif
+
+
+ /*
+ * this command is run after db create port
+     * vrf is hardcoded to 1
+ */
+
+ /* Already have a portmap vector for this VRF? */
+ for (i = 0; i < vec_len(cnat_portmap_indices_by_vrf); i++) {
+ if (cnat_portmap_indices_by_vrf[i] == mp->vrf) {
+ pm = cnat_portmaps[i];
+ pm_inuse = cnat_portmaps_inuse[i];
+ goto found_portmaps;
+ }
+ }
+
+ printf("portmap is not created 0x%d\n",
+ vec_len(cnat_portmap_indices_by_vrf));
+ return;
+
+found_portmaps:
+ if (mp->pm_index >= vec_len(pm)) {
+ printf("invalid port_index 0x%d >= 0x%d\n",
+ mp->pm_index, vec_len(pm));
+ return;
+ }
+
+#ifdef LB_PORT
+ cnat_port_free(pm, pm_inuse, my_instance,
+ mp->pm_index, PORT_SINGLE, mp->port);
+#else
+ cnat_port_free(pm, pm_inuse,
+ mp->pm_index, PORT_SINGLE, mp->port);
+#endif
+ printf("\n pm_index %d port %d is deleted\n", mp->pm_index, mp->port);
+}
+
+
+
+void cnat_db_clear_port_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_port_clear_t *mp;
+
+ if (!argc) {
+ printf("no port# defined\n");
+ return;
+ }
+
+ if ( argc < 2 ) {
+ printf("no port# defined\n");
+ return;
+ }
+
+ if (argc > 2) {
+ printf("too many port# defined\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_PORT_CLEAR;
+ mp->pm_index = argv[0];
+ mp->port = argv[1];
+ mp->vrf = 1;
+
+ platform_send_msg(mp);
+}
+
+
+void spp_api_cnat_v4_add_vrf_map_t_handler
+(spp_api_cnat_v4_add_vrf_map_t *mp);
+
+void spp_api_cnat_v4_del_vrf_map_t_handler
+(spp_api_cnat_v4_del_vrf_map_t *mp);
+
+void spp_api_cnat_v4_add_static_port_t_handler
+(spp_api_cnat_v4_add_static_port_t *mp);
+
+void spp_api_cnat_v4_del_static_port_t_handler
+(spp_api_cnat_v4_del_static_port_t *mp);
+
+
+void cnat_db_create_vrfmap_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_add_vrf_map_t *mp;
+
+    if (argc != 4) {
+ printf("need right input\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_ADD_VRF_MAP;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+ mp->o_vrf = spp_host_to_net_byte_order_16(argv[1]);
+ mp->start_addr[0] = spp_host_to_net_byte_order_32(argv[2]);
+ mp->end_addr[0] = spp_host_to_net_byte_order_32(argv[3]);
+
+ /*
+ * Some hardcoded values for the vrf ids
+ */
+ mp->i_vrf_id = spp_host_to_net_byte_order_32(0x00000100 | mp->i_vrf);
+ mp->o_vrf_id = spp_host_to_net_byte_order_32(0x00000200 | mp->o_vrf);
+
+ platform_send_msg(mp);
+}
+
+
+void cnat_db_delete_vrfmap_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_del_vrf_map_t *mp;
+
+ if (argc != 4) {
+ printf("need right input\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEL_VRF_MAP;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+ mp->start_addr[0] = spp_host_to_net_byte_order_32(argv[2]);
+ mp->end_addr[0] = spp_host_to_net_byte_order_32(argv[3]);
+
+ platform_send_msg(mp);
+}
+
+void cnat_db_add_svi_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_config_svi_params_t *mp;
+
+ if (argc != 3) {
+ printf("need right input\n");
+ return;
+ }
+
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_CONFIG_SVI_PARAMS;
+ mp->uidb_index = spp_host_to_net_byte_order_16(argv[1]);
+ mp->svi_ipv4_addr = spp_host_to_net_byte_order_32(argv[2]);
+ platform_send_msg(mp);
+ return;
+}
+
+
+
+void spp_api_cnat_port_create_t_handler(spp_api_cnat_port_create_t *mp)
+{
+ int i, j, k1, k2;
+ int my_index;
+ u32 ipv4_address;
+ u16 aport;
+ u32 pm_len =0;
+ cnat_errno_t rv;
+ u16 i_vrf;
+ char *out1, *out2, *out_f;
+ port_alloc_t pt1, pt2;
+ cnat_vrfmap_t *my_vrfmap;
+ cnat_portmap_v2_t *pm = 0;
+ u32 *firstp =0;
+ u32 nr_ports =0;
+ u32 nodd_ports = 0;
+ u32 neven_ports = 0;
+#ifdef LB_PORT
+ u32 my_instance = 1;
+#endif
+ char out_r[12] = "allocated-r";
+ char out_o[12] = "allocated-o";
+ char out_e[12] = "allocated-e";
+#ifndef NO_BULK_LOGGING
+ int nfv9_log_req;
+#endif
+
+ nr_ports = mp->nr_ports;
+ nodd_ports = mp->nodd_ports;
+ neven_ports = mp->neven_ports;
+ i_vrf = mp->vrf;
+
+ /*
+ * this command is run after db create vrfmap
+ * or using vrf id in init function
+ */
+ /* Already have a portmap vector for this VRF? */
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if ((my_vrfmap->status == S_RUN) &&
+ (my_vrfmap->i_vrf == i_vrf)) {
+ pm = my_vrfmap->portmap_list;
+ pm_len = vec_len(pm);
+ if (pm_len) {
+ goto found_portmaps;
+ }
+ }
+ }));
+
+ printf("need to run db create vrfmaps first for this vrf0x%d\n", pm_len);
+ return;
+
+found_portmaps:
+
+ if ((nr_ports + nodd_ports + neven_ports ) > (PORTS_PER_ADDR)) {
+ printf("invalid port# nr_ports %d + odd %d + even %d "
+ "should be less than 200 \n", nr_ports, nodd_ports, neven_ports);
+ return;
+ }
+
+ /*
+ * first port
+ */
+ firstp = nr_ports ? (&nr_ports) : (nodd_ports ? (&nodd_ports) : (&neven_ports));
+ if (!(*firstp)) {
+ printf("invalid port# nr_ports %d odd %d even %d ",
+ nr_ports, nodd_ports, neven_ports);
+ }
+ out_f = nr_ports ? out_r : (nodd_ports ? out_o : out_e);
+
+ rv = cnat_dynamic_port_alloc_v2 (pm, PORT_ALLOC_ANY, PORT_S_ODD,
+ &my_index, &ipv4_address, &aport,
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ , BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , 0,
+ &(my_vrfmap->rseed_ip)
+ );
+
+ if (rv != CNAT_SUCCESS) {
+ printf("failed-o\n");
+ return;
+ }
+ printf("%s %8d %10x %8d\n", out_f,
+ my_index, ipv4_address, aport);
+
+ (*firstp)--;
+
+ for (i=0; i < nr_ports; i++) {
+ rv = cnat_dynamic_port_alloc_v2 (pm, PORT_ALLOC_DIRECTED, PORT_SINGLE,
+ &my_index, &ipv4_address, &aport,
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ , BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , 0,
+ &(my_vrfmap->rseed_ip)
+ );
+
+ if (rv == CNAT_SUCCESS) {
+ printf("%s %8d %10x %8d\n", out_r,
+ my_index, ipv4_address, aport);
+ } else {
+ printf("%s failed\n", __FUNCTION__);
+ return;
+ }
+ }
+
+ if (nodd_ports > neven_ports) {
+ k1 = nodd_ports;
+ k2 = neven_ports;
+ pt1 = PORT_S_ODD;
+ pt2 = PORT_S_EVEN;
+ out1 = out_o;
+ out2 = out_e;
+ } else {
+        k1 = neven_ports;
+ pt1 = PORT_S_EVEN;
+ k2 = nodd_ports;
+ pt2 = PORT_S_ODD;
+ out1 = out_e;
+ out2 = out_o;
+ }
+
+ j = 0;
+ for (i=0; i < k1; i++) {
+ rv = cnat_dynamic_port_alloc_v2 (pm, PORT_ALLOC_DIRECTED, pt1,
+ &my_index, &ipv4_address, &aport,
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ , BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , 0,
+ &(my_vrfmap->rseed_ip)
+ );
+
+ if (rv == CNAT_SUCCESS) {
+ printf("%s %8d %10x %8d\n", out1,
+ my_index, ipv4_address, aport);
+ } else {
+ printf("%s failed\n", __FUNCTION__);
+ return;
+ }
+
+ if (j < k2) {
+ rv = cnat_dynamic_port_alloc_v2 (pm, PORT_ALLOC_DIRECTED, pt2,
+ &my_index, &ipv4_address, &aport,
+ cnat_static_port_range
+#ifndef NO_BULK_LOGGING
+ , BULKSIZE_FROM_VRFMAP(my_vrfmap),
+ &nfv9_log_req
+#endif
+ , 0,
+ &(my_vrfmap->rseed_ip)
+ );
+
+ if (rv == CNAT_SUCCESS) {
+ printf("%s %8d %10x %8d\n", out2,
+ my_index, ipv4_address, aport);
+ j++;
+ } else {
+ printf("%s failed\n", __FUNCTION__);
+            return;
+ }
+ }
+ }
+}
+
+
+void cnat_db_create_port_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_port_create_t *mp;
+
+ if (argc != 4) {
+ printf("no proper input defined\n");
+ return;
+ }
+
+ if ((argv[0] == 0) && (argv[1] == 0) && (argv[2] == 0)) {
+ printf("no port# defined\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_PORT_CREATE;
+ mp->nr_ports = argv[0];
+ mp->nodd_ports = argv[1];
+ mp->neven_ports = argv[2];
+ mp->vrf = argv[3];
+
+ platform_send_msg(mp);
+}
+
+void spp_api_cnat_port_delete_t_handler(spp_api_cnat_port_delete_t *mp)
+{
+ u32 pm_len;
+ cnat_vrfmap_t *my_vrfmap;
+ cnat_portmap_v2_t *pm = 0;
+
+ u32 my_index, my_port;
+ u16 i_vrf;
+#ifdef LB_PORT
+ u32 my_instance = 1;
+#endif
+
+ my_index = mp->pm_index;
+ my_port = mp->port;
+ i_vrf = mp->vrf;
+
+ /*
+ * this command is run after db create port
+ */
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if (my_vrfmap->i_vrf == i_vrf) {
+ pm = my_vrfmap->portmap_list;
+ pm_len = vec_len(pm);
+ if (pm_len) {
+ goto found_portmaps;
+ }
+ }
+ }));
+
+ printf("portmap is not created 0x%d\n",
+ vec_len(cnat_portmap_indices_by_vrf));
+ return;
+
+found_portmaps:
+ if (my_index >= pm_len) {
+ printf("invalid port_index 0x%d >= 0x%d\n",
+ my_index, pm_len);
+ return;
+ }
+
+#ifdef LB_PORT
+ cnat_port_free_v2(pm, my_instance,
+ my_index, PORT_SINGLE, mp->port,cnat_static_port_range);
+#else
+ cnat_port_free_v2(pm, my_index, PORT_SINGLE, mp->port,cnat_static_port_range);
+#endif
+ printf("\n pm_index %d port %d is deleted\n", mp->pm_index, mp->port);
+}
+
+void cnat_db_delete_port_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_port_clear_t *mp;
+
+ if (argc != 3) {
+ printf("no proper input defined\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+
+ mp->_spp_msg_id = SPP_API_CNAT_PORT_DELETE;
+ mp->pm_index = argv[0];
+ mp->port = argv[1];
+ mp->vrf = argv[2];
+ platform_send_msg(mp);
+}
+
+void cnat_db_create_static_fwd_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_add_static_port_t *mp;
+
+ if (argc != 4) {
+ printf("need right input\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_ADD_STATIC_PORT;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+ mp->i_ip = spp_host_to_net_byte_order_32(argv[1]);
+ mp->i_port = spp_host_to_net_byte_order_16(argv[2]);
+ mp->proto = argv[3];
+
+ platform_send_msg(mp);
+ return;
+}
+
+void cnat_db_create_static_fwd_stby_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_add_static_port_t *mp;
+
+ if (argc != 7) {
+ printf("need right input\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_ADD_STATIC_PORT;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+ mp->i_ip = spp_host_to_net_byte_order_32(argv[1]);
+ mp->i_port = spp_host_to_net_byte_order_16(argv[2]);
+ mp->proto = argv[3];
+ mp->o_vrf_id = spp_host_to_net_byte_order_32(argv[4]);
+ mp->o_ip = spp_host_to_net_byte_order_32(argv[5]);
+ mp->o_port = spp_host_to_net_byte_order_16(argv[6]);
+
+printf("\ni_vrf %d, ip 0x%x, port %d, o_ip, port %d", mp->i_vrf, mp->i_ip, mp->i_port, mp->o_ip, mp->o_port);
+
+ platform_send_msg(mp);
+ return;
+}
+
+void cnat_db_delete_static_fwd_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_del_static_port_t *mp;
+
+ if (argc != 3) {
+ printf("need right input\n");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEL_STATIC_PORT;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+ mp->i_ip = spp_host_to_net_byte_order_32(argv[1]);
+ mp->i_port = spp_host_to_net_byte_order_16(argv[2]);
+
+ platform_send_msg(mp);
+ return;
+}
+
+void cnat_nfv9_create_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_config_nfv9_logging_t *mp;
+
+ if (argc < 3) {
+ printf("nfv9 create i_vrf ip_addr port [refresh_rate] [timeout] [mtu]");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_CONFIG_NFV9_LOGGING;
+ mp->enable = 1;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+
+ mp->ipv4_address = spp_host_to_net_byte_order_32(argv[1]);
+ mp->port = spp_host_to_net_byte_order_16(argv[2]);
+
+ if (argc > 3) {
+ mp->refresh_rate = spp_host_to_net_byte_order_16(argv[3]);
+ mp->timeout_rate = spp_host_to_net_byte_order_16(argv[4]);
+ mp->path_mtu = spp_host_to_net_byte_order_16(argv[5]);
+ } else {
+ mp->refresh_rate = spp_host_to_net_byte_order_16(1000);
+ mp->timeout_rate = spp_host_to_net_byte_order_16(30);
+ mp->path_mtu = spp_host_to_net_byte_order_16(1500);
+ }
+ platform_send_msg(mp);
+}
+
+void cnat_delete_cgn (int argc, unsigned long *argv)
+{
+ spp_api_cnat_del_cgn_t *mp;
+ u32 mp_size;
+
+ mp_size = sizeof(spp_api_cnat_del_cgn_t);
+
+ mp = spp_msg_api_alloc(mp_size);
+ memset(mp, 0, mp_size);
+
+ mp->_spp_msg_id = SPP_API_CNAT_DEL_CGN;
+
+
+ platform_send_msg(mp);
+}
+
+void cnat_debug_global_all (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_debug_global_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEBUG_GLOBAL;
+ mp->debug_flag = CNAT_DEBUG_GLOBAL_ALL;
+
+ platform_send_msg(mp);
+}
+
+void cnat_debug_global_none (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_debug_global_t *mp;
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_DEBUG_GLOBAL;
+ mp->debug_flag = CNAT_DEBUG_NONE;
+
+ platform_send_msg(mp);
+}
+
+
+void cnat_bulk_cmd (int argc, unsigned long *argv)
+{
+ void *mp_send;
+
+ if (argc < 1) {
+ printf("\nargc = %d", argc);
+ printf("\n1. bulk cmd [0=static-port, 1=bulk_vrf, 2=policy_knob]");
+ return;
+ }
+
+
+ switch (argv[0]) {
+ case 0:
+ {
+ spp_api_cnat_v4_bulk_add_delete_static_port_t *mp;
+ spp_api_cnat_v4_add_static_port_t *mp_sp;
+ u32 mp_size =
+ sizeof(spp_api_cnat_v4_bulk_add_delete_static_port_t) +
+ (sizeof(spp_api_cnat_v4_add_static_port_t))*2;
+
+ mp = spp_msg_api_alloc(mp_size);
+ memset(mp, 0, mp_size);
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_BULK_ADD_DELETE_STATIC_PORT;
+
+ mp->num_static_port_entries = spp_host_to_net_byte_order_32(3);
+
+ mp_sp = (spp_api_cnat_v4_add_static_port_t *) &(mp->pad2);
+
+ mp_sp->_spp_msg_id = spp_host_to_net_byte_order_16(
+ SPP_API_CNAT_V4_ADD_STATIC_PORT);
+ mp_sp->proto = 2;
+ mp_sp->i_vrf = spp_host_to_net_byte_order_16(0x1);
+ mp_sp->i_ip = spp_host_to_net_byte_order_32(0x11111111);
+ mp_sp->i_port = spp_host_to_net_byte_order_16(0x7777);
+
+ mp_sp++;
+
+
+ mp_sp->_spp_msg_id = spp_host_to_net_byte_order_16(
+ SPP_API_CNAT_V4_ADD_STATIC_PORT);
+ mp_sp->proto = 1;
+ mp_sp->i_vrf = spp_host_to_net_byte_order_16(0x1);
+ mp_sp->i_ip = spp_host_to_net_byte_order_32(0x22222222);
+ mp_sp->i_port = spp_host_to_net_byte_order_16(0x6666);
+
+ mp_sp++;
+
+
+ mp_sp->_spp_msg_id = spp_host_to_net_byte_order_16(
+ SPP_API_CNAT_V4_ADD_STATIC_PORT);
+ mp_sp->proto = 1;
+ mp_sp->i_vrf = spp_host_to_net_byte_order_16(0x1);
+ mp_sp->i_ip = spp_host_to_net_byte_order_32(0x33333333);
+ mp_sp->i_port = spp_host_to_net_byte_order_16(0x5555);
+
+ mp_send = mp;
+
+ }
+ break;
+
+ case 1:
+ {
+ spp_api_cnat_v4_bulk_vrf_map_t *mp;
+ spp_api_cnat_v4_single_vrf_map_req *mp_sp;
+
+ u32 mp_size = sizeof(spp_api_cnat_v4_bulk_vrf_map_t) +
+ (sizeof(spp_api_cnat_v4_single_vrf_map_req))*2;
+
+ mp = spp_msg_api_alloc(mp_size);
+ memset(mp, 0, mp_size);
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_BULK_VRF_MAP;
+
+ mp->num_vrfmap_entries = spp_host_to_net_byte_order_32(3);
+
+ mp_sp = (spp_api_cnat_v4_single_vrf_map_req *)
+ &(mp->vrf_policy_enable);
+
+ mp_sp->i_vrf_id = spp_host_to_net_byte_order_32(0xe0000001);
+ mp_sp->o_vrf_id = spp_host_to_net_byte_order_32(0xe0000000);
+ mp_sp->i_vrf = spp_host_to_net_byte_order_16(0x1);
+ mp_sp->o_vrf = spp_host_to_net_byte_order_16(0x0);
+ mp_sp->start_addr = spp_host_to_net_byte_order_32(0x11111100);
+ mp_sp->end_addr = spp_host_to_net_byte_order_32(0x111111ff);
+ mp_sp->vrf_policy_enable = spp_host_to_net_byte_order_16(0x3);
+ mp_sp->tcp_mss_value = spp_host_to_net_byte_order_16(0x111);
+ mp_sp->vrf_nfv9_logging_ipv4_address = spp_host_to_net_byte_order_32(0x11000001);
+ mp_sp->vrf_nfv9_logging_udp_port = spp_host_to_net_byte_order_16(0x1001);
+ mp_sp->vrf_nfv9_refresh_rate = spp_host_to_net_byte_order_16(0x100);
+ mp_sp->vrf_nfv9_timeout_rate = spp_host_to_net_byte_order_16(0x10);
+ mp_sp->vrf_nfv9_path_mtu = spp_host_to_net_byte_order_16(0x100);
+
+ mp_sp++;
+
+ mp_sp->i_vrf_id = spp_host_to_net_byte_order_32(0xe0000002);
+ mp_sp->o_vrf_id = spp_host_to_net_byte_order_32(0xe0000000);
+ mp_sp->i_vrf = spp_host_to_net_byte_order_16(0x2);
+ mp_sp->o_vrf = spp_host_to_net_byte_order_16(0x0);
+ mp_sp->start_addr = spp_host_to_net_byte_order_32(0x22220000);
+ mp_sp->end_addr = spp_host_to_net_byte_order_32(0x2222ffff);
+ mp_sp->vrf_policy_enable = spp_host_to_net_byte_order_16(0x1);
+ mp_sp->tcp_mss_value = spp_host_to_net_byte_order_16(0x222);
+ mp_sp->vrf_nfv9_logging_ipv4_address = spp_host_to_net_byte_order_32(0x22000002);
+ mp_sp->vrf_nfv9_logging_udp_port = spp_host_to_net_byte_order_16(0x2002);
+ mp_sp->vrf_nfv9_refresh_rate = spp_host_to_net_byte_order_16(0x200);
+ mp_sp->vrf_nfv9_timeout_rate = spp_host_to_net_byte_order_16(0x20);
+ mp_sp->vrf_nfv9_path_mtu = spp_host_to_net_byte_order_16(0x200);
+
+ mp_sp++;
+
+ mp_sp->i_vrf_id = spp_host_to_net_byte_order_32(0xe0000003);
+ mp_sp->o_vrf_id = spp_host_to_net_byte_order_32(0xe0000007);
+ mp_sp->i_vrf = spp_host_to_net_byte_order_16(0x3);
+ mp_sp->o_vrf = spp_host_to_net_byte_order_16(0x7);
+ mp_sp->start_addr = spp_host_to_net_byte_order_32(0x33333000);
+ mp_sp->end_addr = spp_host_to_net_byte_order_32(0x33333fff);
+ mp_sp->vrf_policy_enable = spp_host_to_net_byte_order_16(0x1);
+ mp_sp->tcp_mss_value = spp_host_to_net_byte_order_16(0x333);
+ mp_sp->vrf_nfv9_logging_ipv4_address = spp_host_to_net_byte_order_32(0x33000003);
+ mp_sp->vrf_nfv9_logging_udp_port = spp_host_to_net_byte_order_16(0x3003);
+ mp_sp->vrf_nfv9_refresh_rate = spp_host_to_net_byte_order_16(0x300);
+ mp_sp->vrf_nfv9_timeout_rate = spp_host_to_net_byte_order_16(0x30);
+ mp_sp->vrf_nfv9_path_mtu = spp_host_to_net_byte_order_16(0x300);
+
+ mp_send = mp;
+ }
+ break;
+
+ case 2:
+ {
+ spp_api_cnat_v4_bulk_policy_knob_t *mp;
+
+ u32 mp_size =
+ sizeof(spp_api_cnat_v4_bulk_policy_knob_t) +
+ (sizeof(spp_api_cnat_v4_single_vrf_map_req))*2;
+
+ mp = spp_msg_api_alloc(mp_size);
+ memset(mp, 0, mp_size);
+
+ mp->_spp_msg_id = SPP_API_CNAT_V4_BULK_POLICY_KNOB;
+
+ mp->port_limit = spp_host_to_net_byte_order_16(345);
+ mp->icmp_timeout = spp_host_to_net_byte_order_16(300);
+ mp->udp_init_timeout = spp_host_to_net_byte_order_16(175);
+ mp->udp_act_timeout = spp_host_to_net_byte_order_16(133);
+ mp->tcp_init_timeout = spp_host_to_net_byte_order_16(222);
+ mp->tcp_act_timeout = spp_host_to_net_byte_order_16(2345);
+
+ mp->nat_policy_enable = spp_host_to_net_byte_order_32(0x7);
+
+ mp->global_nfv9_logging_ipv4_address = spp_host_to_net_byte_order_32(0x77777777);
+ mp->global_nfv9_logging_udp_port = spp_host_to_net_byte_order_16(0x7007);
+ mp->global_nfv9_refresh_rate = spp_host_to_net_byte_order_16(0x700);
+ mp->global_nfv9_timeout_rate = spp_host_to_net_byte_order_16(0x70);
+ mp->global_nfv9_path_mtu = spp_host_to_net_byte_order_16(0x700);
+
+ mp_send = mp;
+ }
+ break;
+
+
+ default:
+ printf("\nargv[2] = %d", argv[2]);
+ printf("\n2. bulk cmd [0=static-port, 1=bulk_vrf, 2=policy_knob+bulk_vrf]");
+ return;
+
+ }
+ platform_send_msg(mp_send);
+}
+
+void cnat_nfv9_delete_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_v4_config_nfv9_logging_t *mp;
+
+ if (argc != 1) {
+ printf("nfv9 delete i_vrf ");
+ return;
+ }
+
+ mp = spp_msg_api_alloc (sizeof (*mp));
+ memset(mp, 0, sizeof (*mp));
+ mp->_spp_msg_id = SPP_API_CNAT_V4_CONFIG_NFV9_LOGGING;
+ mp->enable = 0;
+ mp->i_vrf = spp_host_to_net_byte_order_16(argv[0]);
+ platform_send_msg(mp);
+}
+
+void cnat_generic_cmd (int argc, unsigned long *argv)
+{
+ spp_api_cnat_generic_command_request_t *mp;
+
+ if (argc != 9) {
+ printf("generic command core type p1 p2 p3 p4 p5 p6 p7 ");
+ return;
+ }
+
+ /*
+ * Allocate a large buffer for message req and resp structure
+ */
+ mp = spp_msg_api_alloc (MAX_DEBUG_BUFFER_SIZE);
+ memset(mp, 0, MAX_DEBUG_BUFFER_SIZE);
+ mp->_spp_msg_id = SPP_API_CNAT_GENERIC_COMMAND_REQUEST;
+ mp->core_num = argv[0];
+ mp->params[0] = spp_host_to_net_byte_order_32(argv[1]);
+ mp->params[1] = spp_host_to_net_byte_order_32(argv[2]);
+ mp->params[2] = spp_host_to_net_byte_order_32(argv[3]);
+ mp->params[3] = spp_host_to_net_byte_order_32(argv[4]);
+ mp->params[4] = spp_host_to_net_byte_order_32(argv[5]);
+ mp->params[5] = spp_host_to_net_byte_order_32(argv[6]);
+ mp->params[6] = spp_host_to_net_byte_order_32(argv[7]);
+ mp->params[7] = spp_host_to_net_byte_order_32(argv[8]);
+ platform_send_msg(mp);
+}
+
+u32 icmp_sent_timestamps;  /* last-interval timestamp, per core */
+u8 v4_pkt_count = 0;
+
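+/*
+ * Per-second ICMP rate limiting: v4_pkt_count counts the packets seen in
+ * the current one-second interval; when the interval timestamp advances,
+ * the count resets to 1, and ICMP generation is allowed only while the
+ * count stays at or below cnat_main_db_icmp_rate_limit_core.
+ */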
+cnat_icmp_msg_t icmp_msg_gen_allowed ()
+{
+#ifdef DISABLE_ICMP_THROTTLE_FOR_DEBUG_PURPOSE
+ return CNAT_ICMP_MSG;
+#else
+ u32 current_timestamp;
+ spp_node_main_vector_t *nmv;
+ u32 updated_timestamp;
+
+ v4_pkt_count ++;
+
+ nmv = spp_get_node_main_vectorized_inline();
+
+ current_timestamp = nmv->ticks / nmv->ticks_per_second;
+
+ PLATFORM_UPDATE_TIMESTAMP
+ if (PREDICT_FALSE(icmp_sent_timestamps != updated_timestamp)) {
+ v4_pkt_count = 1;
+ /* update timestamp */
+ icmp_sent_timestamps = updated_timestamp;
+ }
+ if (PREDICT_TRUE(v4_pkt_count <= cnat_main_db_icmp_rate_limit_core)) {
+ return CNAT_ICMP_MSG;
+ } else {
+ return CNAT_NO_ICMP_MSG;
+ }
+#endif
+}
+
+u32 v6_icmp_sent_timestamps;  /* last-interval timestamp, per core */
+u8 v6_pkt_count = 0;
+
+cnat_icmp_msg_t v6_icmp_msg_gen_allowed ()
+{
+#ifdef DISABLE_ICMP_THROTTLE_FOR_DEBUG_PURPOSE
+ return CNAT_ICMP_MSG;
+#else
+ u32 current_timestamp;
+ spp_node_main_vector_t *nmv;
+ u32 updated_timestamp;
+
+ nmv = spp_get_node_main_vectorized_inline();
+
+ current_timestamp = nmv->ticks / nmv->ticks_per_second;
+ PLATFORM_UPDATE_TIMESTAMP
+ v6_pkt_count ++;
+
+ if (PREDICT_FALSE(v6_icmp_sent_timestamps != updated_timestamp)) {
+ v6_pkt_count = 1;
+ /* update timestamp */
+ v6_icmp_sent_timestamps = updated_timestamp;
+ }
+ if (PREDICT_TRUE(v6_pkt_count <= cnat_main_db_icmp_rate_limit_core)) {
+ return CNAT_ICMP_MSG;
+ } else {
+ return CNAT_NO_ICMP_MSG;
+ }
+#endif
+}
+
+u32 v4_udp_crc_zero_timestamps;
+u32 v4_udp_crc_zero_pkt_count = 0;
+int v4_crc_zero_udp_allowed ()
+{
+ PLATFORM_V4_CRC_ZERO_UDP_ALLOWED
+    /* Currently not supported for Brahmos; this still needs to be addressed */
+ spp_node_main_vector_t *nmv;
+ u32 hash_value, current_timestamp;
+
+ nmv = spp_get_node_main_vectorized_inline();
+
+ current_timestamp = nmv->ticks / nmv->ticks_per_second;
+ v4_udp_crc_zero_pkt_count++;
+ if (PREDICT_FALSE(v4_udp_crc_zero_timestamps != current_timestamp)) {
+ v4_udp_crc_zero_pkt_count = 1;
+ v4_udp_crc_zero_timestamps = current_timestamp;
+ }
+ if (PREDICT_TRUE(v4_udp_crc_zero_pkt_count <=
+ crc_zero_udp_rate_limit_core)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * ipv4_decr_ttl_n_calc_csum()
+ * - It decrements the TTL and calculates the incremental IPv4 checksum
+ */
+
+ALWAYS_INLINE(
+void ipv4_decr_ttl_n_calc_csum(ipv4_header *ipv4))
+{
+ u32 checksum;
+ u16 old;
+
+ old = ntohs(*(u16 *)&ipv4->ttl);
+
+ /* Decrement TTL */
+ ipv4->ttl--;
+
+ /* Calculate incremental checksum */
+ checksum = old + (~ntohs(*(u16 *)&ipv4->ttl) & 0xFFFF);
+ checksum += ntohs(ipv4->checksum);
+ checksum = (checksum & 0xFFFF) + (checksum >> 16);
+ ipv4->checksum = htons(checksum + (checksum >> 16));
+}
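+
+/*
+ * Worked example (illustrative values only): if the 16-bit TTL/protocol
+ * word is 0x4006 (TTL 64, TCP) and the checksum field holds 0xb1e6, then
+ * after the decrement the word is 0x3f06, and
+ *     checksum = 0x4006 + (~0x3f06 & 0xFFFF) + 0xb1e6 = 0x1b2e5
+ * which folds to 0xb2e5 + 0x1 = 0xb2e6 -- the RFC 1624 incremental
+ * update HC' = HC + m + ~m'.
+ */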
+
+ALWAYS_INLINE(
+void calc_ipv4_checksum (ipv4_header *ipv4))
+{
+    u32 checksum;
+
+    checksum = crc_calc(ipv4);
+
+ /* Now produce the 1's complement */
+ ipv4->checksum = spp_host_to_net_byte_order_16(((u16) (~(u16)checksum)));
+}
+
+ALWAYS_INLINE(
+void calc_v4_icmp_checksum (icmp_v4_t *icmp, int ipv4_payload_size))
+{
+ u16 *data = (u16 *) icmp;
+ int num_hwords = (ipv4_payload_size)/2;
+ u32 checksum = 0;
+
+ icmp->checksum = 0;
+ if (PREDICT_FALSE((ipv4_payload_size%2) != 0)) {
+ num_hwords += 1;
+ /* Append 0's in the last octet */
+ *((u8 *)data + ipv4_payload_size) = 0;
+ }
+ while (num_hwords) {
+ checksum += (u32)spp_net_to_host_byte_order_16(data++);
+ num_hwords--;
+ }
+
+ /* Add in the carry of the original sum */
+ checksum = (checksum & 0xFFFF) + (checksum >> 16);
+ /* Add in the carry of the final sum */
+ checksum = (checksum & 0xFFFF) + (checksum >> 16);
+ /* Now produce the 1's complement */
+ icmp->checksum = spp_host_to_net_byte_order_16(((u16) (~(u16)checksum)));
+}
+
+ALWAYS_INLINE(
+void calc_v6_icmp_checksum (ipv6_header_t *ipv6, u16 ip_payload_size))
+{
+ u16 *data;
+ u16 *data1;
+ int i;
+ icmp_v6_t *icmp;
+ int num_hwords = (ip_payload_size)/2;
+ u32 checksum = 0;
+ pseudo_v6_header_t pseudo_header;
+
+ icmp = (icmp_v6_t *) ((u8 *)ipv6 + IPV6_HDR_LEN);
+ data = (u16 *) icmp;
+ icmp->checksum = 0;
+
+#if 1
+ if (PREDICT_FALSE((ip_payload_size%2) != 0)) {
+ num_hwords += 1;
+ /* Append 0's in the last octet */
+ *((u8 *)data + ip_payload_size) = 0;
+ }
+#endif
+
+ /* construct the pseudo header */
+
+ pseudo_header.src_addr[0] = ipv6->src_addr[0];
+ pseudo_header.src_addr[1] = ipv6->src_addr[1];
+ pseudo_header.src_addr[2] = ipv6->src_addr[2];
+ pseudo_header.src_addr[3] = ipv6->src_addr[3];
+ pseudo_header.dst_addr[0] = ipv6->dst_addr[0];
+ pseudo_header.dst_addr[1] = ipv6->dst_addr[1];
+ pseudo_header.dst_addr[2] = ipv6->dst_addr[2];
+ pseudo_header.dst_addr[3] = ipv6->dst_addr[3];
+ pseudo_header.payload_length = spp_host_to_net_byte_order_16(ip_payload_size);
+ pseudo_header.next_header = spp_host_to_net_byte_order_16(ipv6->next_header);
+
+ data1 = (u16 *) &pseudo_header;
+
+ /* sizeof(pseudo_v6_header_t) = 36 */
+ for (i = 0; i < 18; i++) {
+ checksum += (u32)spp_net_to_host_byte_order_16(data1++);
+ }
+
+    while (num_hwords) {
+        checksum += (u32)spp_net_to_host_byte_order_16(data);
+        num_hwords--;
+        data++;
+    }
+
+ /* Add in the carry of the original sum */
+ checksum = (checksum & 0xFFFF) + (checksum >> 16);
+ /* Add in the carry of the final sum */
+ checksum = (checksum & 0xFFFF) + (checksum >> 16);
+ /* Now produce the 1's complement */
+ icmp->checksum = spp_host_to_net_byte_order_16(((u16) (~(u16)checksum)));
+}
+
+void icmp_error_generate_v6 (spp_ctx_t *ctx, u8 icmp_type,
+ u8 icmp_code, u16 uidb_index) {
+
+ u16 ip_hdr_len, ip_payload_size;
+ icmp_v6_t *icmp;
+ int i;
+ ipv6_header_t *ip_old, *ip_new;
+ u16 icmp_payload_len;
+
+    /*
+     * As per RFC 2463, the maximum size of a generated ICMPv6 message is
+     * limited to 1280 bytes; hence, if the packet is bigger than 1280, it
+     * needs to be truncated. Also, if the packet had multiple chained
+     * buffers, we need to free all chained buffers except the first one.
+     */
+ free_all_but_first_chained_buffers(ctx);
+
+ ip_hdr_len = IPV6_HDR_LEN;
+ /* offset to ip payload */
+
+ ip_old = (ipv6_header_t *)PLATFORM_CTX_CURRENT_HDR;
+ ip_new = (ipv6_header_t *) ((u8 *) PLATFORM_CTX_CURRENT_HDR - ICMPV6_ERR_SIZE);
+ icmp = (icmp_v6_t*) ( (u8*)ip_new + ip_hdr_len);
+
+ icmp_payload_len = ip_hdr_len +
+ spp_net_to_host_byte_order_16(&(ip_old->payload_length)) ;
+
+ ip_payload_size = ICMPV6_HDR_SIZE + icmp_payload_len;
+    /*
+     * There is no easy way to predict this case, as the probability that an
+     * IPv6 pkt is big depends on the type of traffic. Optimize the big
+     * pkt case, as it involves more processing.
+     *
+     * If the pkt size exceeds IPV6_MIN_PATH_MTU, truncate it to IPV6_MIN_PATH_MTU
+     */
+ if (PREDICT_TRUE((ip_payload_size + ip_hdr_len) > IPV6_MIN_PATH_MTU)) {
+ ip_payload_size = IPV6_MIN_PATH_MTU - ip_hdr_len;
+ }
+
+    /* The following ICMP writes must come after the IP header has been copied */
+ icmp->type = icmp_type;
+ icmp->code = icmp_code;
+
+ ip_new->version_trafficclass_flowlabel = spp_host_to_net_byte_order_32(
+ VERSION_TRAFFICCLASS_FLOWLABEL);
+ ip_new->payload_length = spp_host_to_net_byte_order_16(ip_payload_size);
+ ip_new->next_header = IPV6_PROTO_ICMPV6;
+ ip_new->hop_limit = 64;
+ ip_new->dst_addr[0] = ip_old->src_addr[0];
+ ip_new->dst_addr[1] = ip_old->src_addr[1];
+ ip_new->dst_addr[2] = ip_old->src_addr[2];
+ ip_new->dst_addr[3] = ip_old->src_addr[3];
+
+ ip_new->src_addr[0] =
+ spp_host_to_net_byte_order_32(svi_params_array[uidb_index].ipv6_addr[0]);
+ ip_new->src_addr[1] =
+ spp_host_to_net_byte_order_32(svi_params_array[uidb_index].ipv6_addr[1]);
+ ip_new->src_addr[2] =
+ spp_host_to_net_byte_order_32(svi_params_array[uidb_index].ipv6_addr[2]);
+ ip_new->src_addr[3] =
+ spp_host_to_net_byte_order_32(svi_params_array[uidb_index].ipv6_addr[3]);
+ /* calc checksum for icmp */
+
+ calc_v6_icmp_checksum(ip_new, ip_payload_size);
+#if 0
+ printf("Flow = 0x%x\n", ip_new->version_trafficclass_flowlabel);
+ printf("Hoplimit = 0x%x\n", ip_new->hop_limit);
+ printf("Length= 0x%x\n", ip_new->payload_length);
+ printf("Next header = 0x%x\n", ip_new->next_header);
+ printf("Src add0 = 0x%x\n", ip_new->src_addr[0]);
+ printf("Src add1 = 0x%x\n", ip_new->src_addr[1]);
+ printf("Src add2 = 0x%x\n", ip_new->src_addr[2]);
+ printf("Src add3 = 0x%x\n", ip_new->src_addr[3]);
+ printf("Dst add0 = 0x%x\n", ip_new->dst_addr[0]);
+ printf("Dst add1 = 0x%x\n", ip_new->dst_addr[1]);
+ printf("Dst add2 = 0x%x\n", ip_new->dst_addr[2]);
+ printf("Dst add3 = 0x%x\n", ip_new->dst_addr[3]);
+ printf("Icmp type = 0x%x\n", icmp->type);
+ printf("Icmp code = 0x%x\n", icmp->code);
+
+ printf("\n\nICMP packet:\n");
+ for (i = 0; i < 10; i ++) {
+ printf("0x%x " , *((u8 *)icmp + i));
+ if ((i%16) == 15) {
+ printf("\n");
+ }
+ }
+#endif
+
+ ctx->current_header -= ICMPV6_ERR_SIZE;
+ ctx->current_length = ip_payload_size + ip_hdr_len;
+ PLATFORM_CNAT_SET_TX_VRF(ctx,uidb_index);
+}
+
+void icmp_error_generate_v2 (ipv4_header *ip, u8 icmp_type,
+ u8 icmp_code, u16 mtu, u32 src_ip)
+{
+
+ u16 ip_hdr_len, ip_payload_size;
+ u32 *src_p, * dst_p;
+ icmp_v4_t *icmp;
+
+ ip_hdr_len = (ip->version_hdr_len_words & 0xf) << 2; /* offset to ip payload */
+ icmp = (icmp_v4_t*) ( (u8*)ip + ip_hdr_len);
+ ip_payload_size = sizeof(icmp_v4_t) + ip_hdr_len +
+ ICMP_UNREACHABLE_IP_PAYLOAD_SIZE;
+
+ src_p = (u32*)
+ ((u8*)ip + ip_hdr_len + ICMP_UNREACHABLE_IP_PAYLOAD_SIZE - 4);
+ dst_p = (u32*) ((u8*)src_p + sizeof(ipv4_header) +
+ sizeof(icmp_v4_t));
+
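+    /*
+     * Shift the original IP header plus the leading payload bytes toward
+     * higher addresses, copying word by word from the end to handle the
+     * overlap, making room in front for the new outer IP + ICMP headers.
+     */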
+ while(src_p >= (u32*)ip) *dst_p-- = *src_p--;
+
+    /* The following ICMP writes must come after the original IP header has been copied */
+ icmp->type = icmp_type;
+ icmp->code = icmp_code;
+ icmp->identifier = 0;
+ icmp->sequence = 0;
+ if(PREDICT_FALSE(mtu != 0)) {
+ icmp->sequence = spp_host_to_net_byte_order_16(mtu);
+ }
+
+
+    /* build the outer IP header; keep the original tos and identification values */
+ ip->version_hdr_len_words = 0x45;
+ ip->total_len_bytes = sizeof(ipv4_header) + ip_payload_size;
+ ip->total_len_bytes = spp_host_to_net_byte_order_16(ip->total_len_bytes);
+ ip->frag_flags_offset = 0;
+ ip->ttl = 64;
+ ip->protocol = ICMP_PROT;
+ ip->checksum = 0;
+ ip->dest_addr = ip->src_addr;
+ ip->src_addr = spp_host_to_net_byte_order_32(src_ip);
+
+ /* calc checksum for ip and icmp */
+
+ calc_ipv4_checksum(ip);
+ calc_v4_icmp_checksum( (icmp_v4_t *) ((u8*) ip + sizeof(ipv4_header)),
+ ip_payload_size);
+}
+
+void icmp_error_generate (ipv4_header *ip, u8 icmp_type,
+ u8 icmp_code, u16 uidb_index) {
+
+ u16 ip_hdr_len, ip_payload_size;
+ u32 *src_p, * dst_p;
+ icmp_v4_t *icmp;
+
+ ip_hdr_len = (ip->version_hdr_len_words & 0xf) << 2; /* offset to ip payload */
+ icmp = (icmp_v4_t*) ( (u8*)ip + ip_hdr_len);
+ ip_payload_size = sizeof(icmp_v4_t) + ip_hdr_len +
+ ICMP_UNREACHABLE_IP_PAYLOAD_SIZE;
+
+ src_p = (u32*)
+ ((u8*)ip + ip_hdr_len + ICMP_UNREACHABLE_IP_PAYLOAD_SIZE - 4);
+ dst_p = (u32*) ((u8*)src_p + sizeof(ipv4_header) +
+ sizeof(icmp_v4_t));
+
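+    /* Shift the original header forward, as in icmp_error_generate_v2,
+     * to make room for the new outer IP + ICMP headers. */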
+ while(src_p >= (u32*)ip) *dst_p-- = *src_p--;
+
+    /* The following ICMP writes must come after the original IP header has been copied */
+ icmp->type = icmp_type;
+ icmp->code = icmp_code;
+ icmp->identifier = 0;
+ icmp->sequence = 0;
+
+
+    /* build the outer IP header; keep the original tos and identification values */
+ ip->version_hdr_len_words = 0x45;
+ ip->total_len_bytes = sizeof(ipv4_header) + ip_payload_size;
+ ip->total_len_bytes = spp_host_to_net_byte_order_16(ip->total_len_bytes);
+ ip->frag_flags_offset = 0;
+ ip->ttl = 64;
+ ip->protocol = ICMP_PROT;
+ ip->checksum = 0;
+ ip->dest_addr = ip->src_addr;
+
+ ip->src_addr = spp_host_to_net_byte_order_32(svi_params_array[uidb_index].ipv4_addr);
+
+ /* calc checksum for ip and icmp */
+
+ calc_ipv4_checksum(ip);
+ calc_v4_icmp_checksum( (icmp_v4_t *) ((u8*) ip + sizeof(ipv4_header)),
+ ip_payload_size);
+#if 0
+ printf("version_hdr_len_words = 0x%x\n", ip->version_hdr_len_words);
+ printf("total_len_bytes = 0x%x\n", ip->total_len_bytes);
+ printf("Frag = 0x%x\n", ip->frag_flags_offset);
+ printf("ttl = 0x%x\n", ip->ttl);
+ printf("Protocol = 0x%x\n", ip->protocol);
+ printf("checksum = 0x%x\n", ip->checksum);
+ printf("Dest addr = 0x%x\n", ip->dest_addr);
+ printf("Src addr = 0x%x\n", ip->src_addr);
+ printf("Icmp type = 0x%x\n", icmp->type);
+ printf("Icmp code = 0x%x\n", icmp->code);
+#endif
+
+}
+
+int icmpv4_generate_with_throttling_v2 (spp_ctx_t *ctx, ipv4_header *ipv4,
+ int icmp_type, int icmp_code,
+ u16 mtu, u32 src_ip)
+{
+ u16 rx_uidb_index = ctx->ru.rx.uidb_index;
+ if (icmp_msg_gen_allowed()) {
+ free_all_but_first_chained_buffers(ctx);
+ icmp_error_generate_v2(ipv4, icmp_type, icmp_code, mtu, src_ip);
+ ctx->current_length = (u16)
+ ((u8*)ctx->current_header - ctx->packet_data) +
+ spp_net_to_host_byte_order_16(&ipv4->total_len_bytes);
+ PLATFORM_CNAT_SET_TX_VRF(ctx,rx_uidb_index);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+int icmpv4_generate_with_throttling (spp_ctx_t *ctx, ipv4_header *ipv4,
+ u16 rx_uidb_index)
+{
+ int icmp_type;
+ int icmp_code;
+
+ if (icmp_msg_gen_allowed()) {
+        /* The ICMP error is small, so one buffer is enough; free the rest */
+ free_all_but_first_chained_buffers(ctx);
+
+ icmp_type = ICMPV4_TIMEEXCEEDED;
+ icmp_code = ICMPV4_TIMTTL;
+ icmp_error_generate(ipv4, icmp_type, icmp_code, rx_uidb_index);
+ ctx->current_length = (u16)
+ ((u8*)ctx->current_header - ctx->packet_data) +
+ spp_net_to_host_byte_order_16(&ipv4->total_len_bytes);
+ PLATFORM_CNAT_SET_TX_VRF(ctx,rx_uidb_index);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+int icmpv4_generate_with_throttling_v1 (spp_ctx_t *ctx, ipv4_header *ipv4,
+ u16 rx_uidb_index, u32 type, u32 code)
+{
+ if (icmp_msg_gen_allowed()) {
+        /* The ICMP error is small, so one buffer is enough; free the rest */
+ free_all_but_first_chained_buffers(ctx);
+
+ icmp_error_generate(ipv4, type, code, rx_uidb_index);
+ ctx->current_length = (u16)
+ ((u8*)ctx->current_header - ctx->packet_data) +
+ spp_net_to_host_byte_order_16(&ipv4->total_len_bytes);
+ PLATFORM_CNAT_SET_TX_VRF(ctx,rx_uidb_index);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+
+int icmpv6_generate_with_throttling (spp_ctx_t *ctx, ipv6_header_t *ipv6,
+ u16 rx_uidb_index)
+{
+ int icmp_type;
+ int icmp_code;
+
+ if (v6_icmp_msg_gen_allowed()) {
+ icmp_type = ICMPV6_TIMEEXCEEDED;
+ icmp_code = ICMPV6_TIMTTL;
+ icmp_error_generate_v6(ctx, icmp_type, icmp_code, rx_uidb_index);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+int icmpv6_generate_with_throttling_v1 (spp_ctx_t *ctx, ipv6_header_t *ipv6,
+ u16 rx_uidb_index, u32 type, u32 code)
+{
+
+ if (v6_icmp_msg_gen_allowed()) {
+ icmp_error_generate_v6(ctx, type, code, rx_uidb_index);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+#endif
+
+inline
+void calculate_window_scale(tcp_hdr_type *tcp_header, u8 *scale) {
+
+ u8 check_options = 0;
+
+ *scale = 0;
+ check_options = ((tcp_header->flags & TCP_FLAG_SYN) &&
+ (((tcp_header->hdr_len>>4) << 2) > sizeof(tcp_hdr_type)));
+
+ if (PREDICT_FALSE(check_options)) {
+ u8 *options_ptr = tcp_findoption(tcp_header, TCP_OPTION_WINDOW_SCALE);
+
+        /*
+         * TCP window-scale option: | kind=3 | len=3 | shift 1B |
+         * e.g. the bytes 03 03 07 encode a shift count of 7.
+         * Since kind == len == 3 for this option, checking the length
+         * byte against TCP_OPTION_WINDOW_SCALE works here.
+         */
+ if (PREDICT_TRUE(options_ptr &&
+ (options_ptr[1] == TCP_OPTION_WINDOW_SCALE))) {
+ u8 *ptr = (u8*)(options_ptr + 2);
+ *scale = *ptr;
+
+ if(PREDICT_FALSE(*scale >= 14)) {
+ *scale = 14;
+ }
+
+ return;
+ }
+ }
+}
+
+#if 0
+ALWAYS_INLINE(
+void cnat_log_nat44_tcp_seq_mismatch(
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap))
+{
+ /* As of now, Netflow does not require this to be logged
+ * So only syslog
+ */
+ if(PREDICT_TRUE(db->flags & CNAT_TAC_SEQ_MISMATCH)) {
+ /* Already logged ..*/
+ return;
+ }
+ /* else, set the flag and call the log API */
+
+ db->flags = db->flags | CNAT_TAC_SEQ_MISMATCH;
+
+ cnat_syslog_nat44_tcp_seq_mismatch(db, vrfmap);
+}
+
+
+static int cnat_util_init (void *notused)
+{
+ /* run SPP_API_CNAT_PORTMAP_CREATE first*/
+ spp_msg_api_set_handler(SPP_API_CNAT_PORT_ALLOCATE,
+ spp_api_cnat_port_allocate_t_handler);
+
+
+ spp_msg_api_set_handler(SPP_API_CNAT_PORT_CLEAR,
+ spp_api_cnat_port_clear_t_handler);
+
+ /* run vrfmap config first */
+ spp_msg_api_set_handler(SPP_API_CNAT_PORT_CREATE,
+ spp_api_cnat_port_create_t_handler);
+
+ spp_msg_api_set_handler(SPP_API_CNAT_PORT_DELETE,
+ spp_api_cnat_port_delete_t_handler);
+ return 0;
+}
+
+void
+print_ipv6_pkt (ipv6_header_t *ip)
+{
+ u32 i, total_len, l4_len=0;
+
+ u8 *pkt = (u8 *) ip;
+
+ total_len = spp_net_to_host_byte_order_16(&ip->payload_length);
+
+ /* we rarely need to debug > 200 bytes of packet */
+ if(total_len > 200) {
+ total_len = 200;
+ }
+
+ printf("\n======== PRINTING PKT START======\n");
+ printf("======== IPv6 PAYLOAD LEN %d ===========\n", total_len);
+ for (i=0; i < 40; i++) {
+ printf(" %02X ", *(pkt + i));
+ if(i%16==15)
+ printf("\n");
+ }
+
+ if (ip->next_header == IPV6_PROTO_TCP) {
+ printf("\n======== TCP HEADER =================\n");
+ l4_len = 20;
+ }
+ else if (ip->next_header == IPV6_PROTO_UDP) {
+ printf("\n======== UDP HEADER =================\n");
+ l4_len = 8;
+ }
+ else if (ip->next_header == IPV6_PROTO_ICMPV6) {
+ printf("\n======== ICMP HEADER =================\n");
+ l4_len = 8;
+ }
+
+ for (i=40; i < (l4_len + 40); i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+
+ printf("\n======== LAYER4 PAYLOAD ===================\n");
+ for (i=(l4_len + 40); i < total_len; i++) {
+ printf(" %02X ", *(pkt + i));
+ if(i%16==15)
+ printf("\n");
+ }
+
+ printf("\n======== PRINTING PKT END =======\n");
+}
+
+
+
+PLATFORM_SPP_INIT_FUNCTION(cnat_util_init);
+#endif
diff --git a/vnet/vnet/vcgn/cnat_v4_ftp_alg.h b/vnet/vnet/vcgn/cnat_v4_ftp_alg.h
new file mode 100644
index 00000000000..df3dfcb0797
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_v4_ftp_alg.h
@@ -0,0 +1,133 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_v4_ftp_alg.h
+ *
+ * Copyright (c) 2012-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_V4_FTP_ALG_H__
+#define __CNAT_V4_FTP_ALG_H__
+
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+
+#include "tcp_header_definitions.h"
+#include "dslite_defs.h"
+#include "dslite_db.h"
+
+/* shorter form of byte order functions */
+
+#define net2host16(x) clib_net_to_host_u16( x)
+#define net2host32(x) clib_net_to_host_u32( x)
+#define net2host64(x) clib_net_to_host_u64( x)
+#define host2net16(x) clib_host_to_net_u16(x)
+#define host2net32(x) clib_host_to_net_u32(x)
+#define host2net64(x) clib_host_to_net_u64(x)
+
+//#define BIGENDIAN
+
+typedef struct iphdrtype_ {
+ u8 v_ihl; /* version and IP header length */
+ u8 tos; /* type of service */
+ u16 tl; /* total length */
+ u16 id; /* identifier */
+ u16 ipreserved: 1;
+ u16 dontfragment: 1;
+ u16 morefragments: 1;
+ u16 fo: 13; /* fragment offset */
+ u8 ttl; /* time to live */
+ u8 prot; /* protocol type */
+ u16 checksum; /* checksum */
+ u32 srcadr; /* IP source address */
+ u32 dstadr; /* IP destination address */
+} iphdrtype;
+
+
+typedef struct tcptype_ {
+ u16 sourceport;
+ u16 destinationport;
+ u32 sequencenumber;
+ u32 acknowledgementnumber;
+ u8 dataoffset;
+ u8 flags;
+#if 0
+/* bypass the ENDIAN part */
+#ifdef BIGENDIAN
+ u8 reserved: 2;
+ u8 urg: 1;
+ u8 ack: 1;
+ u8 psh: 1;
+ u8 rst: 1;
+ u8 syn: 1;
+ u8 fin: 1;
+#else
+ u8 fin: 1;
+ u8 syn: 1;
+ u8 rst: 1;
+ u8 psh: 1;
+ u8 ack: 1;
+ u8 urg: 1;
+ u8 reserved2: 2;
+#endif
+#endif
+
+ u16 window;
+ u16 checksum;
+ u16 urgentpointer;
+ u8 data[0];
+} tcptype ;
+
+
+int watch_ftp_port_cmd (iphdrtype *ip,
+ tcptype *tcp,
+ u32 * ip_addr,
+ u16 * port);
+
+
+u8 * ftp_test_pkt_gen (u32 ip_addr, u16 port);
+
+int update_ftp_port(u8 * pkt, u32 new_ip, u16 new_port, i8 * delta,
+ cnat_main_db_entry_t *db_tcp_control,
+ dslite_table_entry_t *dslite_entry_ptr,
+ ipv6_header_t *ipv6_hdr);
+/*
+ * The caller must first check that this is an FTP packet.
+ * This function returns 1 if the packet was updated for the PORT
+ * command, and 0 otherwise.
+ * It assumes the IP header DOES NOT have option fields.
+ */
+
+int cnat_ftp_alg ( u8* pkt, i8 * delta, cnat_main_db_entry_t *db,
+ dslite_table_entry_t *dslite_entry_ptr,
+ ipv6_header_t *ipv6_hdr);
+
+#define FTP_ALG_DEBUG_PRINTF_ENABLED 1
+
+#ifdef FTP_ALG_DEBUG_PRINTF_ENABLED
+
+#define FTP_ALG_DEBUG_PRINTF(...) { \
+ if (global_debug_flag & CNAT_DEBUG_FTP_ALG) { \
+ printf(__VA_ARGS__); \
+ } }
+
+#else
+
+#define FTP_ALG_DEBUG_PRINTF(...)
+
+#endif
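+
+/*
+ * Usage sketch (hypothetical message text): expands to a printf gated on
+ * the CNAT_DEBUG_FTP_ALG bit of global_debug_flag, e.g.
+ *     FTP_ALG_DEBUG_PRINTF("PORT cmd seen, delta %d", delta)
+ */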
+
+#endif /* __CNAT_V4_FTP_ALG_H__ */
diff --git a/vnet/vnet/vcgn/cnat_v4_functions.c b/vnet/vnet/vcgn/cnat_v4_functions.c
new file mode 100644
index 00000000000..d3051fba5a7
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_v4_functions.c
@@ -0,0 +1,364 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_v4_functions.c
+ *
+ * Copyright (c) 2008-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+
+
+#include "tcp_header_definitions.h"
+#include "cnat_db.h"
+#include "cnat_config.h"
+#include "cnat_v4_functions.h"
+#include "dslite_defs.h"
+#include "dslite_db.h"
+
+static u32 tcp_logging_count;
+static u32 tcp_logging_overflow;
+
+static tcp_logging_struct_t tcp_logging_array[MAX_TCP_LOGGING_COUNT];
+
+/*
+ * Function to log TCP packet checksum changes
+ */
+void
+tcp_debug_logging (
+ u32 seq_num,
+ u32 ack_num,
+ u32 old_ip,
+ u32 new_ip,
+ u16 old_port,
+ u16 new_port,
+ u16 old_ip_crc,
+ u16 new_ip_crc,
+ u16 old_tcp_crc,
+ u16 new_tcp_crc)
+{
+ tcp_logging_array[tcp_logging_count].seq_num = seq_num;
+ tcp_logging_array[tcp_logging_count].ack_num = ack_num;
+ tcp_logging_array[tcp_logging_count].old_ip = old_ip;
+ tcp_logging_array[tcp_logging_count].new_ip = new_ip;
+ tcp_logging_array[tcp_logging_count].old_port = old_port;
+ tcp_logging_array[tcp_logging_count].new_port = new_port;
+ tcp_logging_array[tcp_logging_count].old_ip_crc = old_ip_crc;
+ tcp_logging_array[tcp_logging_count].new_ip_crc = new_ip_crc;
+ tcp_logging_array[tcp_logging_count].old_tcp_crc = old_tcp_crc;
+ tcp_logging_array[tcp_logging_count].new_tcp_crc = new_tcp_crc;
+
+ tcp_logging_count++;
+
+ if (tcp_logging_count >= MAX_TCP_LOGGING_COUNT) {
+ tcp_logging_overflow = 1;
+ tcp_logging_count = 0;
+ }
+}
+
+/*
+ * Function to dump the logged TCP packets
+ */
+void
+tcp_debug_logging_dump (void)
+{
+ u32 i, total_count, start_entry;
+
+ if (tcp_logging_overflow) {
+ total_count = MAX_TCP_LOGGING_COUNT;
+ start_entry = tcp_logging_count;
+ printf("Logging Entries Wrapped Around, displaying %d entries\n",
+ total_count);
+ } else {
+ total_count = tcp_logging_count;
+ start_entry = 0;
+ printf("Displaying %d entries\n", total_count);
+ }
+
+ printf("SEQ ACK IP_O IP_N PORT_O PORT_N L3_CRC_O L3_CRC_N L4_CRC_O L4_CRC_N\n");
+
+ for (i = 0; i < total_count; i++) {
+ u32 entry = (i + start_entry) % MAX_TCP_LOGGING_COUNT;
+
+ printf("%04d: 0x%08x 0x%08x 0x%08x 0x%08x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n",
+ entry,
+ tcp_logging_array[entry].seq_num,
+ tcp_logging_array[entry].ack_num,
+ tcp_logging_array[entry].old_ip,
+ tcp_logging_array[entry].new_ip,
+ tcp_logging_array[entry].old_port,
+ tcp_logging_array[entry].new_port,
+ tcp_logging_array[entry].old_ip_crc,
+ tcp_logging_array[entry].new_ip_crc,
+ tcp_logging_array[entry].old_tcp_crc,
+ tcp_logging_array[entry].new_tcp_crc);
+ }
+}
+
+/*
+ * Function to enable TCP logging
+ */
+void
+tcp_debug_logging_enable_disable (u32 enable_flag)
+{
+ switch (enable_flag) {
+
+ case TCP_LOGGING_DISABLE:
+ if (tcp_logging_enable_flag == TCP_LOGGING_DISABLE) {
+ printf("\nTCP Logging ALREADY DISABLED\n");
+ } else {
+ printf("\nTCP Logging DISABLED\n");
+ }
+ tcp_logging_enable_flag = 0;
+ break;
+
+ case TCP_LOGGING_ENABLE:
+ if (tcp_logging_enable_flag == TCP_LOGGING_ENABLE) {
+ printf("\nTCP Logging ALREADY ENABLED\n");
+ } else {
+ tcp_logging_enable_flag = 1;
+ tcp_logging_count = 0;
+ tcp_logging_overflow = 0;
+
+ printf("\nTCP Logging ENABLED\n");
+ }
+ break;
+
+ case TCP_LOGGING_PACKET_DUMP:
+ tcp_debug_logging_dump();
+ break;
+
+ case TCP_LOGGING_SUMMARY_DUMP:
+ default:
+ printf("\ntcp_logging_enable_flag %d, tcp_log_count %d\n",
+ tcp_logging_enable_flag, tcp_logging_count);
+ printf("To Enable TCP LOGGING provide a flag value of %d\n",
+ TCP_LOGGING_ENABLE);
+ break;
+ }
+}
+
+void hex_dump (u8 * p, int len) {
+ int i;
+ for (i=0;i<len;i++) {
+        if(i && (i & 0x3 ) == 0) PLATFORM_DEBUG_PRINT(" ");
+        if(i && (i & 0xf ) == 0) PLATFORM_DEBUG_PRINT("\n");
+ PLATFORM_DEBUG_PRINT("%02X ", p[i]);
+ }
+ PLATFORM_DEBUG_PRINT("\n");
+}
+
+void
+print_icmp_pkt (ipv4_header *ip)
+{
+ u32 i, total_len;
+
+ u8 *pkt = (u8 *) ip;
+
+ total_len = clib_net_to_host_u16(ip->total_len_bytes);
+
+ printf("\n======== PRINTING PKT START======\n");
+ printf("======== IP PACKET LEN %d ===========\n", total_len);
+ for (i=0; i < 20; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+
+ printf("\n======== ICMP HEADER =================\n");
+ for (i=20; i < 28; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+
+ printf("\n======== ICMP BODY ===================\n");
+ for (i=28; i < total_len; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+
+ printf("\n======== PRINTING PKT END =======\n");
+}
+
+void
+print_udp_pkt (ipv4_header *ip)
+{
+ u32 i, total_len, udp_len;
+
+ u8 *pkt = (u8 *) ip;
+
+ total_len = clib_net_to_host_u16(ip->total_len_bytes);
+ udp_len = total_len - 20;
+
+ printf("\n======== PRINTING PKT START======\n");
+ printf("======== IP PACKET LEN %d ===========\n", total_len);
+ for (i=0; i < 20; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+ printf("\n======== UDP PSEUDO HEADER ==========\n");
+ for (i=12; i < 20; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+ printf(" 00 11 %02X %02X ", udp_len >> 8, udp_len & 0xff);
+
+ printf("\n======== UDP HEADER =================\n");
+ for (i=20; i < 28; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+ printf("\n======== UDP BODY ===================\n");
+ for (i=28; i < total_len; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+
+ printf("\n======== PRINTING PKT END =======\n");
+}
+
+void
+print_tcp_pkt (ipv4_header *ip)
+{
+ u32 i, total_len, tcp_len;
+
+ u8 *pkt = (u8 *) ip;
+
+ total_len = clib_net_to_host_u16(ip->total_len_bytes);
+ tcp_len = total_len - 20;
+
+ printf("\n======== PRINTING PKT START======\n");
+ printf("======== IP PACKET LEN %d ===========\n", total_len);
+ for (i=0; i < 20; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+ printf("\n======== TCP PSEUDO HEADER ==========\n");
+ for (i=12; i < 20; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+ printf(" 00 06 %02X %02X ", tcp_len >> 8, tcp_len & 0xff);
+
+ printf("\n======== TCP HEADER =================\n");
+ for (i=20; i < 40; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+ printf("\n======== TCP BODY ===================\n");
+ for (i=40; i < total_len; i++) {
+ printf(" %02X ", *(pkt + i));
+ }
+
+ printf("\n======== PRINTING PKT END =======\n");
+}
+
+/* IN: ipv4 and tcp header pointer,
+ * new ipv4 addr and port value
+ * main db index for accessing per vrf mss value
+ * DO:
+ * NAT
+ * mss adjust if needed
+ * ip & tcp checksum update (incremental)
+ */
+
+inline void tcp_in2out_nat_mss_n_checksum (ipv4_header * ip,
+ tcp_hdr_type * tcp,
+ u32 ipv4_addr,
+ u16 port,
+ cnat_main_db_entry_t * db)
+{
+ u8 *mss_ptr;
+ u8 check_mss = 0;
+ u16 mss_old, mss_new;
+ cnat_vrfmap_t * vrf_map_p;
+
+ cnat_v4_recalculate_tcp_checksum(ip,
+ tcp,
+ &(ip->src_addr),
+ &(tcp->src_port),
+ ipv4_addr,
+ port);
+ u16 frag_offset =
+ clib_net_to_host_u16(ip->frag_flags_offset);
+
+ if(PREDICT_FALSE(frag_offset & IP_FRAG_OFFSET_MASK)) {
+ return; /* No TCP Header at all */
+ }
+
+ /*
+ * check SYN bit and if options field is present
+ * If yes, proceed to extract the options and get TCP MSS value
+ */
+ check_mss = ((tcp->flags & TCP_FLAG_SYN) &&
+ (((tcp->hdr_len>>4) << 2) > sizeof(tcp_hdr_type)));
+
+ if (PREDICT_FALSE(check_mss)) {
+
+ /* get per VRF mss config */
+ if(PREDICT_FALSE(db->flags & (CNAT_DB_DSLITE_FLAG))) {
+ mss_new = dslite_table_db_ptr[db->dslite_nat44_inst_id].tcp_mss;
+ } else {
+ vrf_map_p = cnat_map_by_vrf + db->vrfmap_index;
+ mss_new = vrf_map_p->tcp_mss;
+ }
+ DSLITE_PRINTF(1, "Check MSS true..%u\n", mss_new);
+ /*
+ * If TCP MSS is not configured, skip the MSS checks
+ */
+ if (PREDICT_FALSE(mss_new != V4_TCP_MSS_NOT_CONFIGURED_VALUE)) {
+
+ /* if mss_ptr != NULL, then it points to MSS option */
+ mss_ptr = tcp_findoption(tcp, TCP_OPTION_MSS);
+
+            /*
+             * TCP MSS option: | kind=2 | len=4 | MSS value 2B |
+             * (kinds 0 and 1 have no length byte)
+             */
+ if (PREDICT_TRUE(mss_ptr && (mss_ptr[1] == 4))) {
+
+ u16 *ptr = (u16*)(mss_ptr + 2);
+
+ mss_old = clib_net_to_host_u16(*ptr);
+
+ if (PREDICT_FALSE(mss_old > mss_new)) {
+ u32 sum32;
+ u16 mss_old_r, old_tcp_checksum_r;
+
+ *ptr = clib_host_to_net_u16(mss_new);
+
+ mss_old_r = ~mss_old;
+
+ old_tcp_checksum_r =
+ ~clib_net_to_host_u16(tcp->tcp_checksum);
+
+ /*
+ * Revise the TCP checksum
+ */
+ sum32 = old_tcp_checksum_r + mss_old_r + mss_new;
+ FILL_CHECKSUM(tcp->tcp_checksum, sum32)
+
+ if (PREDICT_FALSE(tcp_logging_enable_flag)) {
+ tcp_debug_logging(
+ clib_net_to_host_u32(tcp->seq_num),
+ clib_net_to_host_u32(tcp->ack_num),
+ 0,
+ 0,
+ mss_old,
+ mss_new,
+ 0,
+ 0,
+ ~old_tcp_checksum_r,
+ clib_net_to_host_u16(tcp->tcp_checksum));
+ }
+ }
+ }
+ }
+ }
+}
+
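+/* Placeholder: returns a fixed address (1.1.1.1) rather than looking up
+ * the real SVI interface address. */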
+u32 get_my_svi_intf_ip_addr() {
+ return 0x01010101;
+}
diff --git a/vnet/vnet/vcgn/cnat_v4_functions.h b/vnet/vnet/vcgn/cnat_v4_functions.h
new file mode 100644
index 00000000000..047fe33cc4c
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_v4_functions.h
@@ -0,0 +1,342 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_v4_functions.h
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_V4_FUNCTIONS__
+#define __CNAT_V4_FUNCTIONS__
+
+#include "tcp_header_definitions.h"
+#include "cnat_db.h"
+#include "spp_ctx.h"
+
+#include "platform_common.h"
+
+/*
+ * Defines and structures to enable TCP packet logging
+ */
+#define TCP_LOGGING_DISABLE 0
+#define TCP_LOGGING_ENABLE 1
+#define TCP_LOGGING_PACKET_DUMP 2
+#define TCP_LOGGING_SUMMARY_DUMP 3
+
+#define MAX_TCP_LOGGING_COUNT 1024
+
+typedef struct tcp_logging_struct {
+ u32 seq_num;
+ u32 ack_num;
+ u32 old_ip;
+ u32 new_ip;
+ u16 old_port;
+ u16 new_port;
+ u16 old_ip_crc;
+ u16 new_ip_crc;
+ u16 old_tcp_crc;
+ u16 new_tcp_crc;
+} tcp_logging_struct_t;
+
+void tcp_debug_logging_dump (void);
+void tcp_debug_logging_enable_disable (u32 enable_flag);
+
+void
+tcp_debug_logging (
+ u32 seq_num,
+ u32 ack_num,
+ u32 old_ip,
+ u32 new_ip,
+ u16 old_port,
+ u16 new_port,
+ u16 old_ip_crc,
+ u16 new_ip_crc,
+ u16 old_tcp_crc,
+ u16 new_tcp_crc);
+
+#define JLI printf("%s %s %d\n", __FILE__, __FUNCTION__, __LINE__); fflush(stdout);
+
+#define CNAT_ICMP_DEST_UNREACHABLE 100
+#define INCREMENT_NODE_COUNTER(c) \
+ em->counters[node_counter_base_index + c] += 1;
+
+#define V4_TCP_UPDATE_SESSION_FLAG(db, tcp) \
+if ((tcp->flags & TCP_FLAG_ACK) && (tcp->flags & TCP_FLAG_SYN)) { \
+ db->flags |= CNAT_DB_FLAG_TCP_ACTIVE; \
+} \
+if ((tcp->flags & TCP_FLAG_RST) || (tcp->flags & TCP_FLAG_FIN)) { \
+ db->flags &= ~CNAT_DB_FLAG_TCP_ACTIVE; \
+ db->flags |= CNAT_DB_FLAG_TCP_CLOSING; \
+}
+
+#define V4_TCP_UPDATE_SESSION_DB_FLAG(sdb, tcp) \
+if ((tcp->flags & TCP_FLAG_ACK) && (tcp->flags & TCP_FLAG_SYN)) { \
+ sdb->flags |= CNAT_DB_FLAG_TCP_ACTIVE; \
+} \
+if ((tcp->flags & TCP_FLAG_RST) || (tcp->flags & TCP_FLAG_FIN)) { \
+ sdb->flags &= ~CNAT_DB_FLAG_TCP_ACTIVE; \
+ sdb->flags |= CNAT_DB_FLAG_TCP_CLOSING; \
+}
+
+/*
+ * Code to recalculate the checksum after ACK/SEQ number changes.
+ * This macro assumes that a pointer to the TCP structure is in
+ * scope under the name "tcp".
+ */
+#define CNAT_UPDATE_TCP_SEQ_ACK_CHECKSUM(old_val32, new_val32) \
+{ \
+ u16 old_val_lower, old_val_upper, old_tcp_cr; \
+ u16 new_val_lower, new_val_upper, new_tcp_cr; \
+ u32 sum32; \
+ \
+ old_val_lower = ~((u16) old_val32); \
+ old_val_upper = ~((u16) (old_val32 >> 16)); \
+    old_tcp_cr = ~net2host16(tcp->tcp_checksum);              \
+ new_val_lower = (u16) new_val32; \
+ new_val_upper = (u16) (new_val32 >> 16); \
+ \
+ sum32 = old_val_lower + old_val_upper + old_tcp_cr + \
+ new_val_lower + new_val_upper; \
+ \
+ sum32 = (sum32 & 0xffff) + ((sum32 >> 16) & 0xffff); \
+ sum32 = (sum32 & 0xffff) + ((sum32 >> 16) & 0xffff); \
+ new_tcp_cr = ~((u16)sum32); \
+ \
+ tcp->tcp_checksum = host2net16(new_tcp_cr); \
+}
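+
+/*
+ * Usage sketch (hypothetical variables "old_seq" and "delta"): after
+ * rewriting the sequence number in place,
+ *     u32 old_seq = net2host32(tcp->seq_num);
+ *     tcp->seq_num = host2net32(old_seq + delta);
+ *     CNAT_UPDATE_TCP_SEQ_ACK_CHECKSUM(old_seq, old_seq + delta)
+ * the TCP checksum is patched incrementally, with "tcp" in scope as the
+ * macro requires.
+ */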
+
+/*
+ * newchecksum = ~(~oldchecksum + ~old + new)
+ * old/new for l3 checksum: ip address
+ */
+#define CNAT_UPDATE_L3_CHECKSUM_DECLARE \
+u16 old_l3_1r, old_l3_2r; \
+u16 old_l3_cr, new_l3_c; \
+u32 new32;
+
+#define CNAT_UPDATE_L3_CHECKSUM(old_l3_1, old_l3_2, old_l3_c, \
+ new_l3_1, new_l3_2) \
+old_l3_1r = ~(old_l3_1); \
+old_l3_2r = ~(old_l3_2); \
+old_l3_cr = ~(old_l3_c); \
+new32 = old_l3_cr + old_l3_1r + old_l3_2r + new_l3_1 + new_l3_2; \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new_l3_c = ~((u16)new32);
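+
+/*
+ * Usage sketch (hypothetical host-order values old_ip/new_ip): split each
+ * address into 16-bit halves and patch the IP checksum incrementally:
+ *     CNAT_UPDATE_L3_CHECKSUM_DECLARE
+ *     CNAT_UPDATE_L3_CHECKSUM((u16)(old_ip >> 16), (u16)old_ip,
+ *                             clib_net_to_host_u16(ip->checksum),
+ *                             (u16)(new_ip >> 16), (u16)new_ip)
+ *     ip->checksum = clib_host_to_net_u16(new_l3_c);
+ */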
+
+
+/*
+ * newchecksum = ~(~oldchecksum + ~old + new)
+ * old/new for l3 checksum: ip address
+ * old/new for l4 checksum: ip address and port
+ */
+#define CNAT_UPDATE_L3_L4_CHECKSUM_DECLARE \
+u16 old_l3_1r, old_l3_2r, old_l4r; \
+u16 old_l3_cr, old_l4_cr; \
+u16 new_l3_c, new_l4_c; \
+u32 sum32, new32;
+
+#define CNAT_UPDATE_L3_L4_CHECKSUM(old_l3_1, old_l3_2, old_l4, \
+ old_l3_c, old_l4_c, \
+ new_l3_1, new_l3_2, new_l4) \
+old_l3_1r = ~(old_l3_1); \
+old_l3_2r = ~(old_l3_2); \
+old_l3_cr = ~(old_l3_c); \
+sum32 = old_l3_1r + old_l3_2r + new_l3_1 + new_l3_2; \
+new32 = old_l3_cr + sum32; \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new_l3_c = ~((u16)new32); \
+old_l4r = ~(old_l4); \
+old_l4_cr = ~(old_l4_c); \
+sum32 += old_l4r + new_l4; \
+new32 = old_l4_cr + sum32; \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new_l4_c = ~((u16)new32);
+
+/*
+ * For ICMP checksums, we don't use the top IP header for checksum calculation
+ */
+#define CNAT_UPDATE_L3_ICMP_CHECKSUM(old_l3_1, old_l3_2, old_l4, \
+ old_l3_c, old_l4_c, \
+ new_l3_1, new_l3_2, new_l4) \
+old_l3_1r = ~(old_l3_1); \
+old_l3_2r = ~(old_l3_2); \
+old_l3_cr = ~(old_l3_c); \
+sum32 = old_l3_1r + old_l3_2r + new_l3_1 + new_l3_2; \
+new32 = old_l3_cr + sum32; \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new_l3_c = ~((u16)new32); \
+old_l4r = ~(old_l4); \
+old_l4_cr = ~(old_l4_c); \
+sum32 = old_l4r + new_l4; \
+new32 = old_l4_cr + sum32; \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new_l4_c = ~((u16)new32);
+
+
+/*
+ * icmp error type message:
+ * newchecksum = ~(~oldchecksum + ~old + new)
+ * old/new for outlayer ip checksum: ip address
+ * old/new for outlayer icmp checksum:
+ * out-layer: ip address
+ * inner-layer: ip addr, port, l3 checksum, l4 checksum
+ */
+#define CNAT_UPDATE_ICMP_ERR_CHECKSUM_DECLARE \
+u16 old_ip_1r, old_ip_2r, old_ip_port_r, old_ip_cr, old_icmp_cr; \
+u16 new_icmp_c; \
+u32 sum32, new32;
+
+
+#define CNAT_UPDATE_ICMP_ERR_CHECKSUM(old_ip_1, old_ip_2, old_ip_port, old_ip_c, old_icmp_c, \
+ new_ip_1, new_ip_2, new_ip_port, new_ip_c) \
+old_ip_1r = ~(old_ip_1); \
+old_ip_2r = ~(old_ip_2); \
+old_ip_port_r = ~(old_ip_port); \
+old_ip_cr = ~(old_ip_c); \
+old_icmp_cr = ~(old_icmp_c); \
+sum32 = old_ip_1r + old_ip_2r + new_ip_1 + new_ip_2 + \
+ old_ip_port_r + new_ip_port + old_ip_cr + new_ip_c; \
+new32 = old_icmp_cr + sum32; \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new32 = (new32 & 0xffff) + ((new32 >> 16) & 0xffff); \
+new_icmp_c = ~((u16)new32);
+
+/*
+ * Add the two 16 bit parts of the 32 bit field
+ * Repeat it one more time to take care of any overflow
+ * Complement the u16 value and store it in network format
+ */
+#define FILL_CHECKSUM(checksum_field, sum32) { \
+ sum32 = (sum32 & 0xffff) + ((sum32>>16) & 0xffff); \
+ sum32 = (sum32 & 0xffff) + ((sum32>>16) & 0xffff); \
+ checksum_field = clib_host_to_net_u16(~((u16) sum32)); \
+}
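+
+/*
+ * Worked example (illustrative value): for sum32 = 0x0002fffd the first
+ * fold gives 0xfffd + 0x2 = 0xffff, the second fold leaves it unchanged,
+ * and ~0xffff = 0x0000 is stored in network byte order.
+ */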
+
+static inline void
+cnat_v4_recalculate_tcp_checksum (ipv4_header *ip,
+ tcp_hdr_type *tcp,
+ u32 *ip_addr_ptr,
+ u16 *tcp_port_addr_ptr,
+ u32 new_ip,
+ u16 new_port)
+{
+ u32 old_ip_addr, old_ip32_r, new_ip32, sum32;
+ u16 old_port_r, old_ip_checksum_r, old_tcp_checksum_r;
+
+ u16 *p16;
+
+ p16 = (u16*) ip_addr_ptr;
+
+ old_ip_addr = *ip_addr_ptr;
+ old_ip32_r = (((u16) ~clib_net_to_host_u16(*p16)) +
+ ((u16) ~clib_net_to_host_u16(*(p16+1))));
+
+ old_port_r = ~clib_net_to_host_u16(*tcp_port_addr_ptr);
+
+ *ip_addr_ptr = clib_host_to_net_u32(new_ip);
+
+ new_ip32 = (new_ip & 0xffff) + ((new_ip >> 16) & 0xffff);
+
+ old_ip_checksum_r = ~clib_net_to_host_u16(ip->checksum);
+
+ /*
+ * Recalculate the new IP checksum
+ */
+ sum32 = old_ip32_r + new_ip32 + old_ip_checksum_r;
+
+ FILL_CHECKSUM(ip->checksum, sum32);
+
+ u16 frag_offset =
+ clib_net_to_host_u16((ip->frag_flags_offset));
+
+ if(PREDICT_FALSE(frag_offset & IP_FRAG_OFFSET_MASK)) {
+ return; /* No need to update TCP fields */
+ }
+
+ *tcp_port_addr_ptr = clib_host_to_net_u16(new_port);
+ old_tcp_checksum_r = ~clib_net_to_host_u16(tcp->tcp_checksum);
+
+ /*
+ * Recalculate the new TCP checksum
+ */
+ sum32 = old_ip32_r + new_ip32 +
+ old_port_r + new_port + old_tcp_checksum_r;
+
+ FILL_CHECKSUM(tcp->tcp_checksum, sum32);
+
+ if (PREDICT_FALSE(tcp_logging_enable_flag)) {
+ tcp_debug_logging(
+ clib_net_to_host_u32(tcp->seq_num),
+ clib_net_to_host_u32(tcp->ack_num),
+ clib_net_to_host_u32(old_ip_addr),
+ clib_net_to_host_u32(*ip_addr_ptr),
+ ~old_port_r,
+ clib_net_to_host_u16(*tcp_port_addr_ptr),
+ ~old_ip_checksum_r,
+ clib_net_to_host_u16(ip->checksum),
+ ~old_tcp_checksum_r,
+ clib_net_to_host_u16(tcp->tcp_checksum));
+ }
+}
+
+
+inline void tcp_in2out_nat_mss_n_checksum (ipv4_header *ip,
+ tcp_hdr_type *tcp,
+ u32 ipv4_addr,
+ u16 port,
+ cnat_main_db_entry_t * db);
+
+void hex_dump(u8 * p, int len);
+
+u32 get_my_svi_intf_ip_addr();
+
+/*
+ * In cnat_v4_icmp_gen.c:
+ * returns CNAT_ICMP_MSG if an ICMP message is allowed to be generated
+ * for this user, CNAT_NO_ICMP_MSG otherwise
+ */
+
+cnat_icmp_msg_t icmp_msg_gen_allowed ();
+
+cnat_icmp_msg_t v6_icmp_msg_gen_allowed();
+
+int v4_crc_zero_udp_allowed();
+void ipv4_decr_ttl_n_calc_csum(ipv4_header *ipv4);
+int icmpv4_generate_with_throttling (spp_ctx_t *ctx, ipv4_header *ipv4,
+ u16 rx_uidb_index);
+
+int icmpv6_generate_with_throttling (spp_ctx_t *ctx, ipv6_header_t *ipv6,
+                                     u16 rx_uidb_index);
+
+void icmp_error_generate_v6(spp_ctx_t *ctx, u8 icmp_type,
+ u8 icmp_code, u16 uidb_index);
+
+void calculate_window_scale(tcp_hdr_type *tcp_header, u8 *scale);
+
+void cnat_log_nat44_tcp_seq_mismatch(
+ cnat_main_db_entry_t *db,
+ cnat_vrfmap_t *vrfmap);
+void print_icmp_pkt (ipv4_header *ip);
+void print_udp_pkt (ipv4_header *ip);
+void print_tcp_pkt (ipv4_header *ip);
+void print_ipv6_pkt (ipv6_header_t *ip);
+
+
+#endif
+
diff --git a/vnet/vnet/vcgn/cnat_v4_pptp_alg.h b/vnet/vnet/vcgn/cnat_v4_pptp_alg.h
new file mode 100644
index 00000000000..5a6d4243165
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_v4_pptp_alg.h
@@ -0,0 +1,150 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_v4_pptp_alg.h
+ *
+ * Copyright (c) 2009-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_V4_PPTP_ALG_H__
+#define __CNAT_V4_PPTP_ALG_H__
+
+/* Debug utils of PPTP */
+#define PPTP_DBG(debug, ...) \
+ if(PREDICT_FALSE(cnat_pptp_debug_flag >= debug)) { \
+ PLATFORM_DEBUG_PRINT("%s:%s:%d - ", \
+ __FILE__, __FUNCTION__, __LINE__);\
+ PLATFORM_DEBUG_PRINT(__VA_ARGS__);\
+ PLATFORM_DEBUG_PRINT("\n"); \
+ }
+
+#define PPTP_DUMP_PACKET(ip, len) pptp_hex_dump(ip, len)
+
+
+#define PPTP_DISABLED 0
+#define PPTP_ENABLED 1
+
+#define PPTP_GRE_TIMEOUT 60 /*sec */
+
+#define TCP_PPTP_PORT 1723
+
+#define PPTP_PAC 0
+#define PPTP_PNS 1
+
+/* PPTP MSG TYPE */
+
+#define PPTP_MSG_TYPE_CONTROL 1
+#define PPTP_MSG_TYPE_MGMT 2
+
+/* PPTP control messages */
+
+/* control connection mgmt */
+#define PPTP_START_CC_RQ 1
+#define PPTP_START_CC_RP 2
+#define PPTP_STOP_CC_RQ 3
+#define PPTP_STOP_CC_RP 4
+#define PPTP_ECHO_RQ 5
+#define PPTP_ECHO_RP 6
+
+/* call mgmt */
+#define PPTP_OBOUND_CALL_RQ 7
+#define PPTP_OBOUND_CALL_RP 8
+#define PPTP_IBOUND_CALL_RQ 9
+#define PPTP_IBOUND_CALL_RP 10
+#define PPTP_IBOUND_CALL_CN 11
+#define PPTP_CALL_CLEAR_RQ 12
+#define PPTP_CALL_DISCON_NT 13
+
+/* other */
+
+#define PPTP_WAN_ERR_NT 14
+#define PPTP_SET_LINK_INF 15
+
+#define PPTP_MIN_HDR_LEN 8
+
+/* Byte offsets from start of TCP Data(PPTP header) */
+
+#define PPTP_CTRL_MGMT_TYPE_OFFSET 0x02
+#define PPTP_CC_TYPE_OFFSET 0x08
+#define PPTP_HDR_CALL_ID_OFFSET 0x0c
+#define PPTP_HDR_PEER_CALL_ID_OFFSET 0x0e
+
+#define PPTP_HDR_RESULT_CODE_OFFSET_STCCRP 0x0e
+#define PPTP_HDR_RESULT_CODE_OFFSET 0x10
+
+
+/* Offset of control/mgmt msg types
+ from start of TCP header */
+
+#define TCP_HEADER_SIZE(tcp) \
+ ((tcp->hdr_len>>4) << 2)
+
+
+#define PPTP_MSG_START_OFFSET(tcp) \
+ ((u8*)tcp + TCP_HEADER_SIZE(tcp))
+
+
+#define PPTP_CC_MSG_TYPE_OFFSET(tcp) \
+ (PPTP_MSG_START_OFFSET(tcp) + \
+ PPTP_CC_TYPE_OFFSET )
+
+#define PPTP_MGMT_MSG_TYPE_OFFSET(tcp) \
+ ( PPTP_MSG_START_OFFSET(tcp) + \
+ PPTP_CTRL_MGMT_TYPE_OFFSET )
+
+#define PPTP_CALL_ID_OFFSET(tcp) \
+ ( PPTP_MSG_START_OFFSET(tcp) + \
+ PPTP_HDR_CALL_ID_OFFSET )
+
+#define PPTP_PEER_CALL_ID_OFFSET(tcp) \
+ ( PPTP_MSG_START_OFFSET(tcp) + \
+ PPTP_HDR_PEER_CALL_ID_OFFSET )
+
+#define PPTP_RESULT_CODE_OFFSET(tcp) \
+ ( PPTP_MSG_START_OFFSET(tcp) + \
+ PPTP_HDR_RESULT_CODE_OFFSET )
+
+#define PPTP_RESULT_CODE_OFFSET_STCCRP(tcp) \
+ ( PPTP_MSG_START_OFFSET(tcp) + \
+ PPTP_HDR_RESULT_CODE_OFFSET_STCCRP)
+
+/* values */
+#define PPTP_CC_MSG_TYPE(tcp) \
+ (u16*)PPTP_CC_MSG_TYPE_OFFSET(tcp)
+
+#define PPTP_MGMT_MSG_TYPE(tcp) \
+ (u16*)PPTP_MGMT_MSG_TYPE_OFFSET(tcp)
+
+#define PPTP_CALL_ID(tcp) \
+ (u16*)PPTP_CALL_ID_OFFSET(tcp)
+
+#define PPTP_PEER_CALL_ID(tcp) \
+ (u16*)PPTP_PEER_CALL_ID_OFFSET(tcp)
+
+#define PPTP_RESULT_CODE(tcp) \
+        *(u8*)PPTP_RESULT_CODE_OFFSET(tcp)
+
+#define PPTP_RESULT_CODE_STCCRP(tcp) \
+        *(u8*)PPTP_RESULT_CODE_OFFSET_STCCRP(tcp)
+
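+/*
+ * Usage sketch (assumes a validated PPTP control packet): the accessors
+ * above yield u16 pointers into the TCP payload, e.g.
+ *     u16 cc_type = clib_net_to_host_u16(*PPTP_CC_MSG_TYPE(tcp));
+ *     u16 call_id = clib_net_to_host_u16(*PPTP_CALL_ID(tcp));
+ */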
+
+/* other code */
+#define PPTP_CHAN_SUCCESS 1
+
+
+/* Data structures */
+
+extern u32 cnat_pptp_debug_flag;
+
+#endif /* __CNAT_V4_PPTP_ALG_H__ */
diff --git a/vnet/vnet/vcgn/cnat_v4_tcp_in2out_stages.c b/vnet/vnet/vcgn/cnat_v4_tcp_in2out_stages.c
new file mode 100644
index 00000000000..220ced461aa
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_v4_tcp_in2out_stages.c
@@ -0,0 +1,679 @@
+/*
+ *---------------------------------------------------------------------------
+ * cnat_v4_tcp_in2out_stages.c - cnat_v4_tcp_in2out node pipeline stage functions
+ *
+ *
+ * Copyright (c) 2008-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vppinfra/error.h>
+#include <vnet/buffer.h>
+
+#include "cnat_db.h"
+/* #include <cnat_feature_data.h> */
+#include "ipv4_packet.h"
+#include "tcp_header_definitions.h"
+#include "cnat_config.h"
+#include "cnat_global.h"
+#include "cnat_v4_functions.h"
+#include "cnat_v4_ftp_alg.h"
+#include "cnat_v4_pptp_alg.h"
+
+#define foreach_cnat_ipv4_tcp_inside_input_error \
+_(TCP_NAT_IN, "packets received") \
+_(TCP_NAT, "packets NATed") \
+_(TCP_EXCEPTION, "packets to exception") \
+_(TCP_TTL_GEN, "Generated TTL Expiry ICMP packet") \
+_(TCP_TTL_DROP, "Could not generate TTL Expiry ICMP packet") \
+_(TCP_SESSION_DROP, "Could not generate session") \
+_(TCP_FRAG_DROP, "Non-first Fragment received")
+
+typedef enum {
+#define _(sym,str) sym,
+ foreach_cnat_ipv4_tcp_inside_input_error
+#undef _
+ CNAT_IPV4_TCP_INSIDE_INPUT_N_ERROR,
+} cnat_ipv4_tcp_inside_input_t;
+
+static char * cnat_ipv4_tcp_inside_input_error_strings[] = {
+#define _(sym,string) string,
+ foreach_cnat_ipv4_tcp_inside_input_error
+#undef _
+};
+
+typedef struct cnat_v4_tcp_in2out_pipeline_data_ {
+ spp_node_main_vector_t *nmv;
+ /* Add additional pipeline stage data here... */
+ u32 bucket;
+ u16 src_port; /* Added for handling fragments */
+ u16 dst_port; /* Added for handling fragments */
+} cnat_v4_tcp_in2out_pipeline_data_t;
+
+static cnat_v4_tcp_in2out_pipeline_data_t pctx_data[SPP_MAXDISPATCH];
+
+#define EXTRA_PIPELINE_ARGS_PROTO , cnat_v4_tcp_in2out_pipeline_data_t *pctx
+#define EXTRA_PIPELINE_ARGS , pctx
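+
+/*
+ * Pipeline overview (summary of the stages below): stage0 and stage1
+ * prefetch the context and packet data, stage2 extracts the NAT key and
+ * prefetches the hash bucket, stage3 prefetches the main DB entry,
+ * stage4 walks the hash chain to find the matching entry, and stage5
+ * does the actual NAT rewrite, session and ALG handling.
+ */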
+
+ALWAYS_INLINE(
+static inline void
+stage0(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO))
+{
+ spp_ctx_t *ctx = ctxs[index];
+ /*
+ * Prefetch the context header. This is almost always
+ * the right thing to do
+ */
+ SPP_PREFETCH_CTX(ctx);
+}
+
+ALWAYS_INLINE(
+static inline void
+stage1(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO))
+{
+ spp_ctx_t *ctx = ctxs[index];
+ /* got ctx, prefetch packet data separately */
+ SPP_PREFETCH_CTX_DATA(ctx, 1*CACHE_DATA_QUANTUM);
+}
+
+ALWAYS_INLINE(
+static inline void
+stage2(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO))
+{
+ spp_ctx_t *ctx = ctxs[index];
+ u64 a, b, c;
+ u32 bucket;
+ cnat_feature_data_t *fd = (cnat_feature_data_t *)ctx->feature_data;
+ ipv4_header *ip;
+ tcp_hdr_type * tcp;
+ u8 *prefetch_target;
+
+ INCREMENT_NODE_COUNTER(np, TCP_NAT_IN);
+
+ /* extract the key from ctx and save it to feature_data */
+
+ ip = (ipv4_header *)(ctx->current_header);
+ ctx->application_start = (ip->version_hdr_len_words & 0xf) << 2;
+ tcp = (tcp_hdr_type*) ((u8 *)ip + ctx->application_start);
+
+ PLATFORM_CNAT_SET_RX_VRF(ctx,fd->dbl.k.k.vrf, CNAT_TCP, 1);
+ fd->dbl.k.k.ipv4 = spp_net_to_host_byte_order_32(&ip->src_addr);
+
+ if(PREDICT_FALSE(ctx->ru.rx.frag)) {
+ /* Must have routed through cnat_v4_frag_in2out node
+ * Since feature data of the ctx is being used for other
+ * purposes here, copy them to extra stage argument
+ */
+ u16 *feature_data_ports = (u16 *)&ctx->feature_data[2];
+ pctx[index].src_port = fd->dbl.k.k.port = *feature_data_ports;
+ feature_data_ports++;
+ pctx[index].dst_port = *feature_data_ports;
+ } else {
+ fd->dbl.k.k.port = spp_net_to_host_byte_order_16(&tcp->src_port);
+ pctx[index].dst_port =
+ spp_net_to_host_byte_order_16(&tcp->dest_port);
+ }
+
+#if 0
+ /* extra info for evil mode, or default value for dst_ipv4 field in good mode */
+ fd->dbl.dst_ipv4 = address_dependent_filtering ?
+ spp_net_to_host_byte_order_32(&ip->dest_addr) : 0;
+#endif
+
+ CNAT_V4_GET_HASH(fd->dbl.k.key64,
+ bucket, CNAT_MAIN_HASH_MASK)
+
+ prefetch_target = (u8 *)(&cnat_in2out_hash[bucket]);
+ pctx[index].bucket = bucket;
+
+ /* Prefetch the hash bucket */
+ SPP_PREFETCH(prefetch_target, 0, LOAD);
+
+}
+
+ALWAYS_INLINE(
+static inline void
+stage3(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO))
+{
+ u32 db_index;
+ u32 bucket;
+ uword prefetch_target0, prefetch_target1;
+
+ bucket = pctx[index].bucket;
+
+ /* read the hash bucket */
+ db_index = pctx[index].bucket = cnat_in2out_hash[bucket].next;
+ if (PREDICT_TRUE(db_index != EMPTY)) {
+
+ /*
+ * Prefetch database keys. We save space by not cache-line
+ * aligning the DB entries. We don't want to waste LSU
+ * bandwidth prefetching stuff we won't need.
+ */
+
+ prefetch_target0 = (uword)(cnat_main_db + db_index);
+
+ SPP_PREFETCH(prefetch_target0, 0, LOAD);
+
+ /* Just beyond DB key #2 */
+
+ prefetch_target1 = prefetch_target0 +
+ STRUCT_OFFSET_OF(cnat_main_db_entry_t, user_ports);
+
+ /* If the targets are in different lines, do the second prefetch */
+
+ if (PREDICT_FALSE((prefetch_target0 & ~(SPP_CACHE_LINE_BYTES-1)) !=
+ (prefetch_target1 & ~(SPP_CACHE_LINE_BYTES-1)))) {
+
+ SPP_PREFETCH(prefetch_target1, 0, LOAD);
+
+ }
+ }
+}
+
+static inline void
+stage4(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO)
+{
+ spp_ctx_t *ctx = ctxs[index];
+ u32 db_index = pctx[index].bucket;
+ cnat_main_db_entry_t *db;
+ cnat_feature_data_t *fd;
+
+ /*
+ * Note: if the search already failed (empty bucket),
+ * the answer is already in the pipeline context structure
+ */
+ if (PREDICT_FALSE(db_index == EMPTY)) {
+ return;
+ }
+
+ fd = (cnat_feature_data_t *)ctx->feature_data;
+
+ /*
+ * Note: hash collisions suck. We can't easily prefetch around them.
+ * The first trip around the track will be fast. After that, maybe
+ * not so much...
+ */
+ do {
+
+ db = cnat_main_db + db_index;
+ if (PREDICT_TRUE(db->in2out_key.key64 == fd->dbl.k.key64))
+ break;
+ db_index = db->in2out_hash.next;
+
+ } while (db_index != EMPTY);
+
+    /* even in evil mode, for in2out, we NAT all packets regardless of mode and dst_ip */
+
+ /* Stick the answer back into the pipeline context structure */
+ pctx[index].bucket = db_index;
+}
+
+ALWAYS_INLINE(
+static inline void
+stage5(spp_ctx_t **ctxs, int index, spp_node_t *np,
+ u8 *disp_used EXTRA_PIPELINE_ARGS_PROTO))
+{
+ spp_ctx_t *ctx = ctxs[index];
+ u32 db_index = pctx[index].bucket;
+ cnat_feature_data_t *fd = (cnat_feature_data_t *)ctx->feature_data;
+ int disposition;
+ cnat_main_db_entry_t *db;
+ /* Below two pointers are just to keep the cnat_ftp_alg call happy*/
+ dslite_table_entry_t *dslite_entry_ptr = NULL;
+ ipv6_header_t *ipv6_hdr = NULL;
+ tcp_hdr_type *tcp;
+ ipv4_header *ip;
+ i8 delta;
+ u32 seq, seq1;
+ u32 window;
+ u8 scale;
+ int rc;
+
+ ip = (ipv4_header *) ctx->current_header;
+
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ if (PREDICT_FALSE(ip->ttl <= 1)) {
+ /* Try to generate ICMP error msg, as TTL is <= 1 */
+
+ if (icmpv4_generate_with_throttling
+ (ctx, ip, ctx->ru.rx.uidb_index)) {
+ /* Generated ICMP */
+ disposition = CNAT_REWRITE_OUTPUT;
+ INCREMENT_NODE_COUNTER(np, TCP_TTL_GEN);
+ } else {
+                /* Could not generate ICMP - drop the packet */
+ disposition = CNAT_DROP;
+ INCREMENT_NODE_COUNTER(np, TCP_TTL_DROP);
+ }
+ goto drop_pkt;
+ }
+ }
+
+ if (PREDICT_FALSE(db_index == EMPTY)) {
+ if(PREDICT_FALSE(ctx->ru.rx.frag)) {
+ /* Must have routed through cnat_v4_frag_in2out node */
+ u16 frag_offset =
+ spp_net_to_host_byte_order_16(&(ip->frag_flags_offset));
+ if(PREDICT_FALSE(frag_offset & IP_FRAG_OFFSET_MASK)) {
+ INCREMENT_NODE_COUNTER(np, TCP_FRAG_DROP);
+ disposition = CNAT_DROP;
+ goto drop_pkt;
+ } else {
+ INCREMENT_NODE_COUNTER(np, TCP_EXCEPTION);
+ disposition = CNAT_V4_TCP_IE;
+ }
+ } else {
+ INCREMENT_NODE_COUNTER(np, TCP_EXCEPTION);
+ disposition = CNAT_V4_TCP_IE;
+ }
+ } else {
+ cnat_key_t dest_info;
+ cnat_session_entry_t *session_db = NULL;
+ db = cnat_main_db + db_index;
+ /* Handle destination sessions */
+ tcp = (tcp_hdr_type*) ((u8*)ip + ctx->application_start);
+ dest_info.k.port = pctx[index].dst_port;
+ dest_info.k.ipv4 = spp_net_to_host_byte_order_32(&(ip->dest_addr));
+
+ if(PREDICT_TRUE(!PLATFORM_DBL_SUPPORT)) {
+
+ /* No DBL support, so just update the destn and proceed */
+ db->dst_ipv4 = dest_info.k.ipv4;
+ db->dst_port = dest_info.k.port;
+ goto update_pkt;
+ }
+
+ if(PREDICT_FALSE(db->dst_ipv4 != dest_info.k.ipv4 ||
+ db->dst_port != dest_info.k.port)) {
+ if(PREDICT_TRUE(db->nsessions == 0)) {
+ /* Should be a static entry
+ * Note this session as the first session and log
+ */
+ cnat_add_dest_n_log(db, &dest_info);
+ } else if(PREDICT_FALSE(db->nsessions == 1)) {
+ /* Destn is not same as in main db. Multiple session
+ * scenario
+ */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ session_db = cnat_handle_1to2_session(db, &dest_info);
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_DROP;
+ INCREMENT_NODE_COUNTER(np, TCP_SESSION_DROP);
+ goto drop_pkt;
+ }
+ } else { /* There are already multiple destinations */
+ dest_info.k.vrf = db->in2out_key.k.vrf;
+ /* If session already exists,
+ * cnat_create_session_db_entry will return the existing db
+ * else create a new db
+ * If could not create, return NULL
+ */
+ session_db = cnat_create_session_db_entry(&dest_info,
+ db, TRUE);
+ if(PREDICT_FALSE(session_db == NULL)) {
+ disposition = CNAT_DROP;
+ INCREMENT_NODE_COUNTER(np, TCP_SESSION_DROP);
+ goto drop_pkt;
+ }
+ }
+ if(PREDICT_TRUE(session_db)) {
+ /* Have to repeat the window size check for new destinations */
+ window = (u32)spp_net_to_host_byte_order_16(
+ &tcp->window_size);
+ window = window << session_db->scale;
+ if(PREDICT_TRUE(!session_db->window)) {
+ calculate_window_scale(tcp, &scale);
+ session_db->scale = scale;
+ session_db->window = window;
+ } else if (PREDICT_FALSE(session_db->window <
+ window)) {
+ /* Update the db entry with window option from packet */
+ session_db->window = window;
+ } else {
+ /* Do nothing */
+ }
+ session_db->tcp_seq_num = spp_net_to_host_byte_order_32(
+ &tcp->seq_num);
+ session_db->ack_no = spp_net_to_host_byte_order_32(
+ &tcp->ack_num);
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ PLATFORM_DEBUG_PRINT("\n In2out SDB stages seq no = %u,"
+ " ack no = %u, window = %u\n",
+ session_db->tcp_seq_num,
+ session_db->ack_no,
+ session_db->window);
+ }
+ }
+ } else {
+        // Update the seq no and ack no for subsequent communication
+        // after connection establishment.
+        // No need to update the window here; it was already updated
+        // during connection establishment.
+ window = (u32)spp_net_to_host_byte_order_16(
+ &tcp->window_size);
+ window = window << db->scale;
+ if(PREDICT_FALSE(!ALG_ENABLED_DB(db))) {
+            // This check is needed since proto_data is part of a union
+            // in the main db entry
+ db->proto_data.tcp_seq_chk.seq_no =
+ spp_net_to_host_byte_order_32(
+ &tcp->seq_num);
+ db->proto_data.tcp_seq_chk.ack_no =
+ spp_net_to_host_byte_order_32(
+ &tcp->ack_num);
+ }
+ if (PREDICT_FALSE(db->diff_window < window)) {
+ /* Update the db entry with window option from packet */
+ db->diff_window = window;
+ }
+ if (PREDICT_FALSE(global_debug_flag && CNAT_DEBUG_GLOBAL_ALL)) {
+ PLATFORM_DEBUG_PRINT("\n In2out MainDB seq no = %u,"
+ "\n ack no = %u\n",
+ db->proto_data.tcp_seq_chk.seq_no,
+ db->proto_data.tcp_seq_chk.ack_no);
+ PLATFORM_DEBUG_PRINT("\n In2out MAINDB window = %u\n",
+ db->diff_window);
+ }
+ }
+update_pkt:
+
+ INCREMENT_NODE_COUNTER(np, TCP_NAT);
+
+ disposition = CNAT_REWRITE_OUTPUT;
+
+    /* NAT the packet and update checksum (incremental) */
+
+    /* If it is a non-first fragment, we need not worry about
+     * ALGs, as the packet has no TCP header.
+     * However, in the very rare race scenario where such a
+     * fragment contains an FTP PORT command or an RTSP command,
+     * we cannot handle it and the ALG will fail. We do not want
+     * to add a lot of complexity to handle such a
+     * one-in-a-million ALG case.
+     */
+ u16 frag_offset =
+ spp_net_to_host_byte_order_16(&(ip->frag_flags_offset));
+
+ if(PREDICT_FALSE(frag_offset & IP_FRAG_OFFSET_MASK)) {
+ /* Non first fragment.. no TCP header */
+ FTP_ALG_DEBUG_PRINTF("Non first frag.. cannot handle ALG");
+ goto handle_ttl_n_checksum;
+ }
+
+ FTP_ALG_DEBUG_PRINTF("src port 0x%x, dst_port 0x%x",
+ spp_net_to_host_byte_order_16(&tcp->src_port),
+ spp_net_to_host_byte_order_16(&tcp->dest_port))
+
+ /* handle FTP ALG */
+ if (PREDICT_FALSE(ftp_alg_enabled &&
+ (spp_net_to_host_byte_order_16(&tcp->src_port) == 21 ||
+ spp_net_to_host_byte_order_16(&tcp->dest_port) == 21))) {
+
+ if(PREDICT_FALSE((db->flags & CNAT_DB_FLAG_PPTP_TUNNEL_ACTIVE) ||
+ (db->flags & CNAT_DB_FLAG_PPTP_TUNNEL_INIT)))
+ {
+ /* FTP on a PPTP Control session? Ignore FTP */
+ goto handle_ttl_n_checksum;
+ }
+
+ if (PREDICT_FALSE(tcp->flags & (TCP_FLAG_SYN | TCP_FLAG_RST |
+ TCP_FLAG_FIN))) {
+
+ FTP_ALG_DEBUG_PRINTF("SYN Case setting delta = 0")
+
+ /* reset the delta */
+ if(PREDICT_FALSE(session_db != NULL)) {
+ session_db->alg.delta = 0;
+ } else {
+ db->alg.delta = 0;
+ }
+
+ } else {
+
+ /* need to adjust seq # for in2out pkt if delta is not 0 */
+ if (PREDICT_TRUE((session_db && (session_db->alg.delta != 0))
+ || ((!session_db) && (db->alg.delta != 0)))) {
+ seq = net2host32(&tcp->seq_num);
+
+ FTP_ALG_DEBUG_PRINTF("Orig Seq Num 0x%x", seq)
+                /*
+                 * For FTP packets, PORT command translation may change
+                 * the TCP packet/payload length, so we need to adjust
+                 * the packet's sequence numbers to match the changes.
+                 * The delta between the original and new packet lengths
+                 * is kept in alg_dlt[1], together with the sequence
+                 * number that caused the delta. When there are multiple
+                 * length changes, we keep the previous delta in
+                 * alg_dlt[0] for cases like packet retransmission.
+                 * Depending on the packet's sequence number, we then
+                 * use either the latest delta ([1]) or the previous
+                 * delta ([0]). We won't be here if both deltas are 0.
+                 */
+ if(PREDICT_FALSE(session_db != NULL)) {
+ seq1 = seq > session_db->tcp_seq_num ?
+ (seq + session_db->alg.alg_dlt[1]):
+ (seq + session_db->alg.alg_dlt[0]);
+ } else {
+ seq1 = seq > db->proto_data.seq_pcp.tcp_seq_num ?
+ (seq + db->alg.alg_dlt[1]):
+ (seq + db->alg.alg_dlt[0]);
+ }
+
+ FTP_ALG_DEBUG_PRINTF("Old_seq_num 0x%x New Seq Num 0x%x",
+ seq, seq1)
+
+ if (PREDICT_TRUE(seq1 != seq)) {
+
+ tcp->seq_num = host2net32(seq1);
+
+ FTP_ALG_DEBUG_PRINTF("Old TCP Checksum 0x%x",
+ net2host16(&tcp->tcp_checksum))
+
+ /*
+ * fix checksum incremental for seq # changes
+ * newchecksum = ~(~oldchecksum + ~old + new)
+ */
+ CNAT_UPDATE_TCP_SEQ_ACK_CHECKSUM(seq, seq1)
+ } /* There is a diff in seq */
+
+ } /* ALG Delta is non zero */
+
+ rc = cnat_ftp_alg((u8*) ip, &delta, db, dslite_entry_ptr, ipv6_hdr);
+
+ FTP_ALG_DEBUG_PRINTF("cnat_ftp_alg rc 0x%x", rc)
+
+            /* If a PORT cmd was located and the packet updated, record the delta and seq # */
+ if (PREDICT_FALSE(rc)) {
+
+ /* set alg flag for this ftp control connection */
+ if(PREDICT_FALSE(session_db != NULL)) {
+ session_db->flags |= CNAT_DB_FLAG_ALG_CTRL_FLOW;
+ } else {
+ db->flags |= CNAT_DB_FLAG_ALG_CTRL_FLOW;
+ }
+
+ /*
+                 * rc != 0 indicates this packet has triggered a new
+                 * packet-length delta, so we need to update the db entry's
+                 * seq# with this packet's seq#.
+                 *
+                 * Move alg_dlt[1] to [0] (current delta -> previous delta),
+                 * then apply the latest delta to alg_dlt[1] (keep [1] as the latest).
+ */
+ if(PREDICT_FALSE(session_db != NULL)) {
+ session_db->tcp_seq_num = net2host32(&tcp->seq_num);
+ session_db->alg.alg_dlt[0] = session_db->alg.alg_dlt[1];
+
+ /* accumulate the delta ! */
+ session_db->alg.alg_dlt[1] += delta;
+ FTP_ALG_DEBUG_PRINTF(
+ "cnat_ftp_alg seq_num 0x%x, dlt0 0x%x, dlt1 0x%x",
+ session_db->tcp_seq_num,
+ session_db->alg.alg_dlt[0],
+ session_db->alg.alg_dlt[1])
+
+ } else {
+ db->proto_data.seq_pcp.tcp_seq_num = net2host32(&tcp->seq_num);
+ db->alg.alg_dlt[0] = db->alg.alg_dlt[1];
+
+ /* accumulate the delta ! */
+ db->alg.alg_dlt[1] += delta;
+
+ FTP_ALG_DEBUG_PRINTF(
+ "cnat_ftp_alg seq_num 0x%x, dlt0 0x%x, dlt1 0x%x",
+ db->proto_data.seq_pcp.tcp_seq_num,
+ db->alg.alg_dlt[0],
+ db->alg.alg_dlt[1])
+ }
+ ctx->current_length += delta;
+ }/* cnat_ftp_alg returned non zero */
+ } /* It is not a SYN, RST or FIN */
+ } else if (PREDICT_FALSE(rtsp_alg_port_num &&
+ ((spp_net_to_host_byte_order_16(&tcp->dest_port) == rtsp_alg_port_num) ||
+ (spp_net_to_host_byte_order_16(&tcp->src_port) == rtsp_alg_port_num))) ) {
+
+ if (PREDICT_FALSE(tcp->flags & (TCP_FLAG_SYN | TCP_FLAG_RST |
+ TCP_FLAG_FIN))) {
+
+ FTP_ALG_DEBUG_PRINTF("SYN Case setting delta = 0")
+
+ /* reset the delta */
+ if(PREDICT_FALSE(session_db != NULL)) {
+ session_db->alg.delta = 0;
+ } else {
+ db->alg.delta = 0;
+ }
+
+ } else {
+#define RTSP_ALG_DELTA_MASK 0xFF
+ /* need to adjust seq # for in2out pkt if delta is not 0 */
+ if (PREDICT_FALSE((session_db &&
+ (session_db->alg.delta & RTSP_ALG_DELTA_MASK) != 0) ||
+ ((!session_db) &&
+ (db->alg.delta & RTSP_ALG_DELTA_MASK) != 0))) {
+ seq = net2host32(&tcp->seq_num);
+
+ if(PREDICT_FALSE(session_db != NULL)) {
+ seq1 = seq > session_db->tcp_seq_num ?
+ (seq + db->alg.alg_dlt[1]):
+ (seq + db->alg.alg_dlt[0]);
+ } else {
+ seq1 = seq > db->proto_data.seq_pcp.tcp_seq_num ?
+ (seq + db->alg.alg_dlt[1]):
+ (seq + db->alg.alg_dlt[0]);
+ }
+
+ FTP_ALG_DEBUG_PRINTF("Old_seq_num 0x%x New Seq Num 0x%x",
+ seq, seq1)
+
+ if (PREDICT_TRUE(seq1 != seq)) {
+
+ tcp->seq_num = host2net32(seq1);
+
+ FTP_ALG_DEBUG_PRINTF("Old TCP Checksum 0x%x",
+ net2host16(&tcp->tcp_checksum))
+
+ /*
+ * fix checksum incremental for seq # changes
+ * newchecksum = ~(~oldchecksum + ~old + new)
+ */
+ CNAT_UPDATE_TCP_SEQ_ACK_CHECKSUM(seq, seq1)
+ }
+
+ }
+ }
+ if ((session_db && (!session_db->alg.il)) ||
+ ((!session_db) && (!db->alg.il))) {
+ cnat_rtsp_alg((u8*) ip,
+ &delta,
+ db,
+ ctx->current_length,
+ NULL,
+ NULL);
+ }
+ }
+handle_ttl_n_checksum:
+ if (PLATFORM_HANDLE_TTL_DECREMENT) {
+ /*
+ * Decrement TTL and update IPv4 checksum
+ */
+ ipv4_decr_ttl_n_calc_csum(ip);
+ }
+
+ tcp_in2out_nat_mss_n_checksum(ip,
+ tcp,
+ db->out2in_key.k.ipv4,
+ db->out2in_key.k.port,
+ db);
+/* CNAT_PPTP_ALG_SUPPORT */
+ /* code to handle pptp control msgs */
+ if(PREDICT_FALSE(
+ (spp_net_to_host_byte_order_16(&tcp->dest_port) ==
+ TCP_PPTP_PORT))) {
+
+ u32 ret;
+
+        PPTP_DBG(3, "PPTP mgmt/ctrl msg received");
+
+ ret = cnat_handle_pptp_msg(ctx, db , tcp, PPTP_PNS );
+
+ if( PREDICT_FALSE( ret != CNAT_SUCCESS) ) {
+ PPTP_DBG(3, "PPTP mgmt/ctrl msg drop");
+ disposition = CNAT_DROP;
+ PPTP_INCR(ctrl_msg_drops);
+ goto drop_pkt;
+ }
+ }
+
+/* CNAT_PPTP_ALG_SUPPORT */
+
+    /* update translation counters */
+ db->in2out_pkts++;
+
+ in2out_forwarding_count++;
+
+ PLATFORM_CNAT_SET_TX_VRF(ctx,db->out2in_key.k.vrf);
+
+ /* update the timer for good mode, or evil mode dst_ip match */
+
+// if (!address_dependent_filtering || fd->dbl.dst_ipv4 == db->dst_ipv4) {
+ if(PREDICT_FALSE(session_db != NULL)) {
+ V4_TCP_UPDATE_SESSION_DB_FLAG(session_db, tcp);
+ CNAT_DB_TIMEOUT_RST(session_db);
+ } else {
+ V4_TCP_UPDATE_SESSION_FLAG(db, tcp);
+ CNAT_DB_TIMEOUT_RST(db);
+ }
+
+// }
+
+ }
+
+ /* Pick up the answer and put it into the context */
+ fd->dbl.db_index = db_index;
+
+drop_pkt:
+
+ DISP_PUSH_CTX(np, ctx, disposition, disp_used, last_disposition, last_contexts_ptr, last_nused_ptr);
+
+}
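For reference, the sequence-number fixup performed by both ALG branches above reduces to a single selection rule over the two stored deltas. The sketch below is illustrative C under that reading; the field names (alg_dlt, tcp_seq_num) mirror the ones used in stage5, and it is not part of the patch:

    #include <stdint.h>

    typedef struct {
        uint32_t tcp_seq_num;  /* seq # of the packet that produced the latest delta */
        int8_t   alg_dlt[2];   /* [0] = previous delta, [1] = latest delta */
    } alg_state_t;

    /* Packets beyond the sequence number that triggered the latest
     * length change get the latest delta; earlier packets (e.g.
     * retransmissions) still get the previous one. */
    static uint32_t alg_adjust_seq(const alg_state_t *st, uint32_t seq)
    {
        return (seq > st->tcp_seq_num) ? seq + st->alg_dlt[1]
                                       : seq + st->alg_dlt[0];
    }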
+
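The checksum comment repeated above, newchecksum = ~(~oldchecksum + ~old + new), is the RFC 1624 incremental-update equation. Below is a self-contained sketch of what the CNAT_UPDATE_TCP_SEQ_ACK_CHECKSUM macro is assumed to compute for a 32-bit field such as a sequence number (an assumption for illustration, not the macro's actual body):

    #include <stdint.h>

    static uint16_t csum_update32(uint16_t old_csum, uint32_t old_val,
                                  uint32_t new_val)
    {
        /* One's-complement sum over 16-bit words:
         * ~HC + ~m (old field words) + m' (new field words). */
        uint32_t sum = (uint16_t)~old_csum;
        sum += (uint16_t)~(old_val >> 16);
        sum += (uint16_t)~(old_val & 0xffff);
        sum += (uint16_t)(new_val >> 16);
        sum += (uint16_t)(new_val & 0xffff);
        /* Fold end-around carries, then complement. */
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }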
diff --git a/vnet/vnet/vcgn/cnat_va_db.c b/vnet/vnet/vcgn/cnat_va_db.c
new file mode 100644
index 00000000000..7423bdf2de2
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_va_db.c
@@ -0,0 +1,286 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_va_db.c - virtual assembly database
+ *
+ * Copyright (c) 2009, 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cnat_va_db.h>
+#include <format.h>
+#include <spp_node.h>
+#include <spp_alloc.h>
+#include <spp_byteorder.h>
+#include <spp_main.h>
+#include <spp_cache.h>
+#include <spp_interface.h>
+#include <spp_api.h>
+#include <spp_client_api.h>
+#include <spp_timers.h>
+#include <cnat_db.h>
+#include <spp_plugin.h>
+#include <cnat_v4_functions.h>
+
+
+va_bucket_t va_bucket[VA_BUCKETS];
+
+void va_bucket_init () {
+
+ u32 i;
+
+    /*
+     * set the next-entry pointer in each bucket
+     * to point nowhere (empty)
+     */
+ for (i=0; i<VA_BUCKETS; i++) {
+ va_bucket[i].next_available_entry = ~0;
+ }
+
+}
+
+inline void va_db_add_new_entry (u32 bucket_index,
+ va_lookup_key * key )
+{
+
+ va_entry_t * entry_p;
+ u32 head, next;
+
+ entry_p = va_db_lookup(bucket_index, key);
+
+ if (PREDICT_FALSE(entry_p)) {
+ FRAG_DEBUG_PRINTF6(
+            "\nVA_ADD_NEW: Bucket %d found existing entry [%d, %d] -> [%d, %d]\n",
+ bucket_index, entry_p->src_port,
+ entry_p->dst_port, key->e.src_port, key->e.dst_port)
+
+ /* found match entry, update it */
+ entry_p->src_port = key->e.src_port;
+ entry_p->dst_port = key->e.dst_port;
+
+ FRAG_DEBUG_PRINTF3("VA_ADD_NEW: Existing bucket %d, counter %d\n",
+ bucket_index,
+ va_bucket[bucket_index].new_entry_counter)
+
+ } else {
+
+ /* no match, add a new one */
+ head = va_bucket[bucket_index].head_entry;
+ next = va_bucket[bucket_index].next_available_entry;
+
+ FRAG_DEBUG_PRINTF5(
+ "\nVA_ADD_NEW: Filling bucket %d, index %d with key 0x%llx %x\n",
+ bucket_index, next, key->k.key64, key->k.key32)
+
+ va_bucket[bucket_index].va_entry[next] = key->e;
+
+ /* increase next pointer */
+ va_bucket[bucket_index].next_available_entry = (next+1) & VA_BUCKET_MASK;
+
+ if (PREDICT_FALSE(head == va_bucket[bucket_index].next_available_entry)) {
+ /* adjust head circular pointer */
+ va_bucket[bucket_index].head_entry = (head+1) & VA_BUCKET_MASK;
+ }
+
+ va_bucket[bucket_index].new_entry_counter++;
+
+ FRAG_DEBUG_PRINTF4(
+ "VA_ADD_NEW: NEW bucket %d, entry %d counter %d\n",
+ bucket_index, next, va_bucket[bucket_index].new_entry_counter)
+ }
+}
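The head/next arithmetic above makes each bucket a fixed-size circular FIFO: entries are written at next, and once the write index catches up with head the oldest entry is silently overwritten (cnat_va_dump below counts a bucket as full when (next+1) & mask == head). A minimal sketch of just the index arithmetic, assuming the power-of-two sizing from cnat_va_db.h:

    #include <stdint.h>

    #define ENTRIES 8               /* VA_ENTRY_PER_BUCKET */
    #define MASK    (ENTRIES - 1)   /* VA_BUCKET_MASK */

    typedef struct {
        uint32_t head;  /* oldest live entry */
        uint32_t next;  /* next slot to be written */
    } fifo_idx_t;

    static void fifo_advance_after_write(fifo_idx_t *f)
    {
        f->next = (f->next + 1) & MASK;
        /* The write index caught up with the oldest entry:
         * drop that entry by advancing head (newest wins). */
        if (f->head == f->next)
            f->head = (f->head + 1) & MASK;
    }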
+
+
+/*
+ * use the key,
+ * return pointer to the entry if found,
+ * NULL if not
+ */
+
+inline
+va_entry_t * va_db_lookup (u32 bucket_index, va_lookup_key * key)
+{
+
+ u32 index, next;
+ va_entry_t * entry_p;
+ va_bucket_t * bucket;
+
+ bucket = &va_bucket[bucket_index];
+ index = bucket->head_entry;
+ next = bucket->next_available_entry;
+ entry_p = NULL;
+
+ FRAG_DEBUG_PRINTF4(
+ "\nVA_DB_LOOKUP: bucket index %d head %d next %d\n",
+ bucket_index, index, next)
+
+ /* loop through the entries in the bucket */
+ while( index != next) {
+
+ if(PREDICT_TRUE(memcmp(&bucket->va_entry[index], key, VA_KEY_SIZE)==0)) {
+
+ entry_p = &bucket->va_entry[index];
+            /* The add-frag-entry path assigns the key's src
+               port to entry_p's src port again. So when a main DB entry
+               is deleted or timed out, and another entry is later created
+               for the same src-ip/src-port pair, the frag's entry_p would
+               otherwise keep the stale previous port info. Hence the line
+               below is not required. */
+
+ /* *(u32*)&key->e.src_port = *(u32*)&entry_p->src_port; */
+ /* do two ports as u32 :) */
+
+ break;
+ }
+
+ index = (index +1) & VA_BUCKET_MASK;
+
+ }
+
+#ifdef FRAG_DEBUG
+ if (PREDICT_TRUE(entry_p)) {
+ FRAG_DEBUG_PRINTF3("VA_DB_LOOKUP: bucket index %d entry index %d\n",
+ bucket_index, index)
+ FRAG_DEBUG_PRINTF5("VA_DB_LOOKUP: SRC-->DST [0x%x, %d] [0x%x, %d]\n",
+ entry_p->src_ip, entry_p->src_port,
+ entry_p->dst_ip, entry_p->dst_port)
+ FRAG_DEBUG_PRINTF3("[vrf 0x%x, id 0x%x]\n",
+ entry_p->vrf, entry_p->ip_id)
+ } else {
+ FRAG_DEBUG_PRINTF1("\nNULL ENTRY\n")
+ }
+#endif
+
+ return entry_p;
+
+}
+
+inline
+int va_db_delete_entry (u32 bucket_index, va_lookup_key * key)
+{
+
+ u32 index, next;
+ int entry_found = 0;
+ va_bucket_t * bucket;
+
+ bucket = &va_bucket[bucket_index];
+ index = bucket->head_entry;
+ next = bucket->next_available_entry;
+
+ FRAG_DEBUG_PRINTF4(
+ "\nVA_DB_DELETE_ENTRY: bucket index %d head %d next %d\n",
+ bucket_index, index, next);
+
+ /* loop through the entries in the bucket */
+ while( index != next) {
+ if(PREDICT_TRUE(memcmp(&bucket->va_entry[index], key,
+ VA_KEY_SIZE)==0)) {
+ /* Clear the entry */
+ FRAG_DEBUG_PRINTF1("Entry found in delete API");
+ memset(&bucket->va_entry[index], 0, sizeof(va_entry_t));
+ entry_found = 1;
+ break;
+ }
+ index = (index +1) & VA_BUCKET_MASK;
+ }
+ return entry_found;
+}
+
+
+
+void cnat_va_bucket_used (int argc, unsigned long * argv)
+{
+
+    u32 i, sum = 0;
+
+ for(i=0;i<VA_BUCKETS;i++) {
+
+ if(PREDICT_TRUE(va_bucket[i].new_entry_counter)) sum++;
+
+ }
+
+ if (PREDICT_FALSE(!sum)) {
+        printf("no buckets in use\n");
+ return;
+ }
+
+    printf("index head next counter (%d buckets in use)\n", sum);
+
+ for(i=0;i<VA_BUCKETS;i++) {
+
+ if (PREDICT_FALSE(!va_bucket[i].new_entry_counter)) continue;
+
+ printf(" %04d %04d %04d %d\n", i,
+ va_bucket[i].head_entry,
+ va_bucket[i].next_available_entry,
+ va_bucket[i].new_entry_counter);
+
+ }
+}
+
+void cnat_va_dump (int argc, unsigned long * argv)
+{
+
+    u32 i, sum, index;
+
+ PLATFORM_DEBUG_PRINT("====== SUMMARY ======\n");
+ PLATFORM_DEBUG_PRINT("Total buckets: %d\n", VA_BUCKETS);
+ PLATFORM_DEBUG_PRINT("Entries per bucket: %d\n", VA_ENTRY_PER_BUCKET);
+
+ sum = 0;
+
+ for(i=0; i<VA_BUCKETS; i++) {
+ if (PREDICT_TRUE(va_bucket[i].new_entry_counter > 0)) sum ++;
+ }
+
+ PLATFORM_DEBUG_PRINT("buckets in use: %d\n", sum);
+
+ sum = 0;
+ for(i=0; i<VA_BUCKETS; i++) {
+
+ if ( PREDICT_FALSE(((va_bucket[i].next_available_entry+1) & VA_BUCKET_MASK)
+ == va_bucket[i].head_entry)) {
+
+ sum ++;
+ }
+ }
+
+    PLATFORM_DEBUG_PRINT("full buckets: %d\n", sum);
+
+ /* dump per bucket info */
+
+ if (argc == 0 ) return;
+
+ index = (u32) argv[0];
+
+ if (PREDICT_FALSE(index >= VA_BUCKETS)) {
+ PLATFORM_DEBUG_PRINT("invalid bucket index %d\n", index);
+ return;
+ }
+
+ PLATFORM_DEBUG_PRINT("\n====== Bucket %d ======\n", index);
+
+ PLATFORM_DEBUG_PRINT("bucket head index %d\n", va_bucket[index].head_entry);
+
+ PLATFORM_DEBUG_PRINT("bucket next index %d\n", va_bucket[index].next_available_entry);
+
+ PLATFORM_DEBUG_PRINT(" source IP dest IP VRF ip-id srcP dstP\n");
+
+ for(i=0;i<VA_ENTRY_PER_BUCKET;i++) {
+ hex_dump((u8*)&va_bucket[index].va_entry[i], sizeof(va_entry_t));
+ }
+
+}
diff --git a/vnet/vnet/vcgn/cnat_va_db.h b/vnet/vnet/vcgn/cnat_va_db.h
new file mode 100644
index 00000000000..6e0051b46f7
--- /dev/null
+++ b/vnet/vnet/vcgn/cnat_va_db.h
@@ -0,0 +1,121 @@
+/*
+ *------------------------------------------------------------------
+ * cnat_va_db.h - definition for virtual assembly database
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __CNAT_VA_DB_H__
+#define __CNAT_VA_DB_H__
+
+#include <clib_lite.h>
+
+#define FRAG_DEBUG 1
+
+/* virtual assembly hash database size: ~16 B x 64K entries = 1 MB */
+
+#define VA_TOTAL_ENTRIES (64*1024)
+#define VA_ENTRY_PER_BUCKET (8) /* must be a power of 2 for the circular FIFO */
+#define VA_BUCKET_MASK (VA_ENTRY_PER_BUCKET -1)
+#define VA_BUCKETS (VA_TOTAL_ENTRIES / VA_ENTRY_PER_BUCKET)
+#define VA_KEY_SIZE 12
+
+typedef struct _va_entry {
+ /* key: top 12 bytes */
+ u32 src_ip;
+ u32 dst_ip;
+    u16 vrf;     /* overloaded with protocol info in the top two bits */
+ u16 ip_id;
+
+ /* values */
+ u16 src_port;
+ u16 dst_port;
+} va_entry_t;
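The sizing comment above relies on va_entry_t being exactly 16 bytes (a 12-byte key followed by two u16 values), giving 16 B x 64K entries = 1 MB. If appended to this header, C11 compile-time guards along these lines would pin that assumption down (a sketch; they are not in the original):

    _Static_assert(sizeof(va_entry_t) == 16,
                   "sizing math above assumes 16-byte entries");
    _Static_assert(VA_KEY_SIZE == 12,
                   "memcmp in va_db_lookup covers the first 12 bytes");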
+
+typedef struct _va_keys {
+ u64 key64; /* src & dst IP */
+ u32 key32; /* vrf, protocol and ip_id */
+} va_keys;
+
+typedef union {
+ va_entry_t e;
+ va_keys k;
+} va_lookup_key;
+
+typedef struct _va_bucket_t {
+ u32 head_entry;
+ u32 next_available_entry; /* ~0 for empty bucket */
+ u32 new_entry_counter; /* for debug purpose */
+ va_entry_t va_entry[VA_ENTRY_PER_BUCKET];
+} va_bucket_t;
+
+extern va_bucket_t va_bucket[]; /* hash table in cnat_va_db.c */
+
+void va_bucket_init ();
+
+inline void va_db_add_new_entry (u32 bucket_index, va_lookup_key * );
+inline int va_db_delete_entry (u32 bucket_index, va_lookup_key * );
+inline va_entry_t * va_db_lookup (u32 bucket_index, va_lookup_key * key);
+
+#ifdef FRAG_DEBUG
+
+#define FRAG_DEBUG_PRINTF1(a) \
+ if (frag_debug_flag) { \
+ PLATFORM_DEBUG_PRINT(a); \
+ }
+
+#define FRAG_DEBUG_PRINTF2(a, b) \
+ if (frag_debug_flag) { \
+ PLATFORM_DEBUG_PRINT(a, b); \
+ }
+
+#define FRAG_DEBUG_PRINTF3(a, b, c) \
+ if (frag_debug_flag) { \
+ PLATFORM_DEBUG_PRINT(a, b, c); \
+ }
+
+#define FRAG_DEBUG_PRINTF4(a, b, c, d) \
+ if (frag_debug_flag) { \
+ PLATFORM_DEBUG_PRINT(a, b, c, d); \
+ }
+
+#define FRAG_DEBUG_PRINTF5(a, b, c, d, e) \
+ if (frag_debug_flag) { \
+ PLATFORM_DEBUG_PRINT(a, b, c, d, e); \
+ }
+
+#define FRAG_DEBUG_PRINTF6(a, b, c, d, e, f) \
+ if (frag_debug_flag) { \
+ PLATFORM_DEBUG_PRINT(a, b, c, d, e, f); \
+ }
+#else
+
+#define FRAG_DEBUG_PRINTF1(a)
+
+#define FRAG_DEBUG_PRINTF2(a, b)
+
+#define FRAG_DEBUG_PRINTF3(a, b, c)
+
+#define FRAG_DEBUG_PRINTF4(a, b, c, d)
+
+#define FRAG_DEBUG_PRINTF5(a, b, c, d, e)
+
+#define FRAG_DEBUG_PRINTF6(a, b, c, d, e, f)
+
+#endif
+
+#endif /* __CNAT_VA_DB_H__ */
+
+
diff --git a/vnet/vnet/vcgn/dslite_db.h b/vnet/vnet/vcgn/dslite_db.h
new file mode 100644
index 00000000000..2269b98c989
--- /dev/null
+++ b/vnet/vnet/vcgn/dslite_db.h
@@ -0,0 +1,170 @@
+/*
+ *------------------------------------------------------------------
+ * dslite_db.h - Stateful DSLITE translation database definitions
+ *
+ * Copyright (c) 2010-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#ifndef __DSLITE_DB_H__
+#define __DSLITE_DB_H__
+
+#include "cnat_cli.h"
+#include "index_list.h"
+#include "cnat_ports.h"
+#include "cnat_db.h"
+#include "dslite_defs.h"
+
+#define DSLITE_PRINTF(level, ...) \
+ if (dslite_debug_level > level) PLATFORM_DEBUG_PRINT(__VA_ARGS__);
+/*
+#define DSLITE_PRINTF(lvl, ...) \
+{ \
+ avsm_dispatlib_debug (__VA_ARGS__); \
+}
+*/
+
+#define HASH_ENHANCE 4
+//#define DSLITE_DEF
+#define DSLITE_MAIN_DB_SIZE (20000000 / PLATFORM_CNAT_INSTS)
+#define DSLITE_MAIN_HASH_SIZE \
+ (HASH_ENHANCE * PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE)
+
+#define DSLITE_MAIN_HASH_MASK (DSLITE_MAIN_HASH_SIZE-1)
+
+
+/* nb: 200000 users / 64 CNAT = 3125, 76% occupancy */
+#define DSLITE_USER_HASH_SIZE CNAT_USER_HASH_SIZE
+#define DSLITE_USER_HASH_MASK (DSLITE_USER_HASH_SIZE-1)
+
+/* No. of per-ip/port timeout configs is limited to 1000 */
+#define DSLITE_TIMEOUT_HASH_SIZE 1000
+#define DSLITE_TIMEOUT_HASH_MASK (DSLITE_TIMEOUT_HASH_SIZE - 1)
+#define DSLITE_TIMEOUT_FULL_MASK 0xFFFFFFFFFFFFFFFF
+
+#define CNAT_MAX_SESSIONS_PER_BIB 0xFFFF
+
+#define FORCE_DEL 1 /* Delete static BIB entries as well */
+
+/* default timeout values */
+#define DSLITE_UDP_DEFAULT 300 /* 5 min */
+#define DSLITE_UDP_MIN 120 /* 2 min */
+#define DSLITE_TCP_TRANS 240 /* 4 min */
+#define DSLITE_TCP_EST 7200 /* 2 hrs */
+#define DSLITE_TCP_V4_SYN 6 /* 6 sec */
+#define DSLITE_FRAG_MIN 2 /* 2 sec */
+#define DSLITE_ICMP_DEFAULT 60 /* 1 min */
+
+extern u32 dslite_translation_create_count;
+extern u32 dslite_translation_delete_count;
+extern u32 dslite_translation_create_rate;
+extern u32 dslite_translation_delete_rate;
+extern u32 dslite_in2out_forwarding_count;
+extern u32 dslite_in2out_forwarding_rate;
+extern u32 dslite_out2in_forwarding_count;
+extern u32 dslite_out2in_forwarding_rate;
+
+#define DSLITE_V6_GET_HASH(in_key, hash, mask) \
+ a = in_key->ipv6[0] ^ in_key->ipv6[1] ^ in_key->ipv6[2] ^ in_key->ipv6[3] \
+ ^ in_key->ipv4_key.k.ipv4 ^ ((in_key->ipv4_key.k.port << 16) | in_key->ipv4_key.k.vrf); \
+ DSLITE_PRINTF(1, "%x:%x:%x:%x:%x:%x:%x\n", in_key->ipv6[0], in_key->ipv6[1], in_key->ipv6[2], in_key->ipv6[3], \
+ in_key->ipv4_key.k.ipv4, in_key->ipv4_key.k.port, in_key->ipv4_key.k.vrf); \
+ b = c = 0x9e3779b9;\
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+ hash = c & mask; \
+
+
+#define DSLITE_V6_GET_USER_HASH(ipv6, hash, mask) \
+ a = ipv6[0] ^ ipv6[1] ^ ipv6[2] ^ ipv6[3]; \
+ b = c = 0x9e3779b9;\
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+ hash = c & mask; \
+
+#define DSLITE_V4_GET_HASH(in_key, hash, mask) \
+ a = in_key.ipv4 ^ ((in_key.port << 16) | in_key.vrf); \
+ b = c = 0x9e3779b9; \
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+    hash = c & mask;
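These hash macros all fold the key into a with XOR, seed b and c with the golden-ratio constant 0x9e3779b9, and run one Jenkins mix round, taking the masked c as the bucket. Written as a function for readability (a sketch only; hash_mix32 is the Jenkins 32-bit mix these macros already rely on, and u32/u16 are the codebase typedefs):

    static inline u32 dslite_v4_hash(u32 ipv4, u16 port, u16 vrf, u32 mask)
    {
        u32 a = ipv4 ^ (((u32) port << 16) | vrf);
        u32 b = 0x9e3779b9;
        u32 c = 0x9e3779b9;

        hash_mix32(a, b, c);   /* Jenkins mix; c is the "answer" */
        return c & mask;
    }

The macros themselves expect the caller to have declared a, b and c, which is presumably why they are macros rather than functions in the original.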
+
+#define PRIVATE_V4_ADDR_CHECK(addr, invalid) \
+ invalid = 0; \
+ int range1 = ((addr & 0xFF000000) >> 24); \
+ int range2 = ((addr & 0xFFF00000) >> 20); \
+ int range3 = ((addr & 0xFFFF0000) >> 16); \
+ int range4 = ((addr & 0xFFFFFFF8) >> 3); \
+ if(range1 != 0xa && range2 != 0xac1 && range3 != 0xc0a8 && range4 != 0x18000000) \
+ invalid = 1;
+
+#define V4_MAPPED_V6_CHECK(v6_addr, invalid) \
+ invalid = 0; \
+ int word1 = v6_addr[0]; \
+ int word2 = v6_addr[1]; \
+ int word3 = v6_addr[2]; \
+ if(!((word1 == 0) && (word2 == 0) && (word3 == 0x0000FFFF))) \
+ invalid = 1;
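PRIVATE_V4_ADDR_CHECK accepts exactly four ranges: 10.0.0.0/8 (range1 == 0xa), 172.16.0.0/12 (range2 == 0xac1), 192.168.0.0/16 (range3 == 0xc0a8), and 192.0.0.0/29 (range4 == 0x18000000, which appears to be the DS-Lite well-known range from RFC 6333), while V4_MAPPED_V6_CHECK accepts only IPv4-mapped ::ffff:0:0/96 addresses. A function form of the first check with worked values, for illustration only:

    #include <stdint.h>

    /* Returns 1 when addr is outside all four accepted ranges.
     * 10.1.2.3 (0x0A010203): range1 == 0x0a     -> returns 0 (accepted)
     * 8.8.8.8  (0x08080808): no range matches   -> returns 1 (invalid) */
    static int v4_addr_is_invalid(uint32_t addr)
    {
        int range1 = (addr & 0xFF000000) >> 24; /* 10.0.0.0/8     */
        int range2 = (addr & 0xFFF00000) >> 20; /* 172.16.0.0/12  */
        int range3 = (addr & 0xFFFF0000) >> 16; /* 192.168.0.0/16 */
        int range4 = (addr & 0xFFFFFFF8) >> 3;  /* 192.0.0.0/29   */
        return (range1 != 0xa && range2 != 0xac1 &&
                range3 != 0xc0a8 && range4 != 0x18000000);
    }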
+
+
+extern dslite_table_entry_t dslite_table_array[DSLITE_MAX_DSLITE_ENTRIES];
+extern dslite_table_entry_t *dslite_table_ptr;
+
+#define DSLITE_CMP_V6_KEY(key1, key2) \
+ memcmp(key1, key2, sizeof(dslite_v6_key_t))
+
+#define DSLITE_CMP_V4_KEY(key1, key2) \
+ memcmp(key1, key2, sizeof(dslite_v4_key_t))
+
+
+#define DSLITE_CMP_V6_IP(ip1, ip2) \
+ memcmp(ip1, ip2, (sizeof(u32) * 4))
+
+
+#define DSLITE_CMP_V6_KEY1(key1, key2) \
+ (key1.ipv6[0] == key2.ipv6[0]) && (key1.ipv6[1] == key2.ipv6[1]) && \
+ (key1.ipv6[2] == key2.ipv6[2]) && (key1.ipv6[3] == key2.ipv6[3]) && \
+ (key1.port == key2.port) && (key1.vrf == key2.vrf)
+
+
+#define DSLITE_CMP_V6_IP1(ip1, ip2) \
+ ((ip1[0] == ip2[0]) && (ip1[1] == ip2[1]) && \
+ (ip1[2] == ip2[2]) && (ip1[3] == ip2[3]))
+
+#define DSLITE_CMP_V4_KEY1(key1, key2) \
+ (key1.key64 == key2.key64)
+
+cnat_main_db_entry_t*
+dslite_get_main_db_entry_v2(dslite_db_key_bucket_t *ki,
+ port_pair_t port_pair_type,
+ port_type_t port_type,
+ cnat_gen_icmp_info *info,
+ dslite_table_entry_t *dslite_entry_ptr,
+ cnat_key_t *dest_info);
+
+cnat_main_db_entry_t*
+dslite_main_db_lookup_entry(dslite_db_key_bucket_t *ki);
+
+
+cnat_user_db_entry_t*
+dslite_user_db_lookup_entry(dslite_db_key_bucket_t *uki);
+
+cnat_user_db_entry_t*
+dslite_user_db_create_entry(dslite_db_key_bucket_t *uki, u32 portmap_index);
+
+cnat_main_db_entry_t*
+dslite_create_main_db_entry_and_hash(dslite_db_key_bucket_t *ki,
+ cnat_db_key_bucket_t *ko,
+ cnat_user_db_entry_t *udb);
+
+#endif
diff --git a/vnet/vnet/vcgn/dslite_defs.h b/vnet/vnet/vcgn/dslite_defs.h
new file mode 100644
index 00000000000..4860adcb77d
--- /dev/null
+++ b/vnet/vnet/vcgn/dslite_defs.h
@@ -0,0 +1,336 @@
+/*
+ *------------------------------------------------------------------
+ * dslite_defs.h - DSLITE structure definitions
+ *
+ * Copyright (c) 2011-2012 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __DSLITE_DEFS_H__
+#define __DSLITE_DEFS_H__
+
+#ifdef TOBE_PORTED
+#include "spp_platform_common.h"
+#include "cgse_defs.h"
+#endif
+#include "cnat_cli.h"
+#include "cnat_config.h"
+#include "cnat_ports.h"
+#include "cnat_bulk_port_defs.h"
+
+extern u32 ds_lite_config_debug_level;
+
+#define SWAP_IPV6_ADDR(ipv6_hdr, dslite_entry_ptr) \
+ ipv6_hdr->dst_addr[0] = ipv6_hdr->src_addr[0]; \
+ ipv6_hdr->dst_addr[1] = ipv6_hdr->src_addr[1]; \
+ ipv6_hdr->dst_addr[2] = ipv6_hdr->src_addr[2]; \
+ ipv6_hdr->dst_addr[3] = ipv6_hdr->src_addr[3]; \
+ ipv6_hdr->src_addr[0] = spp_host_to_net_byte_order_32(dslite_entry_ptr->AFTR_v6_address[0]); \
+ ipv6_hdr->src_addr[1] = spp_host_to_net_byte_order_32(dslite_entry_ptr->AFTR_v6_address[1]); \
+ ipv6_hdr->src_addr[2] = spp_host_to_net_byte_order_32(dslite_entry_ptr->AFTR_v6_address[2]); \
+ ipv6_hdr->src_addr[3] = spp_host_to_net_byte_order_32(dslite_entry_ptr->AFTR_v6_address[3]);
+
+#define DSLITE_SET_TX_PKT_TYPE(type) { \
+ ctx->ru.tx.packet_type = type; \
+}
+
+#define DSLITE_INC_STATS_V4(PTR, COUNTER, IPV4_SRC_ADDR) { \
+ PTR->COUNTER++; \
+}
+
+#define DSLITE_INC_STATS_V6(PTR, COUNTER, IPV6_DEST_ADDR) { \
+ PTR->COUNTER++; \
+}
+
+
+#define DSLITE_INVALID_UIDX 0xffff /*invalid svi app uidb index */
+#define DSLITE_INVALID_VRFID 0xffffffff /*invalid vrf id */
+
+#define DSLITE_VRF_MASK 0x3fff
+#define DSLITE_MAX_VRFMAP_ENTRIES (DSLITE_VRF_MASK + 1)
+
+#define DSLITE_VRFMAP_ENTRY_INVALID 0xffff
+
+#define DSLITE_V6_PREFIX_MASK_MIN 16
+#define DSLITE_V6_PREFIX_MASK_MAX 96
+#define DSLITE_V6_PREFIX_MASK_MULTIPLE 8
+
+#define DSLITE_TUNNEL_MTU_MIN 1280
+#define DSLITE_TUNNEL_MTU_MAX 9216
+
+#define DSLITE_TUNNEL_TTL_MIN 0
+#define DSLITE_TUNNEL_TTL_MAX 255
+
+#define DSLITE_TUNNEL_TOS_MIN 0
+#define DSLITE_TUNNEL_TOS_MAX 255
+
+#define DSLITE_V4_MASK_MAX 32
+
+//#define XLAT_MAX_FRAG_ID_COUNTERS (256)
+#define DSLITE_AFTR_IPV4_ADDR 0xC0000001
+
+#define DSLITE_MAX_TAP_RG_ENTRIES 2
+#define DSLITE_MAX_DSLITE_ENTRIES (256)
+#define DSLITE_MAX_DSLITE_ID (DSLITE_MAX_DSLITE_ENTRIES-1)
+/* Define the below value as 64 if first 64 entries are for NAT44 */
+#define DSLITE_INDEX_OFFSET 1
+
+#define DSLITE_INVALID_DSLITE_ID (0)
+
+#define DSLITE_TABLE_ENTRY_DELETED 0
+#define DSLITE_TABLE_ENTRY_ACTIVE 1
+#define DSLITE_TABLE_ENTRY_DORMANT 2
+#define DSLITE_TABLE_ENTRY_INVALID_UIDB 3
+
+typedef struct {
+ u16 tcp_initial_setup_timeout;
+ u16 tcp_active_timeout;
+ u16 udp_init_session_timeout;
+ u16 udp_act_session_timeout;
+ u16 icmp_session_timeout;
+ u16 temp;
+} dslite_timeout_info_t;
+
+
+typedef struct {
+
+ u16 state; /* To use nat44 enums ?? TBD */
+ u16 dslite_id; /* DSLITE_ID value for this table entry - for easy access */
+
+ u16 i_vrf; /* V6 uidb index */
+ u16 o_vrf; /* V4 uidb index */
+
+ u16 cnat_main_db_max_ports_per_user; /* port limit */
+ u16 tcp_mss; /*tcp max segment size for this inside vrf */
+
+ u32 delete_time;
+
+ cnat_portmap_v2_t *portmap_list;
+
+ u32 nfv9_logging_index;
+ u32 syslog_logging_index;
+ u32 AFTR_v6_address[4];
+
+#define DSLITE_IPV4_TOS_OVERRIDE_FLAG 0x00000001
+#define DSLITE_IPV6_TOS_OVERRIDE_FLAG 0x00000002
+#define DSLITE_IPV4_TTL_OVERRIDE_FLAG 0x00000004
+#define DSLITE_IPV6_TTL_OVERRIDE_FLAG 0x00000008
+#define DSLITE_IPV6_FRAG_REASSEMB_ENG 0x00000010
+#define DSLITE_FTP_ALG_ENABLE 0x00000020
+#define DSLITE_RTSP_ALG_ENABLE 0x00000040
+#define DSLITE_NETFLOW_ENABLE 0x00000080
+#define DSLITE_SYSLOG_ENABLE 0x00000100
+
+ u16 feature_flags;
+ u16 tunnel_mtu;
+
+ u8 ipv4_ttl_value;
+ u8 ipv6_ttl_value;
+ u8 ipv4_tos_value;
+ u8 ipv6_tos_value;
+
+ u32 v4_if_num; /* V4 SVI ifnum */
+ u32 v6_if_num; /* V6 SVI ifnum */
+ u32 i_vrf_id; //inside vrf id
+ u32 o_vrf_id; //outside vrf id
+
+ dslite_timeout_info_t timeout_info;
+ u16 cnat_static_port_range;
+ u16 dyn_start_port;
+
+ u32 AFTR_v4_addr;
+ bulk_alloc_size_t bulk_size; /* should be equivalent to u16 - 2 bytes */
+ u32 pcp_server_addr;
+ u16 pcp_server_port;
+ u8 mapping_refresh_both_direction;
+ u8 pad;
+ u16 rtsp_port;
+#define DSLITE_BIDIR_REFRESH 1
+ u8 dslite_enable; /* DS-Lite enable check flag */
+ u8 syslog_logging_policy; /* DS-Lite Session Logging check flag */
+ u8 nf_logging_policy;
+
+ u8 temp1;
+ u16 temp2;
+ u32 temp3;
+ u32 rseed_ip;
+} dslite_table_entry_t;
+
+typedef struct {
+ u64 v4_to_v6_invalid_uidb_drop_count;
+ u64 v6_to_v4_invalid_uidb_drop_count;
+ u64 v4_to_v6_frag_invalid_uidb_drop_count;
+} dslite_global_counters_t;
+
+typedef struct {
+ u32 tap_enable;
+ u32 ipv4_addr;
+ u32 ipv6_addr[4];
+} dslite_tap_rg_t;
+
+extern dslite_table_entry_t *dslite_table_db_ptr;
+
+
+#define DSLITE_ADD_UIDB_INDEX_DSLITE_ID_MAPPING(uidb_index, dslite_id) \
+ *(cgse_uidb_index_cgse_id_mapping_ptr + uidb_index) = dslite_id;
+
+extern u8 my_instance_number;
+
+extern void dslite_clear_counters(u16 dslite_id);
+extern void dslite_clear_per_RG_counters();
+extern dslite_global_counters_t dslite_global_counters;
+extern u32 dslite_config_debug_level;
+extern u32 dslite_data_path_debug_level;
+extern u32 dslite_defrag_debug_level;
+extern u32 dslite_debug_level;
+
+typedef struct {
+ u64 v6_to_v4_tcp_input_count;
+ u64 v6_to_v4_tcp_nat_error;
+ u64 v6_to_v4_tcp_output_count;
+} dslite_v6_to_v4_tcp_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_tcp_input_count;
+ u64 v4_to_v6_tcp_no_entry;
+ u64 v4_to_v6_tcp_output_count;
+} dslite_v4_to_v6_tcp_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_udp_input_count;
+ u64 v6_to_v4_udp_nat_error;
+ u64 v6_to_v4_udp_output_count;
+} dslite_v6_to_v4_udp_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_udp_input_count;
+ u64 v4_to_v6_udp_no_entry;
+ u64 v4_to_v6_udp_output_count;
+} dslite_v4_to_v6_udp_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_icmp_qry_input_count;
+ u64 v6_to_v4_icmp_qry_nat_error;
+ u64 v6_to_v4_icmp_qry_output_count;
+} dslite_v6_to_v4_icmp_qry_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_icmp_qry_input_count;
+ u64 v4_to_v6_icmp_qry_no_nat_entry;
+ u64 v4_to_v6_icmp_qry_output_count;
+} dslite_v4_to_v6_icmp_qry_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_icmp_error_input_count;
+ u64 v6_to_v4_icmp_error_nat_error;
+ u64 v6_to_v4_icmp_error_output_count;
+} dslite_v6_to_v4_icmp_error_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_icmp_error_input_count;
+ u64 v4_to_v6_icmp_error_no_nat_entry;
+ u64 v4_to_v6_icmp_error_output_count;
+} dslite_v4_to_v6_icmp_error_counter_t;
+
+typedef struct {
+ u64 v6_icmp_error_input_count;
+ u64 v6_AFTR_echo_reply_count;
+ u64 v6_to_v4_icmp_error_unsupported_type_drop_count;
+ u64 v6_to_v4_icmp_error_no_db_entry_count;
+ u64 v6_to_v4_icmp_err_throttled_count;
+ u64 v6_to_v4_icmp_error_xlated_count;
+} dslite_v6_icmp_error_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_ttl_gen_count;
+ u64 v4_to_v6_icmp_throttle_count;
+ u64 v4_to_v6_ptb_gen_count;
+ u64 v4_to_v6_aftr_v4_echo_reply_count;
+ u64 v6_to_v4_ttl_gen_count;
+ u64 v6_to_v4_icmp_throttle_count;
+ u64 v6_to_v4_admin_prohib_icmp_count;
+ u64 v6_to_v4_aftr_v4_echo_reply_count;
+ u64 v6_icmp_gen_count;
+} dslite_icmp_gen_counter_t;
+
+typedef struct {
+ u64 dslite_input_tunnel_pkt;
+ u64 dslite_encap_count;
+ u64 dslite_decap_count;
+ u64 dslite_sec_check_failed;
+ u64 dslite_unsupp_packet;
+} dslite_common_counter_t;
+
+typedef struct {
+
+ dslite_v6_to_v4_tcp_counter_t v64_tcp_counters;
+ dslite_v4_to_v6_tcp_counter_t v46_tcp_counters;
+ dslite_v6_to_v4_udp_counter_t v64_udp_counters;
+ dslite_v4_to_v6_udp_counter_t v46_udp_counters;
+ dslite_v6_to_v4_icmp_qry_counter_t v64_icmp_counters;
+ dslite_v4_to_v6_icmp_qry_counter_t v46_icmp_counters;
+ dslite_v6_to_v4_icmp_error_counter_t v64_icmp_error_counters;
+ dslite_v4_to_v6_icmp_error_counter_t v46_icmp_error_counters;
+ dslite_v6_icmp_error_counter_t dslite_v6_icmp_err_counters;
+ dslite_icmp_gen_counter_t dslite_icmp_gen_counters;
+ dslite_common_counter_t dslite_common_counters;
+} dslite_counters_t;
+
+typedef struct {
+ u32 active_translations;
+ u32 translation_create_rate;
+ u32 translation_delete_rate;
+ u32 in2out_forwarding_rate;
+ u32 out2in_forwarding_rate;
+ u32 in2out_drops_port_limit_exceeded;
+ u32 in2out_drops_system_limit_reached;
+ u32 in2out_drops_resource_depletion;
+ u32 no_translation_entry_drops;
+ u32 pool_address_totally_free;
+ u32 num_subscribers;
+ u32 dummy;
+ u64 drops_sessiondb_limit_exceeded;
+} dslite_common_stats_t;
+
+typedef struct {
+ u16 msg_id;
+ u8 rc;
+ u8 pad[5];
+ dslite_counters_t counters;
+} dslite_show_statistics_summary_resp;
+
+
+#define CMD_GENERATE_PTB 0x1
+#define CMD_GENERATE_TTL 0x2
+
+/*
+ * This structure provides an abstraction for the data exchanged from one
+ * VPP node to its disposition node, or further along the dslite node graph.
+ */
+typedef struct {
+ u32 icmp_gen_type; // ctx->feature_data[0]
+ u32 reserved1; // ctx->feature_data[1]
+ u32 reserved2; // ctx->feature_data[2]
+ u32 reserved3; // ctx->feature_data[3]
+} dslite_feature_data_t;
+
+extern dslite_counters_t dslite_all_counters[DSLITE_MAX_DSLITE_ENTRIES];
+//extern dslite_inst_gen_counter_t dslite_inst_gen_counters[DSLITE_MAX_DSLITE_ENTRIES];
+
+
+ extern void dslite_show_config(void);
+#define STAT_PORT_RANGE_FROM_INST_PTR(inst) ((inst)->cnat_static_port_range)
+
+#endif /* __DSLITE_DEFS_H__ */
+
diff --git a/vnet/vnet/vcgn/index_list.c b/vnet/vnet/vcgn/index_list.c
new file mode 100644
index 00000000000..ec1b83b0b30
--- /dev/null
+++ b/vnet/vnet/vcgn/index_list.c
@@ -0,0 +1,336 @@
+/*
+ *------------------------------------------------------------------
+ * index_list.c - vector-index-based lists. 64-bit pointers suck.
+ *
+ * Copyright (c) 2008-2009, 2011 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+#include <string.h>
+//#include <clib_lite.h>
+#include <vppinfra/vec.h>
+#include "index_list.h"
+
+/*
+ * index_slist_addhead
+ *
+ * args: headp -- pointer to e.g. a hash bucket
+ * vector -- vector containing the list
+ * elsize -- size of an element in this vector
+ * offset -- offset in each vector element of this list thread
+ * index_to_add -- index in the vector to add to the list
+ *
+ * Adds new items to the head of the list. Try not to screw up the args!
+ */
+void index_slist_addhead (index_slist_t *headp,
+ u8 *vector, u32 elsize, u32 offset, u32 index_to_add)
+{
+ return (index_slist_addhead_inline(headp, vector, elsize, offset,
+ index_to_add));
+}
+
+/*
+ * index_slist_remelem
+ *
+ * args: headp -- pointer to e.g. a hash bucket
+ * vector -- vector containing the list
+ * elsize -- size of an element in this vector
+ * offset -- offset in each vector element of this list thread
+ * index_to_del -- index in the vector to delete from the list
+ *
+ * Try not to screw up the args!
+ */
+
+int index_slist_remelem (index_slist_t *headp,
+ u8 *vector, u32 elsize, u32 offset,
+ u32 index_to_delete)
+{
+ return (index_slist_remelem_inline(headp, vector, elsize, offset,
+ index_to_delete));
+}
+
+
+/*
+ * index_dlist_addtail
+ *
+ * Append the indicated vector element to the doubly-linked list
+ * whose first element is pointed to by headp.
+ *
+ * args: head_index -- listhead vector element index.
+ * vector -- vector containing the list
+ * elsize -- size of an element in this vector
+ * offset -- offset in each vector element of this list thread
+ * index_to_add -- index in the vector to add to the list
+ *
+ * Do not call this routine to create the listhead. Simply set
+ * index_dlist->next = index_dlist->prev = index of item.
+ *
+ * Try not to screw up the args.
+ */
+
+void index_dlist_addtail (u32 head_index, u8 *vector, u32 elsize,
+ u32 offset, u32 index_to_add)
+{
+ index_dlist_t *elp;
+ index_dlist_t *elp_next;
+ index_dlist_t *headp;
+
+ headp = (index_dlist_t *)(vector + offset + elsize*head_index);
+ elp = (index_dlist_t *)(vector + offset + elsize*index_to_add);
+ elp->next = index_to_add;
+ elp->prev = index_to_add;
+
+ elp->next = headp->next;
+ headp->next = index_to_add;
+
+ elp_next = (index_dlist_t *)(vector + offset + elsize*elp->next);
+ elp->prev = elp_next->prev;
+ elp_next->prev = index_to_add;
+}
+
+u32 index_dlist_remelem (u32 head_index,
+ u8 *vector, u32 elsize, u32 offset,
+ u32 index_to_delete)
+{
+ u32 rv = head_index;
+ index_dlist_t *headp, *elp, *elp_next;
+
+ elp = (index_dlist_t *)(vector + offset + elsize*index_to_delete);
+
+ /* Deleting the head index? */
+ if (PREDICT_FALSE(head_index == index_to_delete)) {
+ rv = elp->next;
+ /* The only element on the list? */
+ if (PREDICT_FALSE(rv == head_index))
+ rv = EMPTY;
+ }
+
+ headp = (index_dlist_t *)(vector + offset + elsize*elp->prev);
+ headp->next = elp->next;
+ elp_next = (index_dlist_t *)(vector + offset + elsize*elp->next);
+ elp_next->prev = elp->prev;
+
+ elp->next = elp->prev = EMPTY;
+
+ return rv;
+}
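The listhead convention called out above index_dlist_addtail (create the head by pointing its next/prev at itself, never via the add routine) looks like this in practice. A minimal sketch modeled on the TEST_CODE2 fixture below, assuming this file's declarations and the vppinfra STRUCT_OFFSET_OF macro:

    typedef struct {
        char junk[43];
        index_dlist_t l;
    } tv_t;

    void dlist_example(tv_t *tp)  /* tp: vec_validate()'d to >= 4 elements */
    {
        u32 head_index = 3;
        index_dlist_t *headp = &tp[head_index].l;

        /* Create the listhead by hand. */
        headp->next = headp->prev = head_index;

        /* Append element 0, then unlink it again; remelem returns the
         * (possibly new) head index, or EMPTY for an empty list. */
        index_dlist_addtail(head_index, (u8 *) tp, sizeof (tp[0]),
                            STRUCT_OFFSET_OF(tv_t, l), 0);
        head_index = index_dlist_remelem(head_index, (u8 *) tp, sizeof (tp[0]),
                                         STRUCT_OFFSET_OF(tv_t, l), 0);
    }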
+
+
+#ifdef TEST_CODE2
+
+typedef struct tv_ {
+ char junk[43];
+ index_dlist_t l;
+} tv_t;
+
+
+void index_list_test_cmd(int argc, unsigned long *argv)
+{
+ int i, j;
+ u32 head_index;
+ index_dlist_t *headp;
+ tv_t *tp=0;
+
+ vec_validate(tp, 3);
+ head_index = 3;
+
+ memset(tp, 0xa, sizeof(tp[0])*vec_len(tp));
+
+ /* Here's how to set up the head element... */
+ headp = &((tp + head_index)->l);
+ headp->next = headp->prev = head_index;
+
+ for (i = 0; i < 3; i++) {
+ index_dlist_addtail(head_index, (u8 *)tp, sizeof(tp[0]),
+ STRUCT_OFFSET_OF(tv_t, l), i);
+ printf("headp next %d prev %d\n",
+ headp->next, headp->prev);
+ for (j = 0; j <= 3; j++) {
+ printf ("[%d]: next %d prev %d\n", j,
+ tp[j].l.next, tp[j].l.prev);
+ }
+ printf("---------------\n");
+
+ }
+
+ printf("After all adds:\n");
+
+ printf("headp next %d prev %d\n",
+ headp->next, headp->prev);
+
+ for (j = 0; j <= 3; j++) {
+ printf ("[%d]: next %d prev %d\n", j,
+ tp[j].l.next, tp[j].l.prev);
+ }
+ printf("---------------\n");
+
+ head_index = index_dlist_remelem (head_index, (u8 *)tp, sizeof(tp[0]),
+ STRUCT_OFFSET_OF(tv_t, l), 1);
+
+ printf("after delete 1, head index %d\n", head_index);
+ headp = &((tp + head_index)->l);
+ printf("headp next %d prev %d\n",
+ headp->next, headp->prev);
+ for (j = 0; j <= 3; j++) {
+ printf ("[%d]: next %d prev %d\n", j,
+ tp[j].l.next, tp[j].l.prev);
+ }
+ printf("---------------\n");
+
+ index_dlist_addtail(head_index, (u8 *)tp, sizeof(tp[0]),
+ STRUCT_OFFSET_OF(tv_t, l), 1);
+
+ printf("after re-add 1, head index %d\n", head_index);
+ headp = &((tp + head_index)->l);
+ printf("headp next %d prev %d\n",
+ headp->next, headp->prev);
+ for (j = 0; j <= 3; j++) {
+ printf ("[%d]: next %d prev %d\n", j,
+ tp[j].l.next, tp[j].l.prev);
+ }
+ printf("---------------\n");
+
+ for (i = 3; i >= 0; i--) {
+ head_index = index_dlist_remelem (head_index, (u8 *)tp, sizeof(tp[0]),
+ STRUCT_OFFSET_OF(tv_t, l), i);
+ printf("after delete, head index %d\n", head_index);
+ if (head_index != EMPTY) {
+ headp = &((tp + head_index)->l);
+ printf("headp next %d prev %d\n",
+ headp->next, headp->prev);
+ for (j = 0; j <= 3; j++) {
+ printf ("[%d]: next %d prev %d\n", j,
+ tp[j].l.next, tp[j].l.prev);
+ }
+ } else {
+ printf("empty list\n");
+ }
+ printf("---------------\n");
+ }
+}
+#endif /* test code 2 */
+
+#ifdef TEST_CODE
+
+typedef struct tv_ {
+ char junk[43];
+ index_slist_t l;
+} tv_t;
+
+
+void index_list_test_cmd(int argc, unsigned long *argv)
+{
+ int i, j;
+ tv_t *tp = 0;
+ index_slist_t *buckets = 0;
+
+ vec_add1((u32 *)buckets, EMPTY);
+ vec_validate(tp, 9);
+
+ for (i = 0; i < 10; i++) {
+ index_slist_addhead(buckets, (u8 *)tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i);
+ }
+
+ printf ("after adds, buckets[0] = %u\n", buckets[0]);
+
+ for (j = 0; j < 10; j++) {
+ printf("tp[%d] next %u\n", j, tp[j].l);
+
+ }
+
+ for (i = 0; i < 10; i++) {
+ if (PREDICT_FALSE(index_slist_remelem(buckets, (u8 *) tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i))) {
+ printf("OUCH: remelem failure at index %d\n", i);
+ }
+ if (PREDICT_FALSE(tp[i].l.next != EMPTY)) {
+ printf("OUCH: post-remelem next not EMPTY, index %d\n", i);
+ }
+ }
+
+ printf ("after deletes, buckets[0] = %x\n", buckets[0]);
+
+ for (i = 0; i < 10; i++) {
+ index_slist_addhead(buckets, (u8 *)tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i);
+ }
+
+ printf ("after adds, buckets[0] = %u\n", buckets[0]);
+
+ for (j = 0; j < 10; j++) {
+ printf("tp[%d] next %u\n", j, tp[j].l);
+
+ }
+
+ for (i = 9; i >= 0; i--) {
+ if (PREDICT_FALSE(index_slist_remelem(buckets, (u8 *) tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i))) {
+ printf("OUCH: remelem failure at index %d\n", i);
+ }
+ if ((tp[i].l.next != EMPTY)) {
+ printf("OUCH: post-remelem next not EMPTY, index %d\n", i);
+ }
+ }
+
+ printf ("after deletes, buckets[0] = %x\n", buckets[0]);
+
+ printf("add evens, then odds...\n");
+
+ for (i = 0; i < 10; i += 2) {
+ index_slist_addhead(buckets, (u8 *)tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i);
+
+ printf ("head = buckets[0].next = %d\n", buckets[0].next);
+ for (j = 0; j < 10; j++) {
+ printf("tp[%d] next %u\n", j, tp[j].l);
+ }
+ printf("-------------\n");
+ }
+
+ for (i = 1; i < 10; i += 2) {
+ index_slist_addhead(buckets, (u8 *)tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i);
+
+ printf ("head = buckets[0].next = %d\n", buckets[0].next);
+ for (j = 0; j < 10; j++) {
+ printf("tp[%d] next %u\n", j, tp[j].l);
+ }
+ printf("-------------\n");
+ }
+
+ printf ("after adds, buckets[0] = %u\n", buckets[0]);
+
+ for (j = 0; j < 10; j++) {
+ printf("tp[%d] next %u\n", j, tp[j].l);
+
+ }
+
+ for (i = 9; i >= 0; i--) {
+ if (PREDICT_FALSE(index_slist_remelem(buckets, (u8 *) tp, sizeof(*tp),
+ STRUCT_OFFSET_OF(tv_t, l), i))) {
+ printf("OUCH: remelem failure at index %d\n", i);
+ }
+ if (PREDICT_FALSE(tp[i].l.next != EMPTY)) {
+ printf("OUCH: post-remelem next not EMPTY, index %d\n", i);
+ }
+ }
+
+ printf ("after deletes, buckets[0] = %x\n", buckets[0]);
+
+ vec_free(buckets);
+ vec_free(tp);
+}
+#endif /* test code */
diff --git a/vnet/vnet/vcgn/index_list.h b/vnet/vnet/vcgn/index_list.h
new file mode 100644
index 00000000000..498cd7eb7ad
--- /dev/null
+++ b/vnet/vnet/vcgn/index_list.h
@@ -0,0 +1,118 @@
+/*
+ *------------------------------------------------------------------
+ * index_list.h - vector-index-based doubly-linked lists
+ *
+ * Copyright (c) 2008-2009 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef _INDEX_LIST_H_
+#define _INDEX_LIST_H_ 1
+
+/* An index we can't possibly see in practice... */
+#define EMPTY ((u32)~0)
+
+typedef struct index_slist_ {
+ u32 next;
+} index_slist_t;
+
+/*
+ * index_slist_addhead
+ *
+ * args: headp -- pointer to e.g. a hash bucket
+ * vector -- vector containing the list
+ * elsize -- size of an element in this vector
+ * offset -- offset in each vector element of this list thread
+ * index_to_add -- index in the vector to add to the list
+ *
+ * Adds new items to the head of the list. Try not to screw up the args!
+ */
+static inline void
+ index_slist_addhead_inline (index_slist_t *headp,
+ u8 *vector, u32 elsize,
+ u32 offset, u32 index_to_add)
+{
+ index_slist_t *addme;
+
+ addme = (index_slist_t *)(vector + offset + elsize*index_to_add);
+ addme->next = EMPTY;
+
+ if (headp->next == EMPTY) {
+ headp->next = index_to_add;
+ return;
+ } else {
+ addme->next = headp->next;
+ headp->next = index_to_add;
+ }
+}
+
+/*
+ * index_slist_remelem
+ *
+ * args: headp -- pointer to e.g. a hash bucket
+ * vector -- vector containing the list
+ * elsize -- size of an element in this vector
+ * offset -- offset in each vector element of this list thread
+ * index_to_del -- index in the vector to delete from the list
+ *
+ * Try not to screw up the args!
+ */
+
+static inline int
+ index_slist_remelem_inline (index_slist_t *headp,
+ u8 *vector, u32 elsize,
+ u32 offset, u32 index_to_delete)
+{
+ index_slist_t *findme;
+ index_slist_t *prev;
+ index_slist_t *cur;
+
+ findme = (index_slist_t *)(vector + offset + elsize*index_to_delete);
+
+ if (headp->next == index_to_delete) {
+ headp->next = findme->next;
+ findme->next = EMPTY;
+ return 0;
+ }
+
+ prev = (index_slist_t *)(vector + offset + elsize*headp->next);
+ cur = (index_slist_t *)(vector + offset + elsize*prev->next);
+ while (cur != findme) {
+ if (cur->next == EMPTY)
+ return (1);
+ prev = cur;
+ cur = (index_slist_t *)(vector + offset + elsize*cur->next);
+ }
+ prev->next = findme->next;
+ findme->next = EMPTY;
+ return 0;
+}
+
+void index_slist_addhead (index_slist_t *headp,
+ u8 *vector, u32 elsize, u32 offset, u32 index);
+int index_slist_remelem (index_slist_t *headp,
+ u8 *vector, u32 elsize, u32 offset, u32 index);
+
+typedef struct index_dlist_ {
+ u32 next;
+ u32 prev;
+} index_dlist_t;
+
+void index_dlist_addtail (u32 head_index, u8 *vector, u32 elsize,
+ u32 offset, u32 index_to_add);
+
+u32 index_dlist_remelem (u32 head_index,
+ u8 *vector, u32 elsize, u32 offset,
+ u32 index_to_delete);
+#endif /* _INDEX_LIST_H_ */
diff --git a/vnet/vnet/vcgn/nat64_db.h b/vnet/vnet/vcgn/nat64_db.h
new file mode 100644
index 00000000000..837464f6940
--- /dev/null
+++ b/vnet/vnet/vcgn/nat64_db.h
@@ -0,0 +1,480 @@
+/*
+ *------------------------------------------------------------------
+ * nat64_db.h - Stateful NAT64 translation database definitions
+ *
+ * Copyright (c) 2010-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#ifndef __NAT64_DB_H__
+#define __NAT64_DB_H__
+
+#include "cnat_cli.h"
+#include "index_list.h"
+#include "cnat_ports.h"
+#include "cnat_db.h"
+#include "nat64_defs.h"
+#include "cnat_bulk_port_defs.h"
+
+nat64_vrfmap_t *nat64_map_by_vrf;
+
+#define SESSION_OPT
+
+#define HASH_ENHANCE 4
+
+
+#define NAT64_MAIN_DB_SIZE \
+ (PLATFORM_NAT64_MAX_SESSIONS / PLATFORM_CNAT_INSTS)
+#define NAT64_MAIN_HASH_SIZE \
+ (HASH_ENHANCE * PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE)
+
+#define NAT64_MAIN_HASH_MASK (NAT64_MAIN_HASH_SIZE-1)
+
+
+/* nb: 200000 users / 64 CNAT = 3125, 76% occupancy */
+#define NAT64_USER_HASH_SIZE CNAT_USER_HASH_SIZE
+#define NAT64_USER_HASH_MASK (NAT64_USER_HASH_SIZE-1)
+
+/* Number of sessions per BIB entry/NAT64 translation
+   - nsessions is of type u16, so 0xFFFF was selected
+   - Ideally, sessions per translation will not reach this limit;
+     only a DoS could push it there, and this cap takes care of that */
+#define NAT64_MAX_SESSIONS_PER_BIB 0xFFFF
+
+/* No. of per-ip/port timeout configs is limited to 1000 */
+/* (25K total across all instances) */
+#define NAT64_TIMEOUT_HASH_SIZE \
+ PLATFORM_NAT64_TIMEOUT_HASH_SIZE
+
+#define NAT64_TIMEOUT_HASH_MASK (NAT64_TIMEOUT_HASH_SIZE - 1)
+#define NAT64_TIMEOUT_FULL_MASK 0xFFFFFFFFFFFFFFFF
+
+
+#define FORCE_DEL 1 /* Delete static BIB entries as well */
+
+/* default timeout values */
+#define NAT64_UDP_DEFAULT 300 /* 5 min */
+#define NAT64_UDP_MIN 120 /* 2 min */
+#define NAT64_TCP_TRANS 240 /* 4 min */
+#define NAT64_TCP_EST 7200 /* 2 hrs */
+#define NAT64_TCP_V4_SYN 6 /* 6 sec */
+#define NAT64_FRAG_MIN 2 /* 2 sec */
+#define NAT64_ICMP_DEFAULT 60 /* 1 min */
+
+
+#define NAT64_V6_GET_HASH(in_key, hash, mask) \
+ a = in_key->ipv6[0] ^ in_key->ipv6[1] ^ in_key->ipv6[2] ^ in_key->ipv6[3] \
+ ^ ((in_key->port << 16) | in_key->vrf); \
+ b = c = 0x9e3779b9;\
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+ hash = c & mask; \
+
+
+#define NAT64_V4_GET_HASH(in_key, hash, mask) \
+ a = in_key.ipv4 ^ ((in_key.port << 16) | in_key.vrf); \
+ b = c = 0x9e3779b9; \
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+    hash = c & mask;
+
+
+
+#define NAT64_V6_GET_SESSION_HASH(bib_index, in_addr, port, vrf, hash, mask) \
+ a = bib_index ^ in_addr[0] ^ in_addr[1] ^ in_addr[2] ^ in_addr[3] \
+ ^ port ^ vrf; \
+ b = c = 0x9e3779b9; \
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+    hash = c & mask;
+
+#define NAT64_V4_GET_SESSION_HASH(bib_index, in_addr, port, vrf, hash, mask) \
+ a = bib_index ^ in_addr ^ port ^ vrf; \
+ b = c = 0x9e3779b9; \
+ /* Jenkins hash, arbitrarily use c as the "answer" */ \
+ hash_mix32(a, b, c); \
+    hash = c & mask;
+
+
+extern index_slist_t *nat64_bib_out2in_hash;
+extern index_slist_t *nat64_bib_in2out_hash;
+extern index_slist_t *nat64_bib_user_hash;
+extern index_slist_t *nat64_session_out2in_hash;
+#ifndef SESSION_OPT
+extern index_slist_t *nat64_session_in2out_hash;
+#endif
+extern index_slist_t *nat64_frag_out2in_hash;
+extern index_slist_t *nat64_frag_in2out_hash;
+extern index_slist_t *nat64_timeout_hash;
+
+
+/*
+ * nat64_bib_entry_t
+ * This structure depicts Binding Information Base of NAT64 sessions.
+ * It stores information about the inside v6 source transport address and
+ * corresponding outside v4 source transport address for each protocol.
+ */
+
+typedef struct {
+
+ index_slist_t nat64_bib_out2in_hash;
+ index_slist_t nat64_bib_in2out_hash;
+
+ /* 0x08 */
+ u16 flags; /* flags in cnat_db.h (cnat_main_db_entry_t) */
+#define NAT64_DB_FLAG_STATIC_PORT CNAT_DB_FLAG_STATIC_PORT
+#define NAT64_DB_NAT64_FLAG CNAT_DB_NAT64_FLAG
+#define NAT64_DB_FLAG_ALG_ENTRY CNAT_DB_FLAG_ALG_ENTRY
+#define NAT64_DB_FLAG_PCPI CNAT_DB_FLAG_PCPI
+#define NAT64_DB_FLAG_PCPE CNAT_DB_FLAG_PCPE
+
+ /* 0x0A */
+ u16 nat64_inst_id;
+ /* 0x0C */
+ u32 user_index;
+
+ /* 0x10 */
+ nat64_v4_key_t v4_out_key;
+
+ /* 0x18 */
+ nat64_v6_key_t v6_in_key;
+
+ /* 0x2C */
+ index_dlist_t user_ports;
+ /* 0x34 */
+ u32 session_head_index;
+ /* 0x38 - 56B*/
+ u16 nsessions;
+ u16 pad2;
+
+ /* 0x3C - 60B */
+ u32 in2outpkts;
+ u32 out2inpkts;
+ /* 0x44 - 68B */
+
+    /* 0x44 */
+ union { /* used by FTP ALG, pkt len delta due to FTP PORT cmd */
+ u16 delta;
+ i8 alg_dlt[2]; /* two delta values, 0 for previous, 1 for current */
+ u16 il; /* Used to indicate if interleaved mode is used
+ in case of RTSP ALG */
+ } alg;
+
+ u16 temp1;
+
+ u32 entry_expires;
+
+ u32 temp3;
+    /* unused; temp1, temp2 and temp3 are added to keep this in sync with the nat44 main db entry size */
+ /* size of = 0x54 = 84 B */
+ u32 unused;
+
+} nat64_bib_entry_t ;
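The hex offset comments embedded in the struct above are easy to let drift as fields change. Assuming the key sizes the comments imply (an 8-byte nat64_v4_key_t and a 20-byte nat64_v6_key_t), C11 guards like these could be appended to pin the layout (a sketch; not in the original header):

    #include <stddef.h>

    _Static_assert(offsetof(nat64_bib_entry_t, flags) == 0x08, "layout drift");
    _Static_assert(offsetof(nat64_bib_entry_t, v4_out_key) == 0x10, "layout drift");
    _Static_assert(offsetof(nat64_bib_entry_t, user_ports) == 0x2C, "layout drift");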
+
+/*
+ * nat64_bib_user_entry_t
+ * This structure stores information about translations of a particular user
+ * (User here refers to a same inside source address)
+ */
+typedef struct {
+ /* 0x00 */
+ index_slist_t user_hash;
+ /* 0x04 */
+ u16 ntranslations;
+ /* 0x06 */
+ u8 icmp_msg_count;
+ /* 0x07 */
+ u8 flags;
+#define NAT64_USER_DB_NAT64_FLAG CNAT_USER_DB_NAT64_FLAG
+
+ /* 0x08 */
+ u32 translation_list_head_index;
+ /* 0x0C */
+ u32 portmap_index;
+ /* 0x10 */
+ nat64_v6_key_t v6_in_key;
+ /* 0x24 = 36 B */
+
+ u32 align1; /* Make it 8B boundary and in sync with nat44 user db entry size */
+#ifndef NO_BULK_LOGGING
+ /* size of = 0x28 = 40 B */
+    /* Now adding 8 more bytes for bulk allocation. This makes it
+     * 0x30 (48). For stateful nat64, we may support bulk allocation
+     * later. */
+ /* Indicates the currently used bulk port range */
+ i16 bulk_port_range_cache[BULK_RANGE_CACHE_SIZE];
+#endif /* NO_BULK_LOGGING */
+} nat64_bib_user_entry_t;
+
+/*
+ * nat64_session_entry_t
+ * This structure represents the session table. It maintains the information
+ * This structure represents the session table. It maintains information
+ * about the packet flows, consisting of the source and destination
+ * (inside and outside) ipv4 and ipv6 transport addresses.
+typedef struct {
+
+ /* 0x00 */
+ index_slist_t nat64_session_out2in_hash;
+
+ /* 0x04 */
+ u32 bib_index; /* would point to v4/v6 src transport address */
+
+ /* 0x08 */
+ nat64_v4_key_t v4_dest_key;
+
+#ifndef SESSION_OPT
+ index_slist_t nat64_session_in2out_hash;
+ nat64_v6_key_t v6_dest_key;
+#endif
+
+ /* 0x10 */
+ u16 flags;/* Will be used for flags same as nat44 session */
+
+ /* 0x12 */
+ u16 timeout;
+
+ /* 0x14 */
+ u32 entry_expires;
+ /* 0x18 */
+ index_dlist_t bib_list;
+ /* 0x20 = 32 B */
+
+ union { /* alg same as cnat_main_db_t */
+ u16 delta;
+ i8 alg_dlt[2];
+ u16 il;
+ } alg;
+
+ /* 0x22 */
+ u16 tcp_flags; /* Mainly TCP events - check nat64_tcp_sm.h */
+
+ /* 0x24 */
+ u32 tcp_seq_num;
+
+ /* 0x28 */ /* unused1, unused2 and unused3 are put to make it in sync with
+ * cnat_session_db */
+ u32 unused1;
+
+ /* 0x2C */
+ u32 unused2;
+
+ /* 0x30 */
+ u16 unused3;
+
+ /* 0x32 - 50B */
+
+} nat64_session_entry_t;
+
+/*
+ * nat64_session_tcp_init_entry_t
+ * This structure will be used to store information about v4-initiated
+ * TCP entries.
+ */
+typedef struct {
+ nat64_v6_key_t v6_in_key;
+ nat64_v4_key_t v4_out_key;
+} nat64_session_tcp_init_entry_t;
+
+/*
+ * nat64_in_v6_frag_entry_t
+ * This structure will be used to store information about fragment flows
+ * that are coming from inside v6 hosts.
+ */
+typedef struct {
+ index_slist_t nat64_frag_in2out_hash;
+
+ u32 v6_src_addr[4];
+ u32 v6_destn_addr[4];
+ u32 frag_iden;
+ u16 vrf;
+ u16 pad1;
+} nat64_in_v6_frag_entry_t;
+
+/*
+ * nat64_out_v4_frag_entry_t
+ * This structure will be used to store information about fragment flows
+ * that are coming from outside v4 machines.
+ */
+typedef struct {
+ index_slist_t nat64_frag_out2in_hash;
+
+ u32 v4_src_addr;
+ u32 v4_destn_addr;
+ u16 frag_iden;
+ u16 vrf;
+} nat64_out_v4_frag_entry_t;
+
+/*
+ * nat64_timeout_t
+ * The following structures store information about the configured
+ * destination timeouts.
+ */
+typedef struct {
+ nat64_v4_key_t timeout_key;
+ u16 timeout_value;
+} nat64_timeout_t;
+
+/*
+ * nat64_timeout_db_entry_t
+ */
+typedef struct {
+ nat64_timeout_t t_key;
+ index_slist_t t_hash;
+} nat64_timeout_db_entry_t;
+
+
+typedef union {
+ cnat_main_db_entry_t nat44_main_db;
+ nat64_bib_entry_t nat64_bib_db;
+} cgse_nat_db_entry_t;
+
+typedef union {
+ cnat_session_entry_t nat44_session_db;
+ nat64_session_entry_t nat64_session_db;
+} cgse_nat_session_db_entry_t;
+
+typedef union {
+ cnat_user_db_entry_t nat44_user_db;
+ nat64_bib_user_entry_t nat64_user_db;
+} cgse_nat_user_db_entry_t;
+
+extern index_slist_t *nat64_bib_out2in_hash;
+extern index_slist_t *nat64_bib_in2out_hash;
+extern index_slist_t *nat64_bib_user_hash;
+extern index_slist_t *nat64_session_out2in_hash;
+extern index_slist_t *nat64_session_in2out_hash;
+extern index_slist_t *nat64_frag_out2in_hash;
+extern index_slist_t *nat64_frag_in2out_hash;
+extern index_slist_t *nat64_timeout_hash;
+
+extern nat64_bib_entry_t *nat64_bib_db;
+extern nat64_bib_user_entry_t *nat64_bib_user_db;
+extern nat64_session_entry_t *nat64_session_db;
+extern nat64_in_v6_frag_entry_t *nat64_in_frag_db;
+extern nat64_out_v4_frag_entry_t *nat64_out_frag_db;
+extern nat64_session_tcp_init_entry_t *nat64_tcp_init_db;
+extern nat64_timeout_db_entry_t *nat64_timeout_db;
+
+extern nat64_table_entry_t nat64_table_array[NAT64_MAX_NAT64_ENTRIES];
+extern nat64_table_entry_t *nat64_table_ptr;
+
+extern cgse_nat_db_entry_t *cgse_nat_db;
+extern cgse_nat_user_db_entry_t *cgse_user_db;
+extern cgse_nat_session_db_entry_t *cgse_session_db;
+
+void nat64_bib_user_db_delete (nat64_bib_user_entry_t *up);
+
+nat64_bib_user_entry_t*
+nat64_bib_user_db_create_entry(nat64_v6_key_t *uki, u32 bucket,
+ u32 portmap_index);
+
+nat64_bib_user_entry_t*
+nat64_bib_user_db_lookup_entry(nat64_v6_key_t *uki, u32 *bucket);
+
+
+nat64_bib_entry_t*
+nat64_bib_db_lookup_entry(nat64_v6_key_t *ki);
+
+void nat64_bib_db_in2out_hash_delete (nat64_bib_entry_t *ep);
+
+void nat64_bib_db_out2in_hash_delete (nat64_bib_entry_t *ep);
+
+nat64_bib_entry_t *
+nat64_create_bib_db_entry_and_hash(nat64_v6_key_t *ki,
+ nat64_v4_key_t *ko,
+ nat64_bib_user_entry_t *udb);
+
+
+void nat64_delete_bib_db_entry (nat64_bib_entry_t *ep, u8 force);
+
+nat64_bib_entry_t *
+nat64_bib_db_lookup_entry_out2in (nat64_v4_key_t *ko);
+
+nat64_bib_entry_t *
+nat64_get_bib_db_entry (nat64_v6_key_t *ki,
+ port_pair_t port_pair_type,
+ port_type_t port_type,
+ cnat_gen_icmp_info *info);
+
+
+nat64_bib_entry_t*
+nat64_create_static_bib_db_entry (nat64_v6_key_t *ki,
+ nat64_v4_key_t *ko,
+ nat64_table_entry_t *my_table,
+ cnat_gen_icmp_info *info);
+
+
+
+//void nat64_session_db_in2out_hash_delete (nat64_session_entry_t *ep);
+void nat64_session_db_out2in_hash_delete (nat64_session_entry_t *ep);
+
+/*nat64_session_entry_t *
+nat64_session_db_lookup_entry(nat64_v6_key_t *ki, u32 bib_index); */
+
+
+nat64_session_entry_t *
+nat64_session_db_lookup_entry_out2in (nat64_v4_key_t *ko,u32 bib_index);
+
+/*
+nat64_session_entry_t *
+nat64_create_session_db_entry(nat64_v6_key_t *ki,
+ nat64_v4_key_t *ko,
+ nat64_bib_entry_t *bdb);
+*/
+nat64_session_entry_t *
+nat64_create_session_db_entry_v2( nat64_v4_key_t *ko,
+ nat64_bib_entry_t *bdb);
+
+
+//void nat64_delete_session_db_entry (nat64_session_entry_t *ep);
+void nat64_delete_session_db_entry_v2 (nat64_session_entry_t *ep, u8 force);
+
+u32 nat64_timeout_db_hash_lookup (nat64_v4_key_t t_key);
+
+u16 query_and_update_db_timeout_nat64(nat64_session_entry_t *db);
+
+void nat64_timeout_db_hash_add (nat64_timeout_db_entry_t *t_entry);
+
+u16 nat64_timeout_db_create (nat64_timeout_t t_entry);
+
+void nat64_timeout_db_delete(nat64_v4_key_t t_key);
+
+#define NAT64_CMP_V6_KEY(key1, key2) \
+ memcmp(key1, key2, sizeof(nat64_v6_key_t))
+
+#define NAT64_CMP_V4_KEY(key1, key2) \
+ memcmp(key1, key2, sizeof(nat64_v4_key_t))
+
+
+#define NAT64_CMP_V6_IP(ip1, ip2) \
+ memcmp(ip1, ip2, (sizeof(u32) * 4))
+
+
+#define NAT64_CMP_V6_KEY1(key1, key2) \
+ ((key1.ipv6[0] == key2.ipv6[0]) && (key1.ipv6[1] == key2.ipv6[1]) && \
+  (key1.ipv6[2] == key2.ipv6[2]) && (key1.ipv6[3] == key2.ipv6[3]) && \
+  (key1.port == key2.port) && (key1.vrf == key2.vrf))
+
+
+#define NAT64_CMP_V6_IP1(ip1, ip2) \
+ ((ip1[0] == ip2[0]) && (ip1[1] == ip2[1]) && \
+ (ip1[2] == ip2[2]) && (ip1[3] == ip2[3]))
+
+#define NAT64_CMP_V4_KEY1(key1, key2) \
+ (key1.key64 == key2.key64)
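+
+/*
+ * Note (illustrative usage, not part of the original code): the memcmp()
+ * based macros above follow memcmp() semantics and return 0 when the keys
+ * match, whereas the *_KEY1 / *_IP1 variants evaluate non-zero on a match,
+ * so the two families are used with opposite senses:
+ *
+ *     if (NAT64_CMP_V6_KEY(&k1, &k2) == 0) { ... keys equal ... }
+ *     if (NAT64_CMP_V6_KEY1(k1, k2))       { ... keys equal ... }
+ */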
+
+
+extern u8 nat64_timeout_dirty_flag[NAT64_MAX_NAT64_ENTRIES];
+
+#endif
diff --git a/vnet/vnet/vcgn/nat64_defs.h b/vnet/vnet/vcgn/nat64_defs.h
new file mode 100644
index 00000000000..47e431a7462
--- /dev/null
+++ b/vnet/vnet/vcgn/nat64_defs.h
@@ -0,0 +1,576 @@
+/*
+ *------------------------------------------------------------------
+ * nat64_defs.h - NAT64 structure definitions
+ *
+ * Copyright (c) 2007-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __NAT64_DEFS_H__
+#define __NAT64_DEFS_H__
+
+#ifdef TOBE_PORTED
+#include "spp_platform_common.h"
+#include "cgse_defs.h"
+#include "xlat_defs.h"
+#endif
+#include "cnat_cli.h"
+#include "cnat_ports.h"
+#include "tcp_header_definitions.h"
+#include "nat64_tcp_sm.h"
+#include "cnat_db.h"
+
+#define NAT64_MAX_FRAG_ID_COUNTERS (256)
+
+#define NAT64_MAX_NAT64_ENTRIES 500
+
+#define NAT64_MAX_ID (NAT64_MAX_NAT64_ENTRIES-1)
+
+#define NAT64_INVALID_ID (0)
+
+#define NAT64_MAX_CFG_INSTANCES 64
+
+#define NAT64_TABLE_ENTRY_DELETED 0
+#define NAT64_TABLE_ENTRY_ACTIVE 1
+#define NAT64_TABLE_ENTRY_DORMANT 2
+#define NAT64_TABLE_ENTRY_INVALID_UIDB 3
+
+#define NAT64_MAX_TRANSLATION_ENTRIES PLATFORM_MAX_TRANSLATION_ENTRIES
+
+#define NAT64_WKP_PREFIX_LEN 96
+#define NAT64_WKP_PREFIX_0 0x0064FF9B
+#define NAT64_WKP_PREFIX_1 0x00000000
+#define NAT64_WKP_PREFIX_2 0x00000000
+#define NAT64_WKP_PREFIX_3 0x00000000
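+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original code):
+ * with the 96-bit well-known prefix 64:ff9b::/96 (RFC 6052), an IPv4
+ * address is embedded as the last 32 bits of the IPv6 address. Word byte
+ * order follows the convention of the defines above.
+ */
+static inline void
+nat64_wkp_embed_v4 (u32 v6_addr[4], u32 v4_addr)
+{
+    v6_addr[0] = NAT64_WKP_PREFIX_0; /* 0x0064FF9B == 64:ff9b:: */
+    v6_addr[1] = NAT64_WKP_PREFIX_1;
+    v6_addr[2] = NAT64_WKP_PREFIX_2;
+    v6_addr[3] = v4_addr;            /* v4 address in the low 32 bits */
+}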
+
+
+/* Reset the expiry time only if it is not 0.
+** If it is 0, the entry is queued for deletion by the clear command.
+**/
+
+#define NAT64_TIMEOUT_RST(db) \
+    if (PREDICT_TRUE((db)->entry_expires != 0)) \
+        (db)->entry_expires = cnat_current_time;
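+
+/*
+ * Note (illustrative, not part of the original code): the macro expands to
+ * a bare if statement, so an unbraced if/else around it would mis-bind the
+ * else clause to the macro's internal if:
+ *
+ *     if (refresh)
+ *         NAT64_TIMEOUT_RST(db)   <-- the else below would bind here
+ *     else
+ *         ...
+ *
+ * Callers should brace the surrounding branch.
+ */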
+
+extern u32 nat64_config_debug_level;
+extern u32 nat64_data_path_debug_level;
+
+extern u32 nat64_translation_create_count[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_translation_delete_count[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_translation_create_rate[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_translation_delete_rate[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_in2out_forwarding_count[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_in2out_forwarding_rate[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_out2in_forwarding_count[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_out2in_forwarding_rate[NAT64_MAX_NAT64_ENTRIES];
+
+extern u32 nat64_translation_create_count_old[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_translation_delete_count_old[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_in2out_forwarding_count_old[NAT64_MAX_NAT64_ENTRIES];
+extern u32 nat64_out2in_forwarding_count_old[NAT64_MAX_NAT64_ENTRIES];
+
+extern u16 *nat64_frag_id_counter_ptr;
+
+typedef struct {
+ u64 v6_to_v4_tcp_input_count;
+ u64 v6_to_v4_tcp_non_translatable_drop_count;
+ u64 v6_to_v4_tcp_state_drop_count;
+ u64 v6_to_v4_tcp_no_db_drop_count;
+ u64 v6_to_v4_tcp_output_count;
+} nat64_v6_to_v4_tcp_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_tcp_input_count;
+ u64 v4_to_v6_tcp_no_db_drop_count;
+ u64 v4_to_v6_tcp_v4_init_policy_drop_count;
+ u64 v4_to_v6_tcp_state_drop_count;
+ u64 v4_to_v6_tcp_output_count;
+ u64 v4_to_v6_tcp_filter_drop_count;
+} nat64_v4_to_v6_tcp_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_udp_input_count;
+ u64 v6_to_v4_udp_non_translatable_drop_count;
+ u64 v6_to_v4_udp_no_db_drop_count;
+ u64 v6_to_v4_udp_output_count;
+ u64 v6_to_v4_udp_checksum_zero_count;
+} nat64_v6_to_v4_udp_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_udp_input_count;
+ u64 v4_to_v6_udp_no_db_drop_count;
+ u64 v4_to_v6_udp_filter_drop_count;
+ u64 v4_to_v6_udp_output_count;
+ u64 v4_to_v6_udp_crc_zero_drop_count;
+ u64 v4_to_v6_udp_frag_crc_zero_drop_count;
+ u64 v4_to_v6_udp_crc_zero_recycle_sent_count;
+ u64 v4_to_v6_udp_crc_zero_recycle_drop_count;
+} nat64_v4_to_v6_udp_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_icmp_input_count;
+ u64 v6_to_v4_icmp_no_db_drop_count;
+ u64 v6_to_v4_icmp_non_translatable_drop_count;
+ u64 v6_to_v4_icmp_qry_output_count;
+} nat64_v6_to_v4_icmp_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_icmp_input_count;
+ u64 v4_to_v6_icmp_no_db_drop_count;
+ u64 v4_to_v6_icmp_filter_drop;
+ u64 v4_to_v6_icmp_qry_output_count;
+} nat64_v4_to_v6_icmp_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_icmp_error_input_count;
+ u64 v6_to_v4_icmp_error_no_db_drop_count;
+ u64 v6_to_v4_icmp_error_invalid_next_hdr_drop_count;
+ u64 v6_to_v4_icmp_error_non_translatable_drop_count;
+ u64 v6_to_v4_icmp_error_unsupported_type_drop_count;
+ u64 v6_to_v4_icmp_error_output_count;
+} nat64_v6_to_v4_icmp_error_counter_t;
+
+typedef struct {
+ u64 v4_to_v6_icmp_error_input_count;
+ u64 v4_to_v6_icmp_error_no_db_drop_count;
+ u64 v4_to_v6_icmp_error_unsupported_type_drop_count;
+ u64 v4_to_v6_icmp_error_unsupported_protocol_drop_count;
+ u64 v4_to_v6_icmp_error_output_count;
+} nat64_v4_to_v6_icmp_error_counter_t;
+
+
+
+typedef struct {
+ u64 nat64_v4_frag_input_count;
+ u64 nat64_v4_frag_forward_count;
+ u64 nat64_v4_frag_drop_count;
+ u64 nat64_v4_frag_throttled_count;
+ u64 nat64_v4_frag_timeout_drop_count;
+ u64 nat64_v4_frag_tcp_input_count;
+ u64 nat64_v4_frag_udp_input_count;
+ u64 nat64_v4_frag_icmp_input_count;
+
+ u64 nat64_v6_frag_input_count;
+ u64 nat64_v6_frag_forward_count;
+ u64 nat64_v6_frag_drop_count;
+ u64 nat64_v6_frag_throttled_count;
+ u64 nat64_v6_frag_timeout_drop_count;
+ u64 nat64_v6_frag_tcp_input_count;
+ u64 nat64_v6_frag_udp_input_count;
+ u64 nat64_v6_frag_icmp_input_count;
+ u64 nat64_v6_frag_invalid_input_count;
+} nat64_frag_counter_t;
+
+typedef struct {
+ u64 v6_to_v4_options_input_count;
+ u64 v6_to_v4_options_drop_count;
+ u64 v6_to_v4_options_forward_count;
+ u64 v6_to_v4_options_no_db_drop_count;
+ u64 v6_to_v4_unsupp_proto_count;
+
+ u64 v4_to_v6_options_input_count;
+ u64 v4_to_v6_options_drop_count;
+ u64 v4_to_v6_options_forward_count;
+ u64 v4_to_v6_options_no_db_drop_count;
+ u64 v4_to_v6_unsupp_proto_count;
+} nat64_options_counter_t;
+
+typedef struct {
+ u64 v4_icmp_gen_count;
+ u64 v6_icmp_gen_count;
+} nat64_icmp_gen_counter_t;
+
+typedef struct{
+ u32 nat64_num_translations;
+ u32 nat64_num_dynamic_translations;
+ u32 nat64_num_static_translations;
+ u32 nat64_sessions;
+ u64 nat64_port_limit_exceeded;
+ u64 nat64_system_limit_reached;
+ u64 nat64_resource_depletion_drops;
+ u64 nat64_no_translation_entry_drops;
+ u64 nat64_filtering_drops ;
+ u64 nat64_invalid_ipv6_prefix_drops;
+ u32 num_subscribers;
+ u32 dummy;
+ u64 drops_sessiondb_limit_exceeded;
+} nat64_inst_gen_counter_t;
+
+typedef struct {
+
+ nat64_v6_to_v4_tcp_counter_t v64_tcp_counters;
+ nat64_v4_to_v6_tcp_counter_t v46_tcp_counters;
+ nat64_v6_to_v4_udp_counter_t v64_udp_counters;
+ nat64_v4_to_v6_udp_counter_t v46_udp_counters;
+ nat64_v6_to_v4_icmp_counter_t v64_icmp_counters;
+ nat64_v4_to_v6_icmp_counter_t v46_icmp_counters;
+ nat64_v6_to_v4_icmp_error_counter_t v64_icmp_error_counters;
+ nat64_v4_to_v6_icmp_error_counter_t v46_icmp_error_counters;
+ nat64_frag_counter_t nat64_frag_counters;
+ nat64_options_counter_t nat64_options_counters;
+ nat64_icmp_gen_counter_t nat64_icmp_gen_counters;
+
+} nat64_counters_t;
+
+/*
+ * nat64_portmap_t
+ * This structure stores information about the IP address and the ports
+ * available for NAT for this nat64 instance.
+ */
+
+typedef struct {
+ u32 delete_time;
+ u32 last_sent_timestamp;
+ u32 inuse;
+ u32 ipv4_address; /* native bit order */
+ uword bm[(BITS_PER_INST + BITS(uword)-1)/BITS(uword)];
+} nat64_portmap_t;
+
+/*
+ * nat64_v4_db_key_t
+ * This structure gives information about the v4 transport address
+ * (ipv4, port, protocol)
+ */
+typedef struct {
+ u32 ipv4;
+ u16 port;
+ u16 vrf; //bit0-12:inst_id, bit13:unused, bit14-15:protocol
+} nat64_v4_db_key_t;
+
+/* Union will be easier while compare/hash */
+typedef union {
+ nat64_v4_db_key_t k;
+ u64 key64;
+} nat64_v4_key_t;
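+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original code):
+ * because the whole v4 key fits in 64 bits, equality checks and hash
+ * computations can work on key64 directly instead of comparing the
+ * ipv4/port/vrf fields one by one.
+ */
+static inline int
+nat64_v4_key_equal (nat64_v4_key_t a, nat64_v4_key_t b)
+{
+    return a.key64 == b.key64; /* one compare covers ipv4 + port + vrf */
+}
+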
+/*
+ * nat64_v6_db_key_t
+ * This structure gives information about the v6 transport address
+ * (ipv6, port, protocol)
+ */
+typedef struct {
+ u32 ipv6[4];
+ u16 port;
+ u16 vrf; //bit0-12:inst_id, bit13:unused, bit14-15:protocol
+} nat64_v6_key_t;
+
+
+typedef struct {
+ u16 udp_timeout;
+ u16 tcp_trans_timeout;
+ u16 tcp_est_timeout;
+ u16 tcp_v4_init_timeout;
+ u16 frag_timeout;
+ u16 icmp_timeout;
+} nat64_timeout_info_t;
+
+#define NAT64_UDP_DEF 300 /* 5min */
+#define NAT64_TCP_TRANS_DEF 240 /* 4min */
+#define NAT64_TCP_EST_DEF 7200 /* 2Hrs */
+#define NAT64_TCP_V4_DEF 6 /* 6 sec */
+#define NAT64_FRAG_DEF 2 /* 2 sec */
+#define NAT64_ICMP_DEF 60 /* 60 sec */
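+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original code):
+ * a per-instance nat64_timeout_info_t would typically be seeded from the
+ * defaults above before any configuration overrides them.
+ */
+static inline void
+nat64_timeout_info_set_defaults (nat64_timeout_info_t *ti)
+{
+    ti->udp_timeout         = NAT64_UDP_DEF;
+    ti->tcp_trans_timeout   = NAT64_TCP_TRANS_DEF;
+    ti->tcp_est_timeout     = NAT64_TCP_EST_DEF;
+    ti->tcp_v4_init_timeout = NAT64_TCP_V4_DEF;
+    ti->frag_timeout        = NAT64_FRAG_DEF;
+    ti->icmp_timeout        = NAT64_ICMP_DEF;
+}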
+
+/*
+ * nat64_table_entry_t
+ * This structure is used to store information regarding every nat64 instance.
+ */
+
+/* This structure holds the L4 information of a particular frag stream set:
+ * src_port - holds the original src port
+ * dst_port - holds the original dst port
+ * total_len - useful only in ICMP nodes
+ * cnat_prot - value used for lookups
+ * next_prot - protocol after translation */
+
+typedef struct l4_frag_info {
+ u16 next_node_idx;
+ u16 src_port;
+ u16 dst_port;
+ u16 total_length;
+ u8 protocol;
+ u16 cnat_prot;
+ u16 next_prot;
+} l4_frag_info_t;
+
+typedef struct {
+ u16 state;
+ u16 nat64_id; /* nat64_id value for this table entry - for easy access */
+
+ u16 v4_uidb_index; /* V4 uidb index */
+ u16 v6_uidb_index; /* V6 uidb index */
+
+ u8 octet0_position;
+ u8 octet1_position;
+ u8 octet2_position;
+ u8 octet3_position;
+
+ u16 v4_to_v6_tcp_mss; /* TCP MSS */
+ u16 v6_to_v4_tcp_mss; /* TCP MSS */
+
+ /*
+ * V6 NAT64 prefix value and mask size
+ */
+ u32 v6_prefix[4];
+ u32 v6_prefix_mask[4];
+
+ u8 v6_prefix_mask_len;
+ u8 ubits_reserved_on;
+#define IPV4_TOS_OVERRIDE_FLAG 0x1
+#define IPV6_TOS_OVERRIDE_FLAG 0x2
+#define NAT64_STFUL_RTSP_ALG_ENABLE 0x4
+ u8 feature_flags;
+
+ u8 ipv4_tos_value;
+ u8 ipv6_tos_value;
+ u8 df_bit_clear;
+ u8 ipv6_mtu_set;
+
+ u8 filtering_policy;
+#define NAT64_ADDRESS_DEPENDENT_ENABLE 1
+ u8 tcp_policy;
+#define NAT64_TCP_SECURITY_FLAG_DISABLE 1
+ u8 ftp_flags;
+
+ u8 tcp_v4_init_enable;
+#define NAT64_TCP_V4_INIT_ENABLE 1
+
+ u8 logging_policy;
+#define NAT64_BIB_LOG_ENABLE 0 /* Default */
+#define NAT64_SESSION_LOG_ENABLE 1
+
+#define NAT64_BIDIR_REFRESH 1 /* 1 - timer refresh in both direction */
+#define NAT64_UNIDIR_REFRESH 0 /* 0 - default (only v6 side refresh timer)*/
+
+ u8 nat64_refresh_both_direction; /* 0 - default (only v6 side refresh timer) */
+
+ u8 udp_zero_checksum; /* 0 - default (calc checksum) */
+#define NAT64_UDP_ZERO_CHECKSUM_DROP 1 /* 1 -drop */
+
+ u16 port_limit;
+
+ cnat_portmap_v2_t *port_map;
+
+ u32 logging_index;
+
+ nat64_timeout_info_t timeout_info;
+ /*
+ * These fields are not used much, let us keep it in the end
+ */
+ u32 v4_vrf_id; /* V4 vrf id */
+ u32 v6_vrf_id; /* V6 vrf id */
+
+ u32 v4_if_num; /* V4 SVI ifnum */
+ u32 v6_if_num; /* V6 SVI ifnum */
+
+ u16 dyn_start_port;
+
+ u16 pcp_server_port;
+ u32 pcp_server_addr[4];
+ u32 rseed_ip;
+#define NAT64_FRAG_ENABLE 1
+#define NAT64_FRAG_DISABLE 0
+ u8 frag_state;
+ u8 nat64_enable; /* Enable/Disable this instance. */
+
+ u16 rtsp_port;
+
+} nat64_table_entry_t;
+
+
+
+extern nat64_table_entry_t nat64_table_array[NAT64_MAX_NAT64_ENTRIES];
+extern nat64_table_entry_t *nat64_table_ptr;
+extern nat64_counters_t nat64_all_counters[NAT64_MAX_NAT64_ENTRIES];
+extern nat64_inst_gen_counter_t nat64_inst_gen_counters[NAT64_MAX_NAT64_ENTRIES];
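+
+/*
+ * Note (illustrative usage, not part of the original code): counters are
+ * kept per instance and indexed by the nat64 id, e.g. accounting a v6->v4
+ * TCP input packet for instance "id" would look like:
+ *
+ *     nat64_all_counters[id].v64_tcp_counters.v6_to_v4_tcp_input_count++;
+ */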
+
+typedef struct nat64_common_pipeline_data_ {
+#ifdef TOBE_PORTED
+ spp_node_main_vector_t *nmv;
+#endif
+
+ u16 *nat64_id_ptr;
+
+ nat64_table_entry_t *nat64_entry_ptr;
+
+} nat64_common_pipeline_data_t;
+
+typedef struct nat64_v6_to_v4_pipeline_data_ {
+ nat64_common_pipeline_data_t common_data;
+
+ u32 bib_bucket;
+ u32 session_bucket;
+
+ nat64_v6_key_t v6_in_key;
+ nat64_v6_key_t v6_dest_key;
+
+ /*
+ * IPv6 data, everything in host order except for the addr fields
+ */
+ u32 version_trafficclass_flowlabel;
+
+ u16 payload_length;
+ u8 next_header;
+ u8 hop_limit;
+
+ /*
+ * These Address fields are in Network Order, so that
+ * it is easy to extract the IPv4 address from them
+ */
+ u32 ipv6_src[4];
+
+ u32 ipv6_dst[4];
+
+ u8 frag_next_header;
+ u8 frag_reserved;
+ u16 frag_offset_res_m;
+ u32 frag_identification;
+
+ ipv4_header *ipv4_header;
+ union {
+ struct _v4_l4_info {
+ u8 *ipv4_l4_header;
+ u8 pad0;
+ u8 pad1;
+ u8 pad2;
+ u8 pad3;
+ } v4_l4_info;
+ struct _v4_icmp_info {
+ icmp_v4_t *ipv4_icmp_header;
+ u8 old_icmp_type;
+ u8 new_icmp_type;
+ u8 old_icmp_code;
+ u8 new_icmp_code;
+ u16 checksum;
+ u16 old_iden; // length (ICMP extn), ptr (param)
+ u16 new_iden; // ----- do -------------
+ u16 old_seq; // MTU for PTB case
+ u16 new_seq; // ----- do -------------
+ } v4_icmp_info;
+ struct _v4_udp_info {
+ udp_hdr_type_t *ipv4_udp_header;
+ u8 pad0;
+ u8 pad1;
+ u8 pad2;
+ u8 pad3;
+ } v4_udp_info;
+ struct _v4_tcp_info {
+ tcp_hdr_type *ipv4_tcp_header;
+ u16 old_src_port;
+ u16 new_src_port;
+ u16 dest_port;
+ nat64_tcp_events tcp_event;
+ } v4_tcp_info;
+ } l4_u;
+
+
+ l4_frag_info_t *frag_info; /* port for tcp/udp, ident - icmp */
+
+
+ /* Counters will be added here */
+ union {
+ nat64_v6_to_v4_tcp_counter_t *tcp_counter;
+ nat64_v6_to_v4_udp_counter_t *udp_counter;
+ nat64_v6_to_v4_icmp_counter_t *icmp_counter;
+ nat64_v6_to_v4_icmp_error_counter_t *icmp_error_counter;
+ nat64_frag_counter_t *frag_counter;
+ nat64_options_counter_t *options_counter;
+ } nat64_ctr_u;
+ nat64_icmp_gen_counter_t *icmp_gen_counter;
+} nat64_v6_to_v4_pipeline_data_t;
+
+
+typedef struct nat64_v4_to_v6_pipeline_data_ {
+ nat64_common_pipeline_data_t common_data;
+
+ u32 bib_bucket;
+ u32 session_bucket;
+
+ nat64_v4_key_t v4_src_key; /* Will be translated using Prefix */
+ nat64_v4_key_t v4_dest_key; /* will be the out key for NAT64 */
+
+ /*
+ * IPv4 data
+ */
+ u8 version_hdr_len_words;
+ u8 tos;
+ u16 total_len_bytes;
+
+ u16 identification;
+ u16 frag_flags_offset;
+
+ u8 ttl;
+ u8 protocol;
+ u16 l4_checksum;
+
+ u32 ipv4_src_addr;
+ u32 ipv4_dst_addr;
+
+ /*
+ * Pointers to IPv6 headers
+ */
+ ipv6_header_t *ipv6_header;
+ ipv6_frag_header_t *ipv6_frag_header;
+
+ union {
+ struct _v6_l4_info {
+ u8 *ipv6_l4_header;
+ u8 pad0;
+ u8 pad1;
+ u8 pad2;
+ u8 pad3;
+ } v6_l4_info;
+ struct _v6_icmp_info {
+ icmp_v6_t *ipv6_icmp_header;
+ u8 old_icmp_type;
+ u8 new_icmp_type;
+ u8 old_icmp_code;
+ u8 new_icmp_code;
+ u16 old_iden; // length (ICMP extn), ptr (param)
+ u16 new_iden; // ----- do -------------
+ u16 old_seq; // MTU for PTB case
+ u16 new_seq; // ----- do -------------
+ } v6_icmp_info;
+ struct _v6_udp_info {
+ udp_hdr_type_t *ipv6_udp_header;
+ u8 pad0;
+ u8 pad1;
+ u8 pad2;
+ u8 pad3;
+ } v6_udp_info;
+ struct _v6_tcp_info {
+ tcp_hdr_type *ipv6_tcp_header;
+ u16 old_dest_port;
+ u16 new_dest_port;
+ u16 src_port;
+ nat64_tcp_events tcp_event;
+ } v6_tcp_info;
+ } l4_u;
+
+ l4_frag_info_t *frag_info; /* port for tcp/udp, ident - icmp */
+
+ /* Need to add counters here */
+ union {
+ nat64_v4_to_v6_tcp_counter_t *tcp_counter;
+ nat64_v4_to_v6_udp_counter_t *udp_counter;
+ nat64_v4_to_v6_icmp_counter_t *icmp_counter;
+ nat64_v4_to_v6_icmp_error_counter_t *icmp_error_counter;
+ nat64_frag_counter_t *frag_counter;
+ nat64_options_counter_t *options_counter;
+ } nat64_ctr_u;
+ nat64_icmp_gen_counter_t *icmp_gen_counter;
+
+} nat64_v4_to_v6_pipeline_data_t;
+
+#endif
diff --git a/vnet/vnet/vcgn/nat64_tcp_sm.h b/vnet/vnet/vcgn/nat64_tcp_sm.h
new file mode 100644
index 00000000000..3a505bc1649
--- /dev/null
+++ b/vnet/vnet/vcgn/nat64_tcp_sm.h
@@ -0,0 +1,91 @@
+/*
+ *------------------------------------------------------------------
+ * nat64_tcp_sm.h - Stateful NAT64 translation TCP State machine
+ *
+ * Copyright (c) 2011 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#ifndef __NAT64_TCP_SM_H__
+#define __NAT64_TCP_SM_H__
+
+
+/* TCP States */
+typedef enum {
+ TCP_CLOSED,
+ TCP_V4_INIT,
+ TCP_V6_INIT,
+ TCP_ESTABLISHED,
+ TCP_V4_FIN_RCV,
+ TCP_V6_FIN_RCV,
+ TCP_V4V6_FIN_RCV,
+ TCP_TRANS,
+ TCP_NONE
+} nat64_tcp_states;
+
+/* TCP Events */
+typedef enum {
+ TCP_TIMEOUT_EV,
+ TCP_V6_SYN_EV,
+ TCP_V4_SYN_EV,
+ TCP_V6_FIN_EV,
+ TCP_V4_FIN_EV,
+ TCP_V6_RST_EV,
+ TCP_V4_RST_EV,
+ TCP_DEFAULT_EV,
+ TCP_EV_COUNT
+} nat64_tcp_events;
+
+/* TCP Actions */
+typedef enum {
+ TCP_FORWARD,
+ TCP_COND_FORWARD, /* Conditional forward, based on presence of
+ * session and bib entries */
+ TCP_STORE,
+ TCP_PROBE,
+ TCP_CREATE_SESSION,
+ TCP_DELETE_SESSION,
+ TCP_DROP,
+ TCP_ACTION_NONE,
+ TCP_ACTION_COUNT
+} nat64_tcp_actions;
+
+typedef struct {
+ nat64_tcp_states next_state;
+ nat64_tcp_actions action;
+} nat64_tcp_trans_t;
+
+typedef struct {
+ nat64_tcp_trans_t event[TCP_EV_COUNT];
+} nat64_tcp_state_trans_t;
+
+extern nat64_tcp_state_trans_t nat64_tcp_sm_lookup[TCP_NONE];
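+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original code):
+ * a transition is looked up by indexing the table with the current state
+ * and the observed event; the entry yields the next state and the action
+ * to perform. The actual transition contents are defined in the
+ * corresponding .c file. Valid only for state < TCP_NONE and
+ * ev < TCP_EV_COUNT.
+ */
+static inline nat64_tcp_trans_t
+nat64_tcp_sm_next (nat64_tcp_states state, nat64_tcp_events ev)
+{
+    return nat64_tcp_sm_lookup[state].event[ev];
+}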
+
+/*
+inline void
+nat64_update_v6_to_v4_tcp (nat64_v6_to_v4_pipeline_data_t *pctx_ptr,
+ nat64_bib_entry_t *bib_ptr);
+
+inline u8 nat64_v6_to_v4_tcp_perform_action (
+ spp_ctx_t *ctx,
+ nat64_v6_to_v4_pipeline_data_t *pctx_ptr,
+ nat64_bib_entry_t *bib_db,
+ nat64_session_entry_t *session_db);
+
+inline void
+nat64_copy_tcp_into_pctx (nat64_v6_to_v4_pipeline_data_t *pctx_ptr);
+*/
+
+
+
+#endif
diff --git a/vnet/vnet/vcgn/platform_common.h b/vnet/vnet/vcgn/platform_common.h
new file mode 100644
index 00000000000..2805b6078ce
--- /dev/null
+++ b/vnet/vnet/vcgn/platform_common.h
@@ -0,0 +1,136 @@
+/*
+ *---------------------------------------------------------------------------
+ * platform_common.h -- file that has all platform related macros defined
+ *                      as no-ops; the included "platform_common_override.h"
+ *                      will have the actual platform specific defines
+ *
+ * Copyright (c) 2011-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+#ifndef __PLATFORM_COMMON_H__
+#define __PLATFORM_COMMON_H__
+
+/* $$$ FIXME causes printf format warnings */
+#define PLATFORM_DEBUG_PRINT(...) /* printf(__VA_ARGS__) */
+#define PLATFORM_FILL_DST_ADDR_PORT_TABLE
+#define PLATFORM_SET_CTX_RU_TX_FROM_NODE(ctx, value)
+#define PLATFORM_SET_CTX_RU_TX_DST_IPPORT_IDX(ctx, value)
+#define PLATFORM_SET_CTX_RU_TX_PKT_TYPE(ctx, type)
+#define PLATFORM_SET_RX_VRF(ctx, rx_vrf, hardcode, mask)
+#define PLATFORM_SET_TX_VRF(ctx, tx_vrf)
+/* PLATFORM_CNAT_SET_RX_VRF definition is not same as PLATFORM_SET_RX_VRF,
+ * So, maintaining two different definitions
+ */
+#define PLATFORM_CNAT_SET_RX_VRF(ctx, rx_vrf, proto)
+#define PLATFORM_CNAT_SET_TX_VRF(ctx, tx_vrf)
+
+#define PLATFORM_PRINT_TESTING_PG()
+#define PLATFORM_INIT_TEMP_SENSORS()
+#define PLATFORM_READ_CPU_SENSORS(value)
+#define PLATFORM_SET_TEMP_READ_BLOCK(var, val)
+#define PLATFORM_NFV9_DISP_NODE_IDX
+
+
+/* Assumption is, syslog packets
+ * are sent out via same channel as that of NFV9.
+ * Has to be overridden if this assumption is false.
+ */
+#define PLATFORM_SYSLOG_DISP_NODE_IDX PLATFORM_NFV9_DISP_NODE_IDX
+
+#define PLATFORM_CNAT_DB_DUMP_POLICY_PRINT()
+#define PLATFORM_PRINT_CTX_VALUES(ctx)
+#define PLATFORM_ADD_VRF_MAP_HANDLE_PARTITION(uidb_index, partition_id)
+#define PLATFORM_DEL_VRF_MAP_HANDLE_PARTITION(uidb_index, partition_id)
+#define PLATFORM_ALLOC_NFV9_PKT_BUFFER(ctx, to_lc_cpu)
+#define PLATFORM_SET_DSLITE_ENABLE_FLAG(uidb_index, dslite_id)
+#define PLATFORM_CHECK_DSLITE_ENABLE_FLAG
+
+#define PLATFORM_CNAT_INSTS 1
+#define PLATFORM_HANDLE_TTL_DECREMENT 0 // Don't handle TTL in NAT44 Application (default).
+
+// For ISM, we need to copy the ipv6->hop_limit to ipv4 ttl.
+#define PLATFORM_6RD_COPY_TTL_IPV6_TO_IPV4 0
+
+// For the ISM case, TTL 1 must be allowed since the TTL decrement happens at the ingress LC
+#define PLATFORM_6RD_ALLOW_TTL_1 0
+
+#define PLATFORM_HANDLE_ICMP_TTL_EXPIRED 0 // Don't handle ICMP_ERROR msg for TTL <=1 in NAT44 App (default).
+
+#define PLATFORM_IPV4_FRAG_FRAG_HOLD_LIMIT 1
+#define PLATFORM_MAX_IPV4_CTX_ENTRIES 1
+#define PLATFORM_MAPE_FRAG 0
+
+#define PLATFORM_ADDR_MASK_PER_CORE 0
+#define PLATFORM_ADDR_MASK_PER_CORE_PER_PARTITION 0
+#define PLATFORM_MAX_CORES 1
+#define PLATFORM_MAX_CORES_PER_PARTITION 1
+#define PLATFORM_MAX_NAT_ENTRIES 1
+#define PLATFORM_MAX_USER_ENTRIES 1
+#define PLATFORM_CNAT_MAX_ADDR_POOL_SIZE 0x1
+#define PLATFORM_DBL_SUPPORT 0 /* Default no DBL support, no NAT44 session table */
+
+#define PLATFORM_MAX_DB_ENTRY_PER_SCAN 1
+#define PLATFORM_MAX_DB_ENTRY_SELECTED_PER_SCAN 1
+#define MAX_COMBINED_DB_ENTRIES_PER_SCAN 0
+
+#define PLATFORM_CNAT_TIMEOUT_IPPROT_MASK 0
+#define PLATFORM_CNAT_TIMEOUT_PORTPROT_MASK 0
+
+#define PLATFORM_MAX_SHOW_BUFFER_SIZE 1700
+#define PLATFORM_MAX_TRANSLATION_ENTRIES (50)
+#define PLATFORM_MAX_UTIL_ENTRIES (100)
+#define PLATFORM_MAX_NAT44_UTIL_ENTRIES ((64)/PLATFORM_MAX_CORES)
+
+#define PLATFORM_CNAT_NFV9_SHIM_HDR_OFFSET 0
+#define PLATFORM_CNAT_NFV9_L2_ENCAPS_OFFSET 0
+
+
+/* Below are nat64 statful related define */
+#define PLATFORM_NAT64_SET_RX_VRF(rx_vrf, proto, inst_id) \
+ rx_vrf = proto | (inst_id & CNAT_VRF_MASK);
+
+#define PLATFORM_NAT64_MAX_TRANSLATION_ENTRIES (30)
+#define PLATFORM_DS_LITE_MAX_TRANSLATION_ENTRIES (30)
+
+#define PLATFORM_SET_NAT64_ENABLE_FLAG(uidb_index, nat64_id) \
+ { \
+ nat64_set_enable_flag(nat64_id, ENABLE); \
+ }
+
+#define PLATFORM_CHECK_NAT64_ENABLE_FLAG 1
+#define PLATFORM_SET_MAPE_ENABLE_FLAG(uidb_index, mape_id)
+#define PLATFORM_CHECK_MAPE_ENABLE_FLAG 1
+
+/* Very small number; the PD has the correct value.
+   If the platform does not support nat64, this should not take much memory. */
+#define PLATFORM_NAT64_MAX_SESSIONS 10
+#define PLATFORM_NAT64_TIMEOUT_HASH_SIZE 10
+#define PLATFORM_MAP_ADDR_PER_CORE 1024
+
+#define ENABLE 1
+#define DISABLE 0
+
+/* Platform Xlat inline learn function */
+#define PLATFORM_INLINE_LEARN_FUNC(a,b,c)
+
+
+/* Checksum calculation to be done in software */
+#define PLATFORM_XLAT_SW_CHECKSUM_CALC 0
+
+
+/* The include below overrides all the above null defs with the platform
+   specific defines */
+#include "platform_common_override.h"
+
+#endif /* __PLATFORM_COMMON_H__ */
diff --git a/vnet/vnet/vcgn/platform_common_override.h b/vnet/vnet/vcgn/platform_common_override.h
new file mode 100644
index 00000000000..d6d3b0785b5
--- /dev/null
+++ b/vnet/vnet/vcgn/platform_common_override.h
@@ -0,0 +1,304 @@
+/*
+ *---------------------------------------------------------------------------
+ * platform_common_override.h -- file with the actual platform specific
+ *                               defines. Only included by platform_common.h
+ *
+ * Copyright (c) 2011-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#ifndef __PLATFORM_COMMON_OVERRIDE_H__
+#define __PLATFORM_COMMON_OVERRIDE_H__
+
+extern unsigned char my_octeon_id;
+
+#undef PLATFORM_DBL_SUPPORT
+#define PLATFORM_DBL_SUPPORT 1 // Destination Based logging support
+ // NAT44 session table required.
+
+#undef PLATFORM_ADDR_MASK_PER_CORE
+/* Commented out; currently only a single core is considered */
+//#define PLATFORM_ADDR_MASK_PER_CORE 0x3f // Using 64 cores
+#define PLATFORM_ADDR_MASK_PER_CORE 0x01
+
+#undef MAX_COMBINED_DB_ENTRIES_PER_SCAN
+#define MAX_COMBINED_DB_ENTRIES_PER_SCAN 128
+
+#undef PLATFORM_MAX_CORES
+#define PLATFORM_MAX_CORES (PLATFORM_ADDR_MASK_PER_CORE + 1)
+
+// Roddick does not have any partition of cores
+#undef PLATFORM_ADDR_MASK_PER_CORE_PER_PARTITION
+#define PLATFORM_ADDR_MASK_PER_CORE_PER_PARTITION \
+ PLATFORM_ADDR_MASK_PER_CORE
+
+#undef PLATFORM_MAX_CORES_PER_PARTITION
+#define PLATFORM_MAX_CORES_PER_PARTITION PLATFORM_MAX_CORES
+
+#undef PLATFORM_CNAT_INSTS
+//#define PLATFORM_CNAT_INSTS 64
+#define PLATFORM_CNAT_INSTS 1 /* currently its only single instance */
+
+#undef PLATFORM_MAX_NAT_ENTRIES
+//#define PLATFORM_MAX_NAT_ENTRIES 20000000 // 20M
+#define PLATFORM_MAX_NAT_ENTRIES 1666660 // ~80M/48 (79999680/48)
+
+#undef PLATFORM_MAX_USER_ENTRIES
+#define PLATFORM_MAX_USER_ENTRIES 20800 // ~1M/48 (998400/48)
+
+
+/* 524288:
+ (20000000 translations) / (64 CNAT INSTANCES) = 312500
+ nearest higher number which is power of 2 next to 312500
+*/
+#undef PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE
+//#define PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE 524288
+#define PLATFORM_CNAT_MAIN_PRELIM_HASH_SIZE (5<<20)
+/* 4096:
+ (200000 users) / (64 CNAT INSTANCES) = 3125
+ nearest higher number which is power of 2 next to 3125
+*/
+#undef PLATFORM_CNAT_USER_PRELIM_HASH_SIZE
+#define PLATFORM_CNAT_USER_PRELIM_HASH_SIZE 4096
+
+#undef PLATFORM_CNAT_MAX_ADDR_POOL_SIZE
+#define PLATFORM_CNAT_MAX_ADDR_POOL_SIZE 0x10000 // max /16
+
+#undef PLATFORM_MAX_DB_ENTRY_PER_SCAN
+#define PLATFORM_MAX_DB_ENTRY_PER_SCAN 400
+
+#undef PLATFORM_MAX_DB_ENTRY_SELECTED_PER_SCAN
+#define PLATFORM_MAX_DB_ENTRY_SELECTED_PER_SCAN 100 // 1/4th of above
+
+#undef PLATFORM_CNAT_TIMEOUT_IPPROT_MASK
+#define PLATFORM_CNAT_TIMEOUT_IPPROT_MASK 0xFFFFFFFF0000FFFF
+
+#undef PLATFORM_CNAT_TIMEOUT_PORTPROT_MASK
+#define PLATFORM_CNAT_TIMEOUT_PORTPROT_MASK 0x00000000FFFFFFFF
+
+#ifdef TARGET_RODDICK /* EVB doesn't need it */
+#undef PLATFORM_FILL_DST_ADDR_PORT_TABLE
+#define PLATFORM_FILL_DST_ADDR_PORT_TABLE fill_dst_addr_port_table();
+#endif
+
+
+#ifndef RODDICK_ON_LINUX_OR_EVB
+#undef PLATFORM_SET_CTX_RU_TX_FROM_NODE
+#undef PLATFORM_SET_CTX_RU_TX_DST_IPPORT_IDX
+#undef PLATFORM_SET_CTX_RU_TX_PKT_TYPE
+
+#define PLATFORM_SET_CTX_RU_TX_FROM_NODE(ctx, value) \
+ (vnet_buffer(ctx))->vcgn_uii.ru.tx.from_node = value;
+#define PLATFORM_SET_CTX_RU_TX_DST_IPPORT_IDX(ctx, value) \
+ (vnet_buffer(ctx))->vcgn_uii.ru.tx.dst_ip_port_idx = value;
+#define PLATFORM_SET_CTX_RU_TX_PKT_TYPE(ctx, type) \
+ (vnet_buffer(ctx))->vcgn_uii.ru.tx.packet_type = type;
+#endif
+
+#undef PLATFORM_SET_RX_VRF
+#undef PLATFORM_SET_TX_VRF
+#ifdef TARGET_RODDICK
+#define PLATFORM_SET_RX_VRF(ctx, rx_vrf, hardcode, mask) \
+ rx_vrf = (ctx->ru.rx.uidb_index & CNAT_VRF_MASK);
+#define PLATFORM_SET_TX_VRF(ctx, tx_vrf) \
+ ctx->ru.tx.uidb_index = tx_vrf;
+#else /*EVB */
+#define PLATFORM_SET_RX_VRF(ctx, rx_vrf, hardcode, mask) \
+ rx_vrf = hardcode;
+#define PLATFORM_SET_TX_VRF(ctx, tx_vrf)
+#endif
+
+#undef PLATFORM_CNAT_SET_RX_VRF
+#undef PLATFORM_CNAT_SET_TX_VRF
+
+#define PLATFORM_CNAT_SET_RX_VRF(if_index, rx_vrf, proto) \
+ rx_vrf = proto | ((if_index) & CNAT_VRF_MASK);
+
+#define PLATFORM_CNAT_SET_TX_VRF(if_index, tx_vrf) \
+ (if_index) = ((tx_vrf) & CNAT_VRF_MASK);
+
+
+
+#undef PLATFORM_NAT64_SET_RX_VRF
+
+#ifdef TARGET_RODDICK
+
+#define PLATFORM_NAT64_SET_RX_VRF(rx_vrf, proto, inst_id) \
+ rx_vrf = proto | (inst_id & CNAT_VRF_MASK);
+
+#else /* EVB */
+
+#define PLATFORM_NAT64_SET_RX_VRF(rx_vrf, proto, inst_id) \
+ rx_vrf = proto | inst_id;
+
+#endif
+
+#ifdef TARGET_EVB
+#define VRF_MAP_CONFIG
+#endif
+
+#undef PLATFORM_PRINT_TESTING_PG
+#if defined(TARGET_LINUX_UDVR) || defined(CNAT_PG)
+#define PLATFORM_PRINT_TESTING_PG() printf("testing pg\n");
+#else
+#define PLATFORM_PRINT_TESTING_PG()
+#endif
+
+#ifdef TARGET_RODDICK
+#undef PLATFORM_INIT_TEMP_SENSORS
+#undef PLATFORM_READ_CPU_SENSORS
+#undef PLATFORM_SET_TEMP_READ_BLOCK
+
+#define PLATFORM_INIT_TEMP_SENSORS() Init_temperature_sensors();
+#define PLATFORM_READ_CPU_SENSORS(value) read_octeon_sensors(value);
+#define PLATFORM_SET_TEMP_READ_BLOCK(var, val) var = &val->param[0];
+#endif
+
+#undef PLATFORM_HANDLE_ICMP_TTL_EXPIRED
+#define PLATFORM_HANDLE_ICMP_TTL_EXPIRED 1 // handle TTL in NAT44 Application (for AVSM)
+
+#undef PLATFORM_NFV9_DISP_NODE_IDX
+#ifdef TARGET_RODDICK
+#define PLATFORM_NFV9_DISP_NODE_IDX "roddick_infra_l3_tx"
+#else /* EVB */
+#define PLATFORM_NFV9_DISP_NODE_IDX "cnat_rewrite_output"
+#endif
+
+#undef PLATFORM_CNAT_DB_DUMP_POLICY_PRINT
+#define PLATFORM_CNAT_DB_DUMP_POLICY_PRINT() \
+ printf("my instance:%d\n" \
+ "icmp timeout %d udp init timeout %d act timeout %d\n" \
+ "tcp init timeout %d act timeout %d mapping refresh %d\n" \
+ "port limit per user %d ftp alg %d lb debug %d\n" \
+ "icmp rate limit 0x%x config delete timer 0x%x\n" \
+ "global debug flag 0x%x\n" \
+ "icmp rate limit (pkts/per sec) %d\n" \
+ "dynamic port range start %d\n" \
+ "debug ivrf 0x%x flag 0x%x start_addr 0x%x end_addr 0x%x\n" \
+ "debug ovrf 0x%x flag 0x%x start_addr 0x%x end_addr 0x%x\n", \
+ my_instance_number, \
+ icmp_session_timeout, udp_init_session_timeout, udp_act_session_timeout, \
+ tcp_initial_setup_timeout, tcp_active_timeout, \
+ mapping_refresh_both_direction, cnat_main_db_max_ports_per_user, \
+ ftp_alg_enabled, lb_debug_enable, per_user_icmp_msg_limit, \
+ config_delete_timeout, \
+ global_debug_flag, \
+ cnat_main_db_icmp_rate_limit, \
+ cnat_static_port_range, \
+ debug_i_vrf, debug_i_flag, debug_i_addr_start, debug_i_addr_end, \
+ debug_o_vrf, debug_o_flag, debug_o_addr_start, debug_o_addr_end);
+
+
+#undef PLATFORM_PRINT_CTX_VALUES
+#ifdef TARGET_RODDICK
+#define PLATFORM_PRINT_CTX_VALUES(ctx) \
+ printf("\nAFTER: %s cur_hdr %p, uidb %d, pkt_type %d, cur_len %d\n", \
+ type_str, \
+ ctx->current_header, \
+ ctx->ru.tx.uidb_index, \
+ ctx->ru.tx.packet_type, \
+ ctx->current_length);
+#else /* EVB */
+#define PLATFORM_PRINT_CTX_VALUES(ctx) \
+ printf("\nAFTER: %s cur_hdr %p, cur_len %d\n", \
+ type_str,\
+ ctx->current_header, \
+ ctx->current_length);
+#endif
+
+#undef PLATFORM_ADD_VRF_MAP_HANDLE_PARTITION
+#define PLATFORM_ADD_VRF_MAP_HANDLE_PARTITION(uidb_index, partition_id)
+
+#undef PLATFORM_DEL_VRF_MAP_HANDLE_PARTITION
+#define PLATFORM_DEL_VRF_MAP_HANDLE_PARTITION(uidb_index, partition_id)
+
+#undef PLATFORM_ALLOC_NFV9_PKT_BUFFER
+#define PLATFORM_ALLOC_NFV9_PKT_BUFFER(ctx, to_lc_cpu)
+
+#undef PLATFORM_CNAT_NFV9_SHIM_HDR_OFFSET
+#ifdef TARGET_RODDICK
+// This corresponds to the length of the IMETRO SHIM Header for RODDICK
+#define PLATFORM_CNAT_NFV9_SHIM_HDR_OFFSET 8
+#else
+#define PLATFORM_CNAT_NFV9_SHIM_HDR_OFFSET 0
+#endif
+
+#undef PLATFORM_CNAT_NFV9_L2_ENCAPS_OFFSET
+#ifdef TARGET_RODDICK
+#define PLATFORM_CNAT_NFV9_L2_ENCAPS_OFFSET 0
+#else
+#define PLATFORM_CNAT_NFV9_L2_ENCAPS_OFFSET 16
+#endif
+
+#undef PLATFORM_MAX_SHOW_BUFFER_SIZE
+#undef PLATFORM_MAX_TRANSLATION_ENTRIES
+#undef PLATFORM_MAX_UTIL_ENTRIES
+
+#define PLATFORM_MAX_SHOW_BUFFER_SIZE 1700
+#define PLATFORM_MAX_TRANSLATION_ENTRIES (50)
+#define PLATFORM_NAT64_MAX_TRANSLATION_ENTRIES (30)
+#define PLATFORM_MAX_UTIL_ENTRIES (100)
+
+
+#undef PLATFORM_NAT64_MAX_SESSIONS
+#undef PLATFORM_NAT64_TIMEOUT_HASH_SIZE
+#define PLATFORM_NAT64_MAX_SESSIONS 20000000
+#define PLATFORM_NAT64_TIMEOUT_HASH_SIZE 24001 /* Ref: CSCtr36242 */
+
+#undef PLATFORM_CHECK_DSLITE_ENABLE_FLAG
+#define PLATFORM_CHECK_DSLITE_ENABLE_FLAG 1
+
+/* Fragment hold limit is Platform specific */
+/* For Roddick, it is 63 due to hardware limitation */
+#undef PLATFORM_IPV4_FRAG_FRAG_HOLD_LIMIT
+#define PLATFORM_IPV4_FRAG_FRAG_HOLD_LIMIT 63
+
+#undef PLATFORM_MAX_IPV4_CTX_ENTRIES
+#define PLATFORM_MAX_IPV4_CTX_ENTRIES 80
+
+#undef PLATFORM_DIRN_IN_TO_OUT
+#undef PLATFORM_DIRN_OUT_TO_IN
+#undef PLATFORM_SET_SVI_PARAMS_FIELD
+
+#define PLATFORM_DIRN_IN_TO_OUT
+#define PLATFORM_DIRN_OUT_TO_IN
+#define PLATFORM_SET_SVI_PARAMS_FIELD(var, value)
+
+#undef PLATFORM_GET_NFV9_L3_HDR_OFFSET
+#define PLATFORM_GET_NFV9_L3_HDR_OFFSET \
+ ((u8 *)ctx + ctx->data + CNAT_NFV9_IP_HDR_OFFSET);
+
+#undef PLATFORM_GET_NFV9_L4_HDR_OFFSET
+#define PLATFORM_GET_NFV9_L4_HDR_OFFSET \
+ ((u8 *) ctx + ctx->data + CNAT_NFV9_UDP_HDR_OFFSET);
+
+#undef PLATFORM_MEMSET_CNAT_LOG_PKT_DATA
+#define PLATFORM_MEMSET_CNAT_LOG_PKT_DATA
+
+/*
+ Index 0 -- SE_P2MP
+ Index 1 -- HA Destination 1
+ Index 2 -- HA Destination 2
+ Index 3 -- EXT_LOG_SRVR
+*/
+enum {
+ NODE_CGNCFG,
+ NODE_HA,
+ NODE_PD_CONFIG,
+ NODE_LOGGING,
+ NODE_TRACE_BACKUP,
+ NODE_MAX,
+};
+
+#endif /* __PLATFORM_COMMON_OVERRIDE_H__ */
diff --git a/vnet/vnet/vcgn/spp_ctx.h b/vnet/vnet/vcgn/spp_ctx.h
new file mode 100644
index 00000000000..2d3c95c8887
--- /dev/null
+++ b/vnet/vnet/vcgn/spp_ctx.h
@@ -0,0 +1,76 @@
+/*
+ *------------------------------------------------------------------
+ * spp_ctx.h - packet / context definitions
+ *
+ * Copyright (c) 2007-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __SPP_CTX_H__
+#define __SPP_CTX_H__
+
+/* Packet header / data */
+
+/* Any change to spp_ctx_t structure should be updated in vnet/buffer.h
+ * as well.
+ */
+typedef struct _spp_ctx {
+#ifdef TOBE_PORTED
+ /* Following fields are required to handle multibuffer */
+ u32 num_buffers; /* Number of buffers part of packet */
+ vlib_buffer_t *next_ctx_this_packet;
+
+ /* Following is used by non-UDP protocols */
+#define SPP_CTX_FEATURE_DATA_SIZE 16
+
+ u8 feature_data[SPP_CTX_FEATURE_DATA_SIZE];
+#endif
+
+ union { /* Roddick specific */
+ u32 roddick_info;
+ struct __tx_pkt_info { /* Used by PI to PI communication for TX */
+ u32 uidb_index:16; /* uidb_index to transmit */
+ u32 packet_type:2; /* 1-IPv4, 2-Ipv6, - 0,3 - Unused */
+ u32 ipv4_defrag:1; /* 0 - Normal, 1 - update first
+ * segment size
+ * (set by 6rd defrag node)
+ */
+
+ u32 dst_ip_port_idx:4;/* Index to dst_ip_port_table */
+ u32 from_node:4;
+ u32 calc_chksum:1;
+ u32 reserved:4;
+ } tx;
+ struct __rx_pkt_info { /* Used by PD / PI communication */
+ u32 uidb_index:16; /* uidb_index received in packet */
+ u32 packet_type:2; /* 1-IPv4, 2-Ipv6, - 0,3 - Unused */
+ u32 icmp_type:1; /* 0-ICMP query type, 1-ICMP error type */
+ u32 protocol_type:2; /* 1-TCP, 2-UDP, 3-ICMP, 0 - Unused */
+ u32 ipv4_defrag:1; /* 0 - Normal, 1 - update first
+ * segment size
+ * (set by 6rd defrag node)
+ */
+
+ u32 direction:1; /* 0-Outside, 1-Inside */
+ u32 frag:1; /*IP fragment-1, Otherwise-0*/
+ u32 option:1; /* 0-No IP option (v4) present, non-fragHdr
+ * option hdr present (v6)
+ */
+ u32 df_bit:1; /* IPv4 DF bit copied here */
+ u32 reserved1:6;
+ } rx;
+ } ru;
+} spp_ctx_t;
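+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original code):
+ * the rx bitfields classify an incoming packet using the encodings
+ * documented above; a dispatch check might read them like so.
+ */
+static inline int
+spp_ctx_is_inside_v4_tcp (spp_ctx_t *ctx)
+{
+    return (ctx->ru.rx.direction == 1 &&     /* 1 - Inside */
+            ctx->ru.rx.packet_type == 1 &&   /* 1 - IPv4 */
+            ctx->ru.rx.protocol_type == 1);  /* 1 - TCP */
+}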
+
+#endif
diff --git a/vnet/vnet/vcgn/spp_platform_trace_log.c b/vnet/vnet/vcgn/spp_platform_trace_log.c
new file mode 100644
index 00000000000..2c119f06ba2
--- /dev/null
+++ b/vnet/vnet/vcgn/spp_platform_trace_log.c
@@ -0,0 +1,991 @@
+/*
+ *------------------------------------------------------------------
+ * spp_platform_trace_log.c
+ *
+ * Copyright (c) 2008-2011, 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *---------------------------------------------------------------------------
+ */
+
+#include <vlib/vlib.h>
+#include <stdio.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+#include <vlib/main.h>
+
+#include "tcp_header_definitions.h"
+#include "platform_common.h"
+#include "spp_platform_trace_log.h"
+
+#define WORD_SIZE sizeof(u32)
+
+int temperature_read_blocked = 1;
+
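+/*
+ * Note (inferred from the initializers below and how the table is consumed,
+ * not stated in the original code): each entry appears to carry the error
+ * code, the number of arguments logged with it, a rate-limit interval in
+ * seconds (0 = log every occurrence), and the descriptive names of the
+ * arguments.
+ */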
+spp_cnat_logger_tbl_t spp_cnat_logger_table[] =
+{
+ { CNAT_ERROR_SUCCESS,
+ 3,
+ 0,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NO_CONFIG_ERROR,
+ 3,
+ 180,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NO_VRF_RUN_ERROR,
+ 3,
+ 180,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NO_POOL_FOR_ANY_ERROR,
+ 3,
+ 180,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NO_PORT_FOR_ANY_ERROR,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_BAD_INUSE_ANY_ERROR,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NOT_FOUND_ANY_ERROR,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_INV_PORT_FOR_DIRECT_ERROR,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_BAD_INUSE_DIRECT_ERROR,
+ 3,
+ 1,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NOT_FOUND_DIRECT_ERROR,
+ 3,
+ 1,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_OUT_OF_PORT_LIMIT_ERROR,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_MAIN_DB_CREATE_ERROR,
+ 0,
+ 30,
+ {""}
+ },
+ { CNAT_LOOKUP_ERROR,
+ 1,
+ 30,
+ {"Type"}
+ },
+ { CNAT_INDEX_MISMATCH_ERROR,
+ 2,
+ 30,
+ {"in2out_index",
+ "out2in_index"}
+ },
+ { CNAT_PACKET_DROP_ERROR,
+ 3,
+ 15,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_INV_UNUSED_USR_INDEX,
+ 1,
+ 10,
+ {"invalid/unused user index"}
+ },
+ { CNAT_INVALID_VRFMAP_INDEX,
+ 0,
+ 60,
+ {""}
+ },
+ { CNAT_USER_OUT_OF_PORTS,
+ 2,
+ 1800,
+ {"i-vrf",
+ "ipv4 addr"}
+ },
+ { CNAT_EXT_PORT_THRESH_EXCEEDED,
+ 2,
+ 180,
+ {"i-vrf",
+ "ipv4 address"}
+ },
+ { CNAT_EXT_PORT_THRESH_NORMAL,
+ 2,
+ 180,
+ {"vrf",
+ "ipv4 address"}
+ },
+ { CNAT_NO_EXT_PORT_AVAILABLE,
+ 0,
+ 1,
+ {"",}
+ },
+ { CNAT_SESSION_THRESH_EXCEEDED,
+ 2,
+ 1800,
+ {"vrf",
+ "ipv4 address"}
+ },
+ { CNAT_SESSION_THRESH_NORMAL,
+ 2,
+ 30, /* changed to 30 */
+ {"vrf",
+ "ipv4 address"}
+ },
+ { WQE_ALLOCATION_ERROR,
+ 0,
+ 180, /* changed to 180 */
+ {""}
+ },
+ { ERROR_PKT_DROPPED,
+ 2,
+ 60, /* changed to 60 */
+ {"spi-port",
+ "error-code"}
+ },
+ { SYSMGR_PD_KEY_CREATION_ERROR,
+ 0,
+ 30,
+ {""}
+ },
+ { SYSMGR_PD_SHMEM_ID_ERROR,
+ 0,
+ 1,
+ {""}
+ },
+ { SYSMGR_PD_SHMEM_ATTACH_ERROR,
+ 0,
+ 1,
+ {""}
+ },
+ { OCTEON_CKHUM_SKIPPED,
+ 2,
+ 60, /* changed to 60 */
+ {"version",
+ "protocol"}
+ },
+ { PK0_SEND_STATUS,
+ 1,
+ 15,
+ {"status"}
+ },
+ { CMD_BUF_ALLOC_ERR,
+ 0,
+ 60,
+ {""}
+ },
+ { SPP_CTX_ALLOC_FAILED,
+ 1,
+ 300, /* every 5 min */
+ {"node"}
+ },
+ { SPP_MAX_DISPATCH_REACHED,
+ 1,
+ 60,
+ {"node"}
+ },
+ { HA_SIGCHILD_RECV,
+ 3,
+ 1,
+ {"pid",
+ "uid",
+ "signal",}
+ },
+ { SIGACTION_ERR,
+ 0,
+ 1,
+ {""}
+ },
+ { HA_INVALID_SEQ_OR_CONFIG_OR_TYPE,
+ 2,
+ 10,
+ {"seq-id or config option",
+ "Type"}
+ },
+ { NODE_CREATION_ERROR,
+ 1,
+ 1,
+ {"node"}
+ },
+
+ { CNAT_CLI_INVALID_INPUT,
+ 4,
+ 0,
+ {"Error Type",
+ "Passed",
+ "Expected",
+ "Type"}
+ },
+ { CNAT_DUMMY_HANDLER_HIT,
+ 1,
+ 0,
+ {"Handler"}
+ },
+ { CNAT_CONFIG_ERROR,
+ 5,
+ 0,
+ {"Sub code",
+ "Param 1",
+ "Param 2",
+ "Param 3",
+ "Param 4"}
+ },
+ { CNAT_NFV9_ERROR,
+ 1,
+ 180, /* changed to 180 */
+ {"Sub code"}
+ },
+ { CNAT_CMVX_TWSI_READ_WRITE_FAIL,
+ 3,
+ 180,
+ {"Operation",
+ "Location",
+ "Data"}
+ },
+ { CNAT_TEMP_SENSOR_TIMEOUT,
+ 0,
+ 180,
+ {""}
+ },
+ { CNAT_TEMP_SENSOR_DATA_MISMATCH,
+ 2,
+ 180,
+ {"Actual",
+ "Expected"}
+ },
+ { CNAT_TEMP_SENSOR_CONFIG_FAILED,
+ 1,
+ 180,
+ {"Glik"}
+ },
+ { HA_APP_NOT_RESPONDING,
+ 2,
+ 180,
+ {"CPU",
+ "Core"}
+ },
+ { HA_DATA_PATH_TEST_FAILED,
+ 0,
+ 30,
+ {""}
+ },
+ { CNAT_WRONG_PORT_ALLOC_TYPE,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_NEW_PORT_ALLOC_ERROR,
+ 3,
+ 60,
+ {"i-vrf",
+ "ipv4 addr",
+ "port"}
+ },
+ { CNAT_INVALID_INDEX_TO_FREE_PORT,
+ 0,
+ 60,
+ {""}
+ },
+ { CNAT_DELETE_DB_ENTRY_NO_PORTMAP,
+ 0,
+ 60,
+ {""}
+ },
+ { CNAT_MAIN_DB_LIMIT_ERROR,
+ 0,
+ 180,
+ {""}
+ },
+ { CNAT_USER_DB_LIMIT_ERROR,
+ 0,
+ 180,
+ {""}
+ },
+ { CNAT_FRAG_DB_ERROR,
+ 1,
+ 180,
+ {"Type"}
+ },
+
+ { DROP_PKT_DUMP,
+ 0,
+ 20,
+ {""}
+ }
+};
+
+#define LOG_TABLE_MAX_ENTRIES \
+ (sizeof(spp_cnat_logger_table)/sizeof(spp_cnat_logger_table[0]))
+
+u32 error_code_timestamps[LOG_TABLE_MAX_ENTRIES];
+spp_timer_t sensor_timer;
+spp_trace_log_global_info_t spp_trace_log_global_info;
+spp_global_counters_t spp_global_counters;
+
+/*
+ * Logging information structures
+ */
+spp_trace_log_info_t spp_default_trace_log_info;
+spp_trace_log_info_t *spp_trace_log_info_pool;
+
+#ifdef TOBE_PORTED
+/*
+ * The following 2 functions are temporary hacks until
+ * we have RTC support from the PD nodes
+ */
+inline
+u32 spp_trace_log_get_sys_up_time_in_ms (void)
+{
+ spp_node_main_vector_t *nmv;
+ u32 sys_up_time;
+
+ nmv = spp_get_node_main_vectorized_inline();
+
+ sys_up_time = (u32) (nmv->ticks / nmv->ticks_per_ms);
+
+ return (sys_up_time);
+}
+
+inline
+u32 spp_trace_log_get_unix_time_in_seconds (void)
+{
+ spp_node_main_vector_t *nmv;
+ u32 unix_time;
+
+ nmv = spp_get_node_main_vectorized_inline();
+
+ unix_time = (u32) (nmv->ticks / nmv->ticks_per_second);
+
+ return (unix_time);
+}
+
+/*
+ * edt: * * spp_trace_log_send_queued_pkt
+ *
+ * Tries to send a logging pkt that has been queued earlier
+ * because it could not be sent due to downstream constipation
+ *
+ * Argument: spp_trace_log_info_t *trace_logging_info
+ * structure that contains the packet context
+ */
+inline
+void spp_trace_log_send_queued_pkt (spp_trace_log_info_t *trace_logging_info)
+{
+ spp_node_t *output_node;
+
+ output_node = spp_get_nodes() +
+ spp_trace_log_global_info.spp_trace_log_disp_node_index;
+
+ if (PREDICT_TRUE(output_node->sf.nused < SPP_MAXDISPATCH)) {
+ /*
+ * Move the logging context to output node
+ */
+ spp_dispatch_make_node_runnable(output_node);
+ output_node->sf.ctxs[output_node->sf.nused++] =
+ trace_logging_info->queued_logging_context;
+
+ /*
+ * Context has been queued, it will be freed after the pkt
+ * is sent. Clear this from the logging_context_info structure
+ */
+ trace_logging_info->queued_logging_context = NULL;
+
+ } else {
+ /*
+ * Can't do much, just return, may be we can send it later
+ */
+ spp_global_counters.spp_trace_log_downstream_constipation_count++;
+ }
+}
+
+/*
+ * edt: * * spp_trace_log_send_pkt
+ *
+ * Tries to send a logging pkt. If the packet cannot be sent
+ * because of rewrite_output node cannot process it, queue
+ * it temporarily and try to send it later.
+ *
+ * Argument: spp_trace_log_info_t *trace_logging_info
+ * structure that contains the packet context
+ */
+inline
+void spp_trace_log_send_pkt (spp_trace_log_info_t *trace_logging_info)
+{
+ spp_node_t *output_node;
+
+
+ output_node = spp_get_nodes() +
+ spp_trace_log_global_info.spp_trace_log_disp_node_index;
+
+ if (PREDICT_TRUE(output_node->sf.nused < SPP_MAXDISPATCH)) {
+ /*
+ * Move the logging context to output node
+ */
+ spp_dispatch_make_node_runnable(output_node);
+ output_node->sf.ctxs[output_node->sf.nused++] =
+ trace_logging_info->current_logging_context;
+
+ } else {
+ /*
+ * Queue the context into the logging_info structure,
+ * We will try to send it later. Currently, we will
+ * restrict to only one context queued.
+ */
+ spp_global_counters.spp_trace_log_downstream_constipation_count++;
+
+ /*
+ * Attach the current logging context which is full to the
+ * queued context list in trace_logging_info structure
+ */
+ trace_logging_info->queued_logging_context =
+ trace_logging_info->current_logging_context;
+
+ /*
+ * Whether the context is queued or not, set the current context
+ * to NULL, as the earlier context can no longer be used to send
+ * more logging records.
+ */
+ }
+
+ trace_logging_info->current_logging_context = NULL;
+}
+
+/*
+ * edt: * * spp_trace_log_send_pkt_always_success
+ *
+ * Tries to send a logging pkt. This cannot fail due to downstream
+ * constipation because we have already checked if the rewrite_output
+ * node can accept it.
+ *
+ * Argument: spp_trace_log_info_t *trace_logging_info
+ * structure that contains the packet context
+ *
+ * Argument: spp_node_t *output_node
+ * spp_node_t structure for rewrite_output node
+ */
+inline
+void spp_trace_log_send_pkt_always_success (
+ spp_trace_log_info_t *trace_logging_info,
+ spp_node_t *output_node)
+{
+ /*
+ * At this point we either have a current or queued logging context
+ */
+ if (PREDICT_TRUE(trace_logging_info->current_logging_context != NULL)) {
+
+ output_node->sf.ctxs[output_node->sf.nused++] =
+ trace_logging_info->current_logging_context;
+
+ trace_logging_info->current_logging_context = NULL;
+ } else {
+ /*
+ * For queued logging context
+ */
+ output_node->sf.ctxs[output_node->sf.nused++] =
+ trace_logging_info->queued_logging_context;
+
+ trace_logging_info->queued_logging_context = NULL;
+ }
+
+ /*
+ * Move the logging context to output node
+ */
+ spp_dispatch_make_node_runnable(output_node);
+
+}
+
+/*
+ * edt: * * spp_create_trace_log_context
+ *
+ * Tries to create a logging context with packet buffer
+ * to send a new logging packet
+ *
+ * Argument: spp_trace_log_info_t *trace_logging_info
+ * structure that contains the nfv9 logging info and will store
+ * the packet context as well.
+ */
+inline
+void spp_create_trace_log_context (
+ spp_trace_log_info_t *trace_logging_info)
+{
+ spp_ctx_t *ctx;
+
+ /*
+ * If queued_logging_context is non-NULL, we already have a logging
+ * packet queued to be sent. First try sending this before allocating
+ * a new context. We can have only one active packet context per
+ * trace_logging_info structure
+ */
+ if (PREDICT_FALSE(trace_logging_info->queued_logging_context != NULL)) {
+ spp_trace_log_send_queued_pkt(trace_logging_info);
+ /*
+ * If we cannot still send the queued pkt, just return
+ * Downstream Constipation count would have increased anyway
+ */
+ if (trace_logging_info->queued_logging_context != NULL) {
+ spp_global_counters.spp_trace_log_context_creation_deferred_count++;
+ return;
+ }
+ }
+
+
+ /*
+ * If no context can be allocated, return silently;
+ * the calling routine will handle updating the error counters
+ */
+ if (spp_ctx_alloc(&ctx, 1) < 1) {
+ spp_global_counters.spp_trace_log_context_creation_fail_count++;
+ return;
+ }
+
+ trace_logging_info->current_logging_context = ctx;
+ trace_logging_info->pkt_length = 0;
+
+ trace_logging_info->current_logging_context_timestamp =
+ spp_trace_log_get_sys_up_time_in_ms();
+
+ ctx->flags = SPP_CTX_END_OF_PACKET;
+ ctx->ru.tx.from_node = NODE_TRACE_BACKUP;
+ ctx->ru.tx.dst_ip_port_idx = EXT_TRACE_BACKUP_INDEX;
+ ctx->next_ctx_this_packet = (spp_ctx_t*) SPP_CTX_NO_NEXT_CTX;
+ ctx->current_header = &ctx->packet_data[SPP_TRACE_LOG_HDR_OFFSET];
+ ctx->current_length = 0;
+
+ trace_logging_info->log_record = 0;
+ trace_logging_info->total_record_count = 0;
+ trace_logging_info->next_data_ptr =
+ (u8 *) &ctx->packet_data[SPP_TRACE_LOG_HDR_OFFSET];
+
+}
+
+/*
+ * edt: * * spp_trace_log_add_record_create
+ *
+ * Tries to create an add-record in the NFV9 packet
+ *
+ * Argument: spp_trace_log_info_t *trace_logging_info
+ * structure that contains the nfv9 logging info and will store
+ * the packet context as well.
+ */
+inline
+void spp_trace_log_add_record_create (spp_trace_log_info_t *trace_logging_info)
+{
+
+ trace_logging_info->log_header =
+ (spp_trace_log_hdr_t *) (trace_logging_info->next_data_ptr);
+
+ /*
+ * Initialize the number of traces recorded
+ */
+ trace_logging_info->log_header->num_traces =
+ spp_host_to_net_byte_order_32(0);
+
+
+ trace_logging_info->log_record =
+ (spp_trace_log_t *) (trace_logging_info->log_header + 1);
+
+ /*
+ * Update the length of the total pkt
+ */
+ trace_logging_info->pkt_length +=
+ SPP_LOG_TRACE_HEADER_LENGTH;
+
+ /*
+ * Set the data pointer beyond the trace header field
+ */
+ trace_logging_info->next_data_ptr =
+ (u8 *) (trace_logging_info->log_header + 1);
+
+}
+
+/*
+ * edt: * * spp_trace_logger
+ *
+ * Tries to log spp/cnat event/errors
+ *
+ * Argument: u16 error_code
+ * Error code passed
+ *
+ * Argument: optional arguments
+ */
+void spp_trace_logger (u16 error_code, u16 num_args, u32 *arg)
+{
+ spp_trace_log_info_t *trace_logging_info = 0;
+ u8 i;
+
+ trace_logging_info =
+ spp_trace_log_info_pool +
+ spp_trace_log_global_info.spp_log_pool_index[SPP_LOG_LTRACE];
+
+ if (PREDICT_FALSE(trace_logging_info->current_logging_context == NULL)) {
+ spp_create_trace_log_context(trace_logging_info);
+
+ /*
+ * If still NULL, context creation failed (counters were updated there); return
+ */
+ if (PREDICT_FALSE(trace_logging_info->current_logging_context == NULL)) {
+ return;
+ }
+ }
+
+ if (PREDICT_FALSE(trace_logging_info->log_record == NULL)) {
+ spp_trace_log_add_record_create(trace_logging_info);
+ }
+
+ /*
+ * We should definitely have add_record now, no need to sanitize
+ */
+ trace_logging_info->log_record->error_code =
+ spp_host_to_net_byte_order_16(error_code);
+ trace_logging_info->log_record->num_args =
+ spp_host_to_net_byte_order_16(num_args);
+
+ for (i = 0; i < num_args; i++) {
+ trace_logging_info->log_record->arg[i] =
+ spp_host_to_net_byte_order_32(*(arg + i));
+ }
+
+ trace_logging_info->pkt_length += SPP_TRACE_LOG_RECORD_LENGTH + WORD_SIZE*num_args;
+ trace_logging_info->current_logging_context->current_length =
+ trace_logging_info->pkt_length;
+ trace_logging_info->total_record_count += 1;
+
+ trace_logging_info->next_data_ptr =
+ (u8 *) (trace_logging_info->next_data_ptr + WORD_SIZE + WORD_SIZE*num_args);
+
+ trace_logging_info->log_record =
+ (spp_trace_log_t *) (trace_logging_info->next_data_ptr);
+
+ /*
+ * Initialize the number of traces recorded
+ */
+ trace_logging_info->log_header->num_traces =
+ spp_host_to_net_byte_order_32(trace_logging_info->total_record_count);
+
+
+
+ /*
+ * If we have exceeded the packet length, let us send the
+ * packet now. There is a buffer of additional bytes beyond
+ * max_pkt_length to ensure that the last add/delete record
+ * can be stored safely.
+ */
+ if (trace_logging_info->pkt_length >
+ trace_logging_info->max_length_minus_max_record_size) {
+ spp_trace_log_send_pkt(trace_logging_info);
+ }
+}
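+
+/*
+ * Illustrative note (a sketch, not part of the original code): assuming
+ * WORD_SIZE is 4, each record written above occupies
+ * SPP_TRACE_LOG_RECORD_LENGTH + WORD_SIZE*num_args bytes, laid out as
+ *
+ *   [error_code:2][num_args:2][arg0:4]...[arg(num_args-1):4]
+ *
+ * so a record carrying 3 args advances next_data_ptr by 4 + 12 = 16 bytes.
+ */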
+
+
+/*
+ * edt: * * spp_trace_log_timer_handler
+ *
+ * Timer handler for sending any pending NFV9 record
+ *
+ * Argument: spp_timer_t * timer_p
+ * Timer handler structure
+ */
+inline
+void spp_trace_log_timer_handler (spp_timer_t * timer_p)
+{
+ spp_node_t *output_node;
+ spp_trace_log_info_t *trace_logging_info = 0;
+ u32 current_timestamp = spp_trace_log_get_sys_up_time_in_ms();
+ i16 sf_nused;
+
+ output_node = spp_get_nodes() +
+ spp_trace_log_global_info.spp_trace_log_disp_node_index;
+
+ sf_nused = output_node->sf.nused;
+
+ pool_foreach (trace_logging_info, spp_trace_log_info_pool, ({
+ /*
+ * Check if no more logging contexts can be queued
+ */
+ if (PREDICT_FALSE(sf_nused >= SPP_MAXDISPATCH)) {
+ break;
+ }
+
+ /*
+ * If there is a current logging context whose timestamp
+ * indicates it has been pending too long, send it out.
+ * Also send out any queued context.
+ */
+ if (trace_logging_info->queued_logging_context ||
+ (trace_logging_info->current_logging_context &&
+ (current_timestamp -
+ trace_logging_info->current_logging_context_timestamp)
+ > 1000)) {
+ spp_trace_log_send_pkt_always_success(trace_logging_info,
+ output_node);
+ sf_nused++;
+ }
+ }));
+
+ timer_p->expires =
+ spp_timer_in_n_ms_inline(1000); /* every 1 sec */
+ spp_timer_start(timer_p);
+
+}
+inline
+void spp_sensor_timer_handler (spp_timer_t * timer_p)
+{
+#ifdef TARGET_RODDICK
+ if (!temperature_read_blocked) {
+ Init_temperature_sensors();
+ read_octeon_sensors(TEMPERATURE_SENSOR_QUIET_MODE);
+ }
+
+ timer_p->expires =
+ spp_timer_in_n_ms_inline(60000); /* every 60 sec */
+ spp_timer_start(timer_p);
+
+#endif
+}
+void init_trace_log_buf_pool (void)
+{
+ spp_trace_log_info_t *my_spp_log_info;
+ u8 found;
+ spp_log_type_t log_type;
+
+ /*
+ * Init SPP logging info as needed, this will be done only once
+ */
+ spp_trace_log_init();
+
+ for (log_type = SPP_LOG_LTRACE; log_type < SPP_LOG_MAX; log_type++ ) {
+ /* Reset per log type, so a match for one type cannot mask another */
+ found = 0;
+
+ /* Do we already have a map for this log type? */
+ pool_foreach (my_spp_log_info, spp_trace_log_info_pool, ({
+ if (my_spp_log_info->log_type == log_type) {
+ found = 1;
+ break;
+ }
+ }));
+
+ /*
+ * Entry not present
+ */
+ if (!found) {
+ pool_get(spp_trace_log_info_pool, my_spp_log_info);
+ memset(my_spp_log_info, 0, sizeof(*my_spp_log_info));
+
+ /*
+ * Make the current and queued logging contexts EMPTY (NULL).
+ * When the first logging happens, these get set correctly
+ */
+ my_spp_log_info->current_logging_context = NULL;
+ my_spp_log_info->queued_logging_context = NULL;
+
+ my_spp_log_info->log_type = log_type;
+ my_spp_log_info->max_length_minus_max_record_size =
+ SPP_TRACE_LOG_MAX_PKT_LENGTH;
+
+ spp_trace_log_global_info.spp_log_pool_index[log_type] =
+ my_spp_log_info - spp_trace_log_info_pool;
+ }
+
+ }
+
+ return;
+}
+
+
+/*
+ * One-time function; must be called at init time
+ */
+void spp_trace_log_init (void)
+{
+ if (!spp_trace_log_global_info.spp_trace_log_init_done) {
+
+#ifdef TARGET_RODDICK
+ spp_trace_log_global_info.spp_trace_log_disp_node_index =
+ spp_lookup_node_index("roddick_infra_l3_tx");
+#elif defined(TARGET_BOOSTER)
+ spp_trace_log_global_info.spp_trace_log_disp_node_index =
+ spp_lookup_node_index("booster_infra_l3_tx");
+#endif
+ ASSERT(spp_trace_log_global_info.spp_trace_log_disp_node_index != (u16)~0);
+
+ spp_trace_log_global_info.log_timer.cb_index =
+ spp_timer_register_callback(spp_trace_log_timer_handler);
+ spp_trace_log_global_info.log_timer.expires =
+ spp_timer_in_n_ms_inline(1000); /* every 1 sec */
+ spp_timer_start(&spp_trace_log_global_info.log_timer);
+
+ if (!my_core_id) {
+ sensor_timer.cb_index =
+ spp_timer_register_callback(spp_sensor_timer_handler);
+ sensor_timer.expires =
+ spp_timer_in_n_ms_inline(60000); /* every 60 sec */
+ spp_timer_start(&sensor_timer);
+ }
+
+ spp_trace_log_global_info.spp_trace_log_init_done = 1;
+
+ /*
+ * Set MSC ip_addr, port values
+ */
+#ifdef TARGET_RODDICK
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].ipv4_address =
+ vpp_boot_params.msc_ip_address;
+ switch(vpp_boot_params.octeon_number) {
+ case 0:
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].port = 0x15BF;
+ break;
+ case 1:
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].port = 0x15BF;
+ break;
+ case 2:
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].port = 0x15BF;
+ break;
+ case 3:
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].port = 0x15BF;
+ break;
+ }
+#else
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].ipv4_address = 0x01020304;
+ dst_ipv4_port_table[EXT_TRACE_BACKUP_INDEX].port = 0x15BF;
+#endif
+
+ }
+}
+
+void spp_printf (u16 error_code, u16 num_args, u32 *arg)
+{
+ u32 current_timestamp;
+ spp_node_main_vector_t *nmv;
+
+ if (PREDICT_FALSE(error_code >= LOG_TABLE_MAX_ENTRIES))
+ {
+ /* printf("Error code invalid %d, %d, %d, %d\n",
+ error_code, LOG_TABLE_MAX_ENTRIES,
+ sizeof(spp_cnat_logger_table), sizeof(spp_cnat_logger_table[0]));
+ */
+ return; /* Should not happen */
+ }
+
+ nmv = spp_get_node_main_vectorized_inline();
+ current_timestamp = nmv->ticks / nmv->ticks_per_second;
+
+ /* Check if any further hashing is required */
+
+ if (PREDICT_FALSE(error_code == DUMP_PKT_IDX)) {
+#if defined(TARGET_RODDICK) || defined(TARGET_BOOSTER)
+ spp_trace_logger(error_code, num_args, arg);
+#else
+ u8 j ;
+
+ printf("PKT DUMP :: ");
+ for (j = 0 ; j < num_args; j++) {
+ printf("0x%x ", arg[j]);
+ if (j == (num_args - 1)) {
+ printf("\n");
+ }
+ }
+#endif
+ } else if (PREDICT_TRUE((current_timestamp - error_code_timestamps[error_code]) >=
+ spp_cnat_logger_table[error_code].rate_limit_time)) {
+ /* update timestamp */
+ error_code_timestamps[error_code] = current_timestamp;
+
+#if defined(TARGET_RODDICK) || defined(TARGET_BOOSTER)
+ spp_trace_logger(error_code, num_args, arg);
+#else
+ u8 j ;
+
+ for (j = 0 ; j < num_args; j++) {
+ printf("%s: %d ", spp_cnat_logger_table[error_code].param_name[j], arg[j]);
+ if (j == (num_args - 1)) {
+ printf("\n");
+ }
+ }
+#endif
+ }
+}
+
+#else /* TOBE_PORTED */
+void spp_trace_logger(u16 error_code, u16 num_args, u32 *arg)
+{
+ /* To be filled */
+}
+
+void spp_trace_log_init(void)
+{
+ /* To be filled */
+}
+
+void init_trace_log_buf_pool(void)
+{
+ /* To be filled */
+}
+
+void spp_printf(u16 error_code, u16 num_args, u32 *arg)
+{
+ /* To be filled */
+}
+
+inline
+u32 spp_trace_log_get_unix_time_in_seconds (void)
+{
+ vlib_main_t *vlib_main;
+
+ vlib_main = vlib_get_main();
+ return(vlib_time_now((vlib_main_t *) vlib_main));
+}
+
+#endif /* TOBE_PORTED */
+
diff --git a/vnet/vnet/vcgn/spp_platform_trace_log.h b/vnet/vnet/vcgn/spp_platform_trace_log.h
new file mode 100644
index 00000000000..cffe5a09956
--- /dev/null
+++ b/vnet/vnet/vcgn/spp_platform_trace_log.h
@@ -0,0 +1,357 @@
+/*
+ *------------------------------------------------------------------
+ * spp_platform_trace_log.h
+ *
+ * Copyright (c) 2009-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __SPP_PLATFORM_TRACE_LOG_H__
+#define __SPP_PLATFORM_TRACE_LOG_H__
+
+#include <stdio.h>
+#include <vppinfra/vec.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/hash.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/clib.h>
+
+#include "spp_ctx.h"
+#include "spp_timers.h"
+
+
+typedef enum {
+ SPP_LOG_LTRACE,
+ SPP_LOG_MAX
+} spp_log_type_t;
+
+typedef struct {
+ u32 num_traces;
+} spp_trace_log_hdr_t;
+
+typedef struct {
+ u16 error_code;
+ u16 num_args;
+ u32 arg[0];
+} spp_trace_log_t;
+
+#define DUMP_PKT_IDX 61
+#define OCTEON_SENSOR_READ 62
+
+typedef enum {
+ CNAT_ERROR_SUCCESS,
+ CNAT_NO_CONFIG_ERROR,
+ CNAT_NO_VRF_RUN_ERROR,
+ CNAT_NO_POOL_FOR_ANY_ERROR,
+ CNAT_NO_PORT_FOR_ANY_ERROR,
+ CNAT_BAD_INUSE_ANY_ERROR,
+ CNAT_NOT_FOUND_ANY_ERROR,
+ CNAT_INV_PORT_FOR_DIRECT_ERROR,
+ CNAT_BAD_INUSE_DIRECT_ERROR,
+ CNAT_NOT_FOUND_DIRECT_ERROR,
+ CNAT_OUT_OF_PORT_LIMIT_ERROR,
+ CNAT_MAIN_DB_CREATE_ERROR,
+ CNAT_LOOKUP_ERROR,
+ CNAT_INDEX_MISMATCH_ERROR,
+ CNAT_PACKET_DROP_ERROR,
+ CNAT_INV_UNUSED_USR_INDEX,
+ CNAT_INVALID_VRFMAP_INDEX,
+ CNAT_USER_OUT_OF_PORTS,
+ CNAT_EXT_PORT_THRESH_EXCEEDED,
+ CNAT_EXT_PORT_THRESH_NORMAL,
+ CNAT_NO_EXT_PORT_AVAILABLE,
+ CNAT_SESSION_THRESH_EXCEEDED,
+ CNAT_SESSION_THRESH_NORMAL,
+ WQE_ALLOCATION_ERROR,
+ ERROR_PKT_DROPPED,
+ SYSMGR_PD_KEY_CREATION_ERROR,
+ SYSMGR_PD_SHMEM_ID_ERROR,
+ SYSMGR_PD_SHMEM_ATTACH_ERROR,
+ OCTEON_CKHUM_SKIPPED,
+ PK0_SEND_STATUS,
+ CMD_BUF_ALLOC_ERR,
+ SPP_CTX_ALLOC_FAILED,
+ SPP_MAX_DISPATCH_REACHED,
+ HA_SIGCHILD_RECV,
+ SIGACTION_ERR,
+ HA_INVALID_SEQ_OR_CONFIG_OR_TYPE,
+ NODE_CREATION_ERROR,
+ CNAT_CLI_INVALID_INPUT, /* new adds as part of CSCto04510, see sub codes below */
+ CNAT_DUMMY_HANDLER_HIT, /* Has sub codes , see spp_dummy_handler_sub_cdes_t */
+ CNAT_CONFIG_ERROR, /* has sub codes, see spp_config_error_sub_codes_t below */
+ CNAT_NFV9_ERROR, /* has sub codes, see spp_nfv9_error_sub_codes_t below */
+ CNAT_CMVX_TWSI_READ_WRITE_FAIL, /* has sub codes, see spp_cmvx_error_sub_codes_t */
+ CNAT_TEMP_SENSOR_TIMEOUT,
+ CNAT_TEMP_SENSOR_DATA_MISMATCH,
+ CNAT_TEMP_SENSOR_CONFIG_FAILED,
+ HA_APP_NOT_RESPONDING,
+ HA_DATA_PATH_TEST_FAILED,
+ CNAT_WRONG_PORT_ALLOC_TYPE,
+ CNAT_NEW_PORT_ALLOC_ERROR,
+ CNAT_INVALID_INDEX_TO_FREE_PORT,
+ CNAT_DELETE_DB_ENTRY_NO_PORTMAP,
+ CNAT_MAIN_DB_LIMIT_ERROR,
+ CNAT_USER_DB_LIMIT_ERROR,
+ CNAT_FRAG_DB_ERROR, /* see spp_frag_db_error_sub_codes_t below */
+
+ DROP_PKT_DUMP,
+ CNAT_NAT64_SYSTEM_LIMIT_ERROR,
+ CNAT_ERROR_MAX
+} spp_error_codes_t;
+
+typedef enum {
+
+ TCP_MSS_INVALID_IVRF = 10, /* 1 param - vrf id */
+ NFV9_LOG_INVALID_IP_OR_PORT = 20, /* 2 params - nfv9 server ip and port */
+ NFV9_LOG_INVALID_PARAMS_OTHERS, /* 3 params, ref rate, time out, path mtu */
+ NFV9_LOG_PATH_MTU_TOO_SMALL, /* 1 param, path mtu passed */
+ NFV9_LOG_CANNOT_ADD_VRF_NOT_FOUND, /* 1 param, in vrf id */
+
+ VRF_MAP_ADDR_POOL_START_ADDR_GT_END_ADDR = 30, /* 2 params, start and end addr */
+ VRF_MAP_ADDR_POOL_ADDR_POOL_TOO_LARGE, /* 2 params, start and end addr */
+ VRF_MAP_ADDR_POOL_INVALID_IN_OR_OUT_VRF, /* 2 params, in vrf and out vrf */
+ VRF_MAP_ADDR_POOL_TOO_LARGE_FOR_CORE, /* 2 params, pool size, core instance */
+ VRF_MAP_DEL_POOL_START_ADDR_GT_END_ADDR, /* 2 params, start and end addr */
+ VRF_MAP_DEL_POOL_ADDR_POOL_NOT_FOUND, /* 2 params, start and end addr */
+ VRF_MAP_DEL_POOL_VRF_MAP_EMPTY, /* 2 params, start and end addr */
+
+ ADD_SVI_ADDR_INVALID_VRF = 40, /* 2 params, vrf passed and ipv4 addr */
+ ADD_SVI_INDEX_INVALID_VRF, /* 2 params, vrf, uidb_index */
+
+ MAPPED_STAT_PORT_INVALID_OUTPUT_PARAMS = 50,
+ /* 3 params, out vrf, out ip, out port */
+ MAPPED_STAT_PORT_UDP_PORT_POLARITY_MISMATCH, /* 2 params, in port and out port */
+ MAPPED_STAT_PORT_IN_VRF_MAP_EMPTY, /* 1 param, in vrf id passed */
+ MAPPED_STAT_PORT_VRF_MAP_NOT_IN_S_RUN, /* 1 param, vrf map status */
+ MAPPED_STAT_PORT_INVALID_OUT_VRF_ID, /* 1 param, out vrf id passed */
+ MAPPED_STAT_PORT_FAILED_TO_ADD_STAT_PORT, /* 4 params, in vrf, in ip, in port, error code */
+
+ STAT_PORT_INVALID_IN_PARAMS = 60, /* 4 params, in vrf, in ip, in port, proto */
+ STAT_PORT_FAILED_TO_ADD_STAT_PORT, /* 4 params, in vrf, in ip, in port, error code */
+ STAT_PORT_CONFIG_IN_USE, /* 4 params, in vrf, in ip, in port, proto */
+
+ DEL_STAT_PORT_IN_VRF_MAP_EMPTY = 70, /* 1 param, in vrf id passed */
+ DEL_STAT_PORT_INVALID_IN_PARAMS, /* 4 params, in vrf, in ip, in port, proto */
+ DEL_STAT_PORT_CANNOT_DELETE_NO_ENTRY, /* 4 params, in vrf, in ip, in port, proto */
+ DEL_STAT_PORT_CANNOT_DELETE_NOT_STATIC_PORT, /* 4 params, in vrf, in ip, in port, proto*/
+
+ XLAT_SVI_CFG_INVALID_INDEX = 80, /* 1 param - uidb_index */
+ XLAT_WRONG_V6_PREFIX_MASK, /* 1 param - v6 prefix mask */
+ XLAT_INVALID_XLAT_ID_ERROR, /* 1 param - id */
+
+ V6RD_INVALID_6RD_ID_ERROR = 90, /*1 param - id */
+ MAPE_INVALID_MAPE_ID_ERROR = 100 /* 1 param - id */
+} spp_config_error_sub_codes_t;
+
+typedef enum {
+ CONFIG_DUMMY,
+ CONFIG_DUMMY_MAX,
+ SHOW_DUMMY,
+ SHOW_DUMMY_MAX,
+ DEBUG_DUMMY,
+ DEBUG_DUMMY_MAX
+} spp_dummy_handler_sub_cdes_t;
+
+typedef enum {
+ CMVX_READ,
+ CMVX_WRITE
+} spp_cmvx_error_sub_codes_t;
+
+typedef enum {
+ FRAG_DB_INVALID_BUCKET,
+ FRAG_DB_NO_ENTRY
+} spp_frag_db_error_sub_codes_t;
+
+typedef enum {
+ CLI_INVALID_PAYLOAD_SIZE,
+ CLI_INVALID_MSG_ID
+} spp_cli_error_sub_codes_t;
+
+typedef enum {
+ NFV9_DOWNSTREAM_CONGESTION,
+ NFV9_FAILED_TO_CREATE_CONTEXT
+} spp_nfv9_error_sub_codes_t;
+
+typedef struct spp_cnat_logger_tbl_t_ {
+ u16 error_code; // Error code for this log entry
+ u16 num_args;
+ u16 rate_limit_time; // If we need to rate_limit logging
+ u8 param_name[7][32];// Parameter name for debug purposes
+} spp_cnat_logger_tbl_t;
+
+extern spp_cnat_logger_tbl_t spp_cnat_logger_table[];
+
+/*
+ * This corresponds to the length of the IMETRO SHIM Header for RODDICK.
+ * For non-Roddick cases, introduce an Ethernet header as well.
+ */
+#if defined(RODDICK)
+#define SPP_TRACE_LOG_SHIM_HDR_OFFSET 8
+#define SPP_TRACE_LOG_ENCAPS_OFFSET 0
+#else
+#define SPP_TRACE_LOG_SHIM_HDR_OFFSET 0
+#define SPP_TRACE_LOG_ENCAPS_OFFSET 16
+#endif
+
+#define SPP_LOG_TRACE_HEADER_LENGTH \
+ (sizeof(spp_trace_log_hdr_t))
+
+
+#define SPP_TRACE_LOG_IP_HDR_OFFSET \
+ (SPP_TRACE_LOG_ENCAPS_OFFSET + \
+ SPP_TRACE_LOG_SHIM_HDR_OFFSET)
+
+
+#define SPP_TRACE_LOG_UDP_HDR_OFFSET \
+ (SPP_TRACE_LOG_IP_HDR_OFFSET + sizeof(ipv4_header))
+
+#define SPP_TRACE_LOG_HDR_OFFSET \
+ (SPP_TRACE_LOG_UDP_HDR_OFFSET + sizeof(udp_hdr_type_t))
+
+#define SPP_TRACE_LOG_RECORD_LENGTH 4
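+
+/*
+ * Worked example (a sketch; assumes unpadded header structs): on
+ * non-RODDICK builds the offsets above resolve to
+ *
+ *   SPP_TRACE_LOG_IP_HDR_OFFSET  = 16 + 0 = 16
+ *   SPP_TRACE_LOG_UDP_HDR_OFFSET = 16 + sizeof(ipv4_header) = 36
+ *   SPP_TRACE_LOG_HDR_OFFSET     = 36 + sizeof(udp_hdr_type_t) = 44
+ *
+ * i.e. trace log records begin 44 bytes into ctx->packet_data.
+ */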
+
+/*
+ * Let us cap the maximum length of the log data at 800
+ */
+#define SPP_TRACE_LOG_MAX_PKT_LENGTH 800
+
+/* Structures and defines to store log info for MSC */
+#define SPP_TRACE_LOG_INVALID_LOGGING_INDEX 0xffffffff
+
+/*
+ * This structure stores the Logging information on per LOG TYPE
+ * basis. This structure is allocated from a pool and index
+ * to this structure based on log type
+ */
+typedef struct {
+ /*
+ * This field determines the maximum size of the Netflow V9 information
+ * that can be stored in a logging packet
+ */
+ u16 max_length_minus_max_record_size;
+
+ u32 sequence_num; /* Sequence number of the logging packet */
+ u32 last_pkt_sent_count;
+ u16 pkt_length; /* Length of the current NFv9 information */
+ u16 log_record_length; /* Length of add record */
+ u16 total_record_length; /* Total length of trace records */
+ u16 total_record_count; /* Number of trace records */
+ spp_log_type_t log_type;
+ /*
+ * current logging context
+ */
+ spp_ctx_t *current_logging_context;
+
+ /*
+ * Timestamp in UNIX seconds corresponding to when the current
+ * logging packet was created
+ */
+ u32 current_logging_context_timestamp;
+
+ /*
+ * Queued logging context waiting to be sent to the l3 infra node
+ */
+ spp_ctx_t *queued_logging_context;
+
+ /*
+ * Headers corresponding to various records in this
+ * current nfv9 logging context
+ */
+ spp_trace_log_t *log_record;
+ spp_trace_log_hdr_t *log_header;
+ u8 *next_data_ptr;
+
+} spp_trace_log_info_t;
+
+typedef struct {
+ /*
+ * spp_ctx_alloc() call failed
+ */
+ u64 spp_trace_log_context_creation_fail_count;
+
+ /*
+ * Cannot send the existing logging pkt, so cannot create
+ * any additional packets for logging purposes
+ */
+ u64 spp_trace_log_context_creation_deferred_count;
+
+ /*
+ * Cannot send the existing logging pkt due to cnat_rewrite_output
+ * superframe being full.
+ */
+ u64 spp_trace_log_downstream_constipation_count;
+} spp_global_counters_t;
+
+
+/*
+ * Global structure for SPP LOGS
+ */
+typedef struct {
+ /* A timer structure to periodically send log packets
+ * that have been waiting to be full for a long time. This will
+ * ensure event/error logs don't get delayed too much before they
+ * are sent to the MSC.
+ */
+ spp_timer_t log_timer;
+
+ /*
+ * Node index corresponding to the infra L3 output node
+ * to which the nfv9 logging node will send the packet
+ */
+ u16 spp_trace_log_disp_node_index;
+
+ /*
+ * Whether we have initialized the NFv9 information
+ */
+ u8 spp_trace_log_init_done;
+
+ /*
+ * pool index in global pool based on log type
+ */
+ u32 spp_log_pool_index[SPP_LOG_MAX];
+
+} spp_trace_log_global_info_t;
+
+
+extern spp_timer_t sensor_timer;
+extern spp_trace_log_info_t spp_default_trace_log_info;
+extern spp_trace_log_info_t *spp_trace_log_info_pool;
+
+extern spp_trace_log_global_info_t spp_trace_log_global_info;
+
+void spp_trace_logger(u16 error_code, u16 num_args, u32 *arg);
+void spp_trace_log_init(void);
+void init_trace_log_buf_pool(void);
+void spp_printf(u16 error_code, u16 num_args, u32 *arg);
+
+/*
+ * The following 2 functions are temporary hacks until
+ * we have RTC support from the PD nodes
+ */
+inline
+u32 spp_trace_log_get_sys_up_time_in_ms (void);
+
+inline
+u32 spp_trace_log_get_unix_time_in_seconds (void);
+
+enum {
+ TEMPERATURE_SENSOR_TEST_MODE,
+ TEMPERATURE_SENSOR_QUIET_MODE,
+};
+
+extern int temperature_read_blocked;
+
+void read_octeon_sensors(u8 mode);
+void Init_temperature_sensors(void);
+#endif /* __SPP_PLATFORM_TRACE_LOG_H__ */
diff --git a/vnet/vnet/vcgn/spp_timers.h b/vnet/vnet/vcgn/spp_timers.h
new file mode 100644
index 00000000000..afb0147b2ed
--- /dev/null
+++ b/vnet/vnet/vcgn/spp_timers.h
@@ -0,0 +1,139 @@
+/*
+ *------------------------------------------------------------------
+ * spp_timers.h
+ *
+ * Copyright (c) 2008-2009 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+#ifndef __SPP_TIMERS_H__
+#define __SPP_TIMERS_H__
+
+
+typedef struct d_list_el_ {
+ struct d_list_el_ *next;
+ struct d_list_el_ *prev;
+} d_list_el_t;
+
+/*
+ * d_list_init
+ */
+
+static inline void d_list_init (d_list_el_t *headp)
+{
+ headp->prev = headp->next = headp;
+}
+
+/*
+ * d_list_add_head - add element at head of list
+ */
+
+static inline void d_list_add_head (d_list_el_t *headp,
+ d_list_el_t *elp)
+{
+ ASSERT(elp->prev == elp); /* multiple enqueue, BAD! */
+ ASSERT(elp->next == elp);
+
+ elp->next = headp->next;
+ headp->next = elp;
+ elp->prev = elp->next->prev;
+ elp->next->prev = elp;
+}
+
+/*
+ * d_list_add_tail - add element at tail of list
+ */
+static inline void d_list_add_tail (d_list_el_t *headp,
+ d_list_el_t *elp)
+{
+ ASSERT(elp->prev == elp); /* multiple enqueue, BAD! */
+ ASSERT(elp->next == elp);
+
+ headp = headp->prev;
+
+ elp->next = headp->next;
+ headp->next = elp;
+ elp->prev = elp->next->prev;
+ elp->next->prev = elp;
+}
+
+/*
+ * d_list_rem_head - removes first element from list
+ */
+static inline d_list_el_t *d_list_rem_head (d_list_el_t *headp)
+{
+ d_list_el_t *elp;
+
+ elp = headp->next;
+ if (elp == headp)
+ return (NULL);
+ headp->next = elp->next;
+ elp->next->prev = elp->prev;
+
+ elp->next = elp->prev = elp;
+ return (elp);
+}
+
+/*
+ * d_list_rem_elem - removes specific element from list.
+ */
+static inline void d_list_rem_elem (d_list_el_t *elp)
+{
+ d_list_el_t *headp;
+
+ headp = elp->prev;
+
+ headp->next = elp->next;
+ elp->next->prev = elp->prev;
+ elp->next = elp->prev = elp;
+}
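+
+/*
+ * Minimal usage sketch (illustrative only):
+ *
+ *   d_list_el_t head, a, b;
+ *   d_list_init(&head);
+ *   d_list_init(&a);  // elements must be self-linked before enqueue
+ *   d_list_init(&b);
+ *   d_list_add_tail(&head, &a);
+ *   d_list_add_head(&head, &b);
+ *   d_list_el_t *first = d_list_rem_head(&head);  // returns &b
+ */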
+
+#define TIMER_BKTS_PER_WHEEL 128 /* power of 2, please */
+#define TIMER_NWHEELS 4
+
+typedef struct {
+ i32 curindex; /* current index for this wheel */
+ d_list_el_t *bkts; /* vector of bucket listheads */
+} spp_timer_wheel_t;
+
+
+typedef struct {
+ u64 next_run_ticks; /* Next time we expire timers */
+ spp_timer_wheel_t **wheels; /* pointers to wheels */
+} spp_timer_axle_t;
+
+
+typedef struct {
+ d_list_el_t el;
+ u16 cb_index;
+ u16 flags;
+ u64 expires;
+} spp_timer_t;
+
+#define SPP_TIMER_RUNNING 0x0001
+
+
+/*
+ * prototypes
+ */
+void spp_timer_set_ticks_per_ms(u64);
+void spp_timer_axle_init (spp_timer_axle_t *ta);
+void spp_timer_expire(spp_timer_axle_t *ta, u64 now);
+void spp_timer_final_init(void);
+
+void spp_timer_start(spp_timer_t *tp);
+void spp_timer_start_axle(spp_timer_axle_t *ta, spp_timer_t *tp);
+void spp_timer_stop(spp_timer_t *tp);
+u16 spp_timer_register_callback (void (*fp)(spp_timer_t *));
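+
+/*
+ * Typical usage (a sketch mirroring spp_trace_log_init() in
+ * spp_platform_trace_log.c; my_timer/my_handler are hypothetical names):
+ *
+ *   static spp_timer_t my_timer;
+ *
+ *   static void my_handler (spp_timer_t *tp)
+ *   {
+ *       // ... periodic work ...
+ *       tp->expires = spp_timer_in_n_ms_inline(1000);  // re-arm for 1 sec
+ *       spp_timer_start(tp);
+ *   }
+ *
+ *   my_timer.cb_index = spp_timer_register_callback(my_handler);
+ *   my_timer.expires  = spp_timer_in_n_ms_inline(1000);
+ *   spp_timer_start(&my_timer);
+ */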
+
+#endif /* __SPP_TIMERS_H__ */
diff --git a/vnet/vnet/vcgn/tcp_header_definitions.h b/vnet/vnet/vcgn/tcp_header_definitions.h
new file mode 100644
index 00000000000..02920bcc8ee
--- /dev/null
+++ b/vnet/vnet/vcgn/tcp_header_definitions.h
@@ -0,0 +1,1582 @@
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * Filename: tcp_header_definitions.h
+ *
+ * Description: Layer 2, 3, 4 definitions and header types
+ *
+ * Assumptions and Constraints:
+ *
+ * Copyright (c) 2012-2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *-----------------------------------------------------------------------------
+ */
+
+#ifndef __TCP_HEADER_DEFINITIONS_H__
+#define __TCP_HEADER_DEFINITIONS_H__
+
+/*
+ * A general list of Layer 3 protocols, used by many Layer 2 encaps.
+ *
+ * formerly:
+ * TYPE_IP TYPE_IP10MB
+ * TYPE_ARP TYPE_RFC826_ARP
+ * TYPE_RARP TYPE_REVERSE_ARP
+ * TYPE_MPLS TYPE_TAGSWITCH
+ */
+#define TYPE_IP 0x0800
+#define TYPE_IP_V6 0x86DD
+#define TYPE_ARP 0x0806
+#define TYPE_RARP 0x8035
+#define TYPE_MPLS 0x8847
+#define TYPE_CDP 0x2000
+#define TYPE_CGMP 0x2001
+#define TYPE_LACP 0x8808 /* 802.3ad */
+#define TYPE_CLNS 0xFEFE
+
+#define TYPE_PPPOE_SESSION 0x8864 /* PTA plus */
+#define TYPE_PPPOE_DISCOVERY 0x8863 /* PTA plus */
+
+/*
+ * for atm arp handling
+ */
+#define IN_ATM_ARP_BIT 0x0008
+
+/*
+ * The Layer 2 header structures.
+ */
+
+
+/*
+** HDLC
+*/
+
+typedef struct hdlc_hdr_type {
+ u16 addr;
+ u16 type;
+ u8 data[0];
+} hdlc_hdr_type;
+
+#define HDLC_ADDR_CMD 0x0F00
+#define HDLC_HDR_LEN 4
+#define HDLC_BROADCAST_BIT 31
+#define TYPE_KEEP 0x8035
+
+#define HDLC_CLNS (HDLC_ADDR_CMD<<16|TYPE_CLNS)
+#define HDLC_CDP (HDLC_ADDR_CMD<<16|TYPE_CDP)
+#define HDLC_MPLS (HDLC_ADDR_CMD<<16|TYPE_MPLS)
+#define HDLC_IP (HDLC_ADDR_CMD<<16|TYPE_IP)
+#define HDLC_IP_V6 (HDLC_ADDR_CMD<<16|TYPE_IP_V6)
+#define HDLC_KEEPALIVE_CMD (HDLC_ADDR_CMD<<16|TYPE_KEEP)
+
+/*
+** PPP
+*/
+
+typedef struct ppp_comp_hdr_type {
+ union {
+ u8 ppp_u8[4];
+ u16 ppp_u16[2];
+ u32 ppp_u32;
+ } ppp_comp_u;
+} ppp_comp_hdr_type;
+
+#define PPP_STATION 0xFF03
+#define PPP_STATION_LEN 0x2
+#define PPP_ENDPROTO 0x01
+#define PPP_NOT_ENDPROTO 0xfffffffe
+#define PPP_CONTROL_PROTOCOL_MASK 0x8000
+#define PPP_CONTROL_PROTOCOL_BIT 15
+#define PPP_CSCO_LEN 4
+#define PPP_RFC1661_LEN 2
+#define PPP_RFC1661_COMP_LEN 1
+
+#define TYPE_PPP_IP 0x0021
+#define TYPE_PPP_IP_V6 0x0057
+#define TYPE_PPP_MPLS_UNICAST 0x0281
+#define TYPE_PPP_MPLS_CONTROL 0x8281
+#define TYPE_PPP_CLNS 0x0023
+#define TYPE_PPP_CDP 0x0207
+
+#define TYPE_PPP_IPCP 0x8021
+#define TYPE_PPP_LCP 0xC021
+#define TYPE_PPP_PAP 0xC023
+#define TYPE_PPP_LQR 0xC025
+#define TYPE_PPP_CHAP 0xC223
+
+
+#define TYPE_PPP_LCP_ECHO_REQUEST 0x09
+/*
+** MultiLink PPP
+*/
+
+#define MLPPP_FLAGS_FIELD_LEN 4
+#define MLPPP_BEGIN_MASK 0x80000000
+#define MLPPP_END_MASK 0x40000000
+#define MLPPP_BEGIN_END_MASK (MLPPP_BEGIN_MASK|MLPPP_END_MASK)
+#define MLPPP_BEGIN_END_SHIFT 30
+#define MLPPP_SEQUENCE_NUM_MASK 0x00FFFFFF
+#define MLPPP_MC_CLASS_ID_MASK 0x3C000000
+#define MLPPP_MC_CLASS_SHIFT 26
+
+#define TYPE_PPP_MULTILINK 0x003D
+
+/* these are needed in the micro-code, for optimizations */
+#define TYPE_PPP_FULL_IP_4 0xff030021
+#define TYPE_PPP_FULL_IP_3 0xff0321
+#define TYPE_PPP_FULL_IP_2 0x0021
+#define TYPE_PPP_FULL_IP_1 0x21
+
+#define MLPPP_BEGIN_END_MASK_BYTE 0xC0
+#define MLPPP_BEGIN_BIT 7
+#define MLPPP_END_BIT 6
+#define MLPPP_MC_CLASS_ID_MASK_BYTE 0x3C
+#define MLPPP_MC_CLASS_ID_SHIFT_BYTE 2
+
+#define MLPOA_BEGIN_END_SHIFT 24
+
+/*
+** Ethernet ARPA
+*/
+
+
+typedef struct ethernet_arpa_hdr_type {
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 type;
+ u8 data[0];
+} ethernet_arpa_hdr_type;
+
+typedef struct extension_802p3_type {
+ u16 type;
+ u8 ctl;
+ u8 data[0];
+} extension_802p3_type;
+
+typedef struct ethernet_802p3_hdr_type {
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 length;
+ extension_802p3_type extension;
+} ethernet_802p3_hdr_type;
+
+
+typedef struct ethernet_vlan_802p3_hdr_type {
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 type1;
+ u16 vlan_id;
+ u16 length;
+ extension_802p3_type extension;
+} ethernet_vlan_802p3_hdr_type;
+
+#define MIN_ETHERNET_PKT_LEN 60
+#define MAX_ETHERNET_PKT_LEN 1500
+#define ETHERNET_ARPA_HDR_LEN 14
+#define ETHERNET_TYPE_FIELD_SIZE 2
+
+
+/*
+** Ethernet 802.1q (VLAN)
+*/
+
+typedef struct ethernet_vlan_hdr_type {
+ u8 dest_addr[6];
+ u8 src_addr[6];
+ u16 type1;
+ u16 vlan_hdr;
+ u16 type2;
+ u8 data[0];
+} ethernet_vlan_hdr_type;
+
+
+/*
+** Ethernet 802.1.q-in-q (QinQ)
+*/
+
+typedef struct ethernet_qinq_hdr_type {
+ u8 dest_addr[6];
+ u8 src_addr[6];
+ u16 type1;
+ u16 vlan_hdr1;
+ u16 type2;
+ u16 vlan_hdr2;
+ u16 type3;
+ u8 data[0];
+} ethernet_qinq_hdr_type;
+
+
+/*
+** Ethernet 802.3ad EtherChannel control
+*/
+
+typedef struct ethernet_lacp_hdr_type {
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 type;
+ u16 LAcmd;
+ u8 data[0];
+} ethernet_lacp_hdr_type;
+
+
+/*
+** Ethernet 802.1 Bridge (spanning tree) PDU
+*/
+
+typedef struct ethernet_bpdu_hdr_type {
+ u8 daddr[6];
+ u8 saddr[6];
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 more[0];
+} ethernet_bpdu_hdr_type;
+
+#define ETH_BPDU_DSAP 0x42
+#define ETH_BPDU_SSAP 0x42
+#define ETH_BPDU_CONTROL 0x03
+#define ETH_BPDU_MATCH 0x424203
+
+
+/************************************************************/
+/* PTA PLUS ETHERNET ENCAPSULATIONS */
+/*
+ * PPPoEoARPA 20 bytes
+ */
+typedef struct ethernet_pppoe_arpa_hdr_type {
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 type;
+ /* pppoe hdr at beginning of enet payload */
+ u16 vtc; /* version(4b), type(4b) and code(8b) fields */
+ u16 sid;
+ u16 len;
+ u8 ppp_header[0]; /* PPP header start, no ff03 field present */
+} ethernet_pppoe_arpa_hdr_type;
+
+typedef struct pppoe_hdr_type {
+ /* pppoe hdr at beginning of enet payload */
+ u16 vtc; /* version(4b), type(4b) and code(8b) fields */
+ u16 sid;
+ u16 len;
+ u8 ppp_header[0]; /* PPP header start, no ff03 field present */
+} pppoe_hdr_type;
+
+/*
+** PPPoEoVLAN (802.1p or 802.1q) 24 bytes
+*/
+typedef struct ethernet_pppoe_vlan_hdr_type {
+ u8 dest_addr[6];
+ u8 src_addr[6];
+ u16 type1;
+ u16 vlan_hdr;
+ u16 type2;
+ /* pppoe hdr at beginning of enet payload */
+ u16 vtc; /* version(4b), type(4b) and code(8b) fields */
+ u16 sid;
+ u16 len;
+ u8 ppp_header[0]; /* PPP header start, no ff03 field present */
+} ethernet_pppoe_vlan_hdr_type;
+
+/*
+** PPPoEoQinQ 28 bytes
+*/
+typedef struct ethernet_pppoe_qinq_hdr_type {
+ u8 dest_addr[6];
+ u8 src_addr[6];
+ u16 type1;
+ u16 vlan_hdr1;
+ u16 type2;
+ u16 vlan_hdr2;
+ u16 type3;
+ /* pppoe hdr at beginning of enet payload */
+ u16 vtc; /* version(4b), type(4b) and code(8b) fields */
+ u16 sid;
+ u16 len;
+ u8 ppp_header[0]; /* PPP header start, no ff03 field present */
+} ethernet_pppoe_qinq_hdr_type;
+
+#define ETH_PPPOE_ARPA_HDR_LEN sizeof(ethernet_pppoe_arpa_hdr_type)
+#define ETH_PPPOE_VLAN_HDR_LEN sizeof(ethernet_pppoe_vlan_hdr_type)
+#define ETH_PPPOE_QINQ_HDR_LEN sizeof(ethernet_pppoe_qinq_hdr_type)
+#define PPPOE_HDR_LEN 6
+/* End PTA PLUS ETHERNET ENCAPSULATIONS */
+/****************************************************************/
+
+
+
+#define TYPE_DOT1Q 0x8100
+#define DOT1Q_HDR_LEN 18
+#define DOT1Q_VLAN_ID_MASK 0x0FFF
+#define DOT1Q_VLAN_ID_RES_0 0x0000
+#define DOT1Q_VLAN_ID_RES_4095 0x0FFF
+#define DOT1Q_ARPA_INDEX DOT1Q_VLAN_ID_RES_0
+
+#define TYPE_QINQ_91 0x9100
+#define TYPE_QINQ_92 0x9200
+#define TYPE_QINQ_88A8 0x88A8
+#define QINQ_HDR_LEN 22
+
+/*
+ * 802.1p support
+ */
+#define DOT1P_VLAN_COS_MASK 0xE000
+#define DOT1P_VLAN_COS_SHIFT 13
+#define DOT1P_MAX_COS_VALUE 7
+
+/*
+** Frame Relay
+*/
+
+/*
+ * formerly:
+ * TYPE_FR_IETF_IPV4 ENCAPS_FR_IETF
+ * TYPE_FR_CISCO_IPV4 ENCAPS_FR_CISCO
+ * TYPE_FR_ISIS ENCAPS_FR_ISIS
+ *
+ * FR_LMI_DLCI_CISCO LMI_DLCI_CISCO
+ * FR_LMI_DLCI_IETF LMI_DLCI_ITUANSI
+ */
+
+typedef struct frame_relay_hdr_type {
+ u16 address;
+ u16 control_nlpid;
+ u8 data[0];
+} frame_relay_hdr_type;
+
+typedef struct fr_snap_hdr_type {
+ u16 address;
+ u8 control;
+ u8 pad;
+ u8 nlpid;
+ u8 oui[3];
+ u16 protocol_id;
+} fr_snap_hdr_type;
+
+#define FR_ADDR_LEN 2
+#define FR_CTL_NLPID_LEN 2
+#define FR_HDR_LEN (FR_ADDR_LEN+FR_CTL_NLPID_LEN)
+
+/*
+ * These defines are for the FR-SNAP header.
+ * The SNAP header is set up solely so that we can
+ * identify ARP packets, which look like this:
+ *
+ * control pad nlpid oui protocol_id
+ * 03 00 80 00 00 00 0806
+ */
+#define FR_ARP_CONTROL 0x03
+#define FR_ARP_PAD 0x00
+#define FR_ARP_NLPID 0x80
+#define FR_ARP_OUI_0 0x00
+#define FR_ARP_OUI_1 0x00
+#define FR_ARP_OUI_2 0x00
+/*
+ * these are used only in the tmc code
+ */
+#define FR_NLPID_OUI_LEN 4
+#define FR_ARP_CONTROL_PAD 0x0300
+#define FR_ARP_NLPID_OUI 0x80000000
+
+
+#define FR_DLCI_UPPER_MASK 0xFC00
+#define FR_DLCI_UPPER_SHIFT 6
+#define FR_DLCI_LOWER_MASK 0x00F0
+#define FR_DLCI_LOWER_SHIFT 4
+
+/*
+ * Defines for converting a DLCI for insertion into a synthesized FR address
+ * field for FRoMPLS disposition.
+
+ * bit 8 7 6 5 4 3 2 1
+ * +-------------------------------+
+ * | Flag |
+ * | 0 1 1 1 1 1 1 0 |
+ * +-------------------------------+
+ * | Upper DLCI |C/R| 0 |
+ * +-------------------------------+
+ * | Lower DLCI | F | B | DE| 1 |
+ * +-------------------------------+
+ * | |
+ * :Frame relay information field :
+ * : (i.e.payload) :
+ * | |
+ * +-------------------------------+
+ * | FCS (2 or 4 octets) |
+ * | |
+ * +-------------------------------+
+ * | Flag |
+ * | 0 1 1 1 1 1 1 0 |
+ * +-------------------------------+
+ *
+ * a) With 10 bits for the DLCI
+ */
+#define FR_DLCI_TO_HDR_UPPER_MASK 0x3f0
+#define FR_DLCI_TO_HDR_UPPER_SHIFT (10-4)
+#define FR_DLCI_TO_HDR_LOWER_MASK 0xf
+#define FR_DLCI_TO_HDR_LOWER_SHIFT 4
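+
+/*
+ * Illustrative sketch (not part of the original code): synthesizing the
+ * 2-byte FR address field from a 10-bit DLCI with the masks above,
+ * leaving the C/R, FECN/BECN/DE and EA bits zero:
+ *
+ *   u16 addr = ((dlci & FR_DLCI_TO_HDR_UPPER_MASK)
+ *                  << FR_DLCI_TO_HDR_UPPER_SHIFT) |
+ *              ((dlci & FR_DLCI_TO_HDR_LOWER_MASK)
+ *                  << FR_DLCI_TO_HDR_LOWER_SHIFT);
+ *
+ *   e.g. DLCI 100 (0x064) yields addr == 0x1840.
+ */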
+
+#define TYPE_FR_IETF_IP 0x03CC
+#define TYPE_FR_IETF_IP_V6 0x038E
+#define TYPE_FR_CISCO_IP 0x0800
+#define TYPE_FR_CISCO_IP_V6 0x86DD
+#define TYPE_FR_ISIS 0x0383
+#define TYPE_FR_SNAP0PAD 0x0380
+#define TYPE_FR_SNAP1PAD 0x0300
+#define TYPE_FR_FRF12 0x03B1
+#define TYPE_FR_MLP 0x03CF
+#define TYPE_FR_EEK 0x8037
+
+#define FR_LMI_DLCI_CISCO 1023
+#define FR_LMI_DLCI_IETF 0
+
+#define FR_NOT_NOT_NOT 0
+#define FR_NOT_NOT_DE 1
+#define FR_NOT_BECN_NOT 2
+#define FR_NOT_BECN_DE 3
+#define FR_FECN_NOT_NOT 4
+#define FR_FECN_NOT_DE 5
+#define FR_FECN_BECN_NOT 6
+#define FR_FECN_BECN_DE 7
+
+#define FR_FECN_BECN_DE_MASK 0x000E
+#define FR_FECN_BECN_DE_SHIFT 1
+
+/* Address field extension bit for standard 2-byte FR address field */
+#define FR_EA1_MASK 0x0001
+#define FR_EA1_MASK_BIT 0
+
+/*
+ * these are needed in the micro-code, for optimizations
+ */
+
+/* the bit position (in the address field) of the LSB of the DLCI */
+#define FR_DLCI_LS_BIT 4
+
+
+/*
+**
+** MultiLink Frame Relay
+**
+*/
+
+typedef struct mlfr_hdr_type {
+ u16 frag_hdr;
+ u16 address;
+ u16 control_nlpid;
+ u8 data[0];
+} mlfr_hdr_type;
+
+/*
+ * LIP frames have B, E and C set--the other
+ * bits in the frag_hdr field are irrelevant.
+ *
+ * NOTE: Injected LIP packets have a frag_hdr of 0xE100.
+ *
+ */
+#define MLFR_LIP_FRAME 0xE100
+#define MLFR_LIP_MASK 0xE000
+#define MLFR_FRAG_HDR_LEN 2
+
+#define MLFR_BEGIN_MASK 0x8000
+#define MLFR_END_MASK 0x4000
+#define MLFR_BEGIN_END_MASK (MLFR_BEGIN_MASK|MLFR_END_MASK)
+#define MLFR_BEGIN_END_SHIFT 14
+
+#define MLFR_SEQ_NUM_HI_MASK 0x1E00
+#define MLFR_SEQ_NUM_HI_SHIFT 1
+#define MLFR_SEQ_NUM_LO_MASK 0x00FF
+
+/*
+ * these are needed in the micro-code, for optimizations
+ */
+#define MLFR_BEGIN_END_MASK_BYTE 0xC0
+
+
+/*
+ * FRF.12 definitions
+ */
+typedef struct frf12_hdr_type_ {
+ u16 address;
+ u16 control_nlpid;
+ u16 frag_hdr;
+ u8 data[0];
+} frf12_hdr_type;
+
+#define FRF12_FRAG_HDR_LEN sizeof(frf12_hdr_type)
+
+#define FRF12_BEGIN_MASK 0x8000
+#define FRF12_END_MASK 0x4000
+#define FRF12_BEGIN_END_MASK (FRF12_BEGIN_MASK|FRF12_END_MASK)
+#define FRF12_BEGIN_END_SHIFT 8
+
+#define FRF12_SEQ_NUM_HI_MASK 0x1E00
+#define FRF12_SEQ_NUM_HI_SHIFT 1
+#define FRF12_SEQ_NUM_LO_MASK 0x00FF
+#define FRF12_BEGIN_END_MASK_BYTE 0xC0
+
+
+
+/*
+**
+** MLP over Frame Relay
+** The ppp hdr can be either a
+** an MLP hdr or a PPP hdr
+**
+** MLP can be compressed or not:
+** a) 0xff03003d
+** b) 0x003d
+** c) 0x3d
+** followed by:
+** 1 byte with begin/end bits
+** 3 bytes of a sequence #
+**
+** PPP can be also be compressed or not.
+** Only these will be fwded:
+** a) 0xff030021
+** b) 0xff0321
+** c) 0x0021
+** d) 0x21
+**
+**
+*/
+typedef struct mlpofr_hdr_type {
+ u16 address;
+ u16 control_nlpid;
+ u8 ppp_header[0];
+} mlpofr_hdr_type;
+
+/*
+** ATM -
+*/
+
+/*
+ * channel_handle is defined as follows:
+ *
+ * bits 15 = reserved (must be 0)
+ * bits 14 - 0 = channel handle
+ *
+ *
+ * flags is a bitfield defined as follows:
+ *
+ * bits 15 - 13 = proto (PPPoA RFC1661 = 0,
+ * PPPoE = 1,
+ * RBE = 2,
+ * PPPoA Cisco = 3,
+ * MLPoATM RFC1661 = 4,
+ * MLPoATM Cisco = 5,
+ * Reserved = 6-7)
+ * bit 12 = encap (MUX=0,
+ * SNAP=1)
+ * bits 11 - 6 = reserved (must be 0)
+ * bits 5 - 3 = pkt_type (AAL5 pkt = 0,
+ * Raw cell (includes F4 OAM) = 1,
+ * F5 segment OAM cell = 2
+ * F5 end-to-end OAM cell = 3
+ * Reserved = 4-7)
+ * bit 2 = EFCI (congestion indication)
+ * bit 1 = reserved (must be 0)
+ * bit 0 = CLP (cell loss priority)
+ */
+
+typedef struct apollo_atm_generic_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+} apollo_atm_generic_hdr_type;
+
+typedef struct apollo_atm_aal5_snap_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 oui[3];
+ u16 type;
+ u8 data[0];
+} apollo_atm_aal5_snap_hdr_type;
+
+typedef struct atm_aal5_snap_hdr_type {
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 oui[3];
+ u16 pid;
+ u16 pad;
+ u8 data[0];
+} atm_aal5_snap_hdr_type;
+
+
+typedef struct apollo_atm_aal5_snap_hdr1_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 oui0;
+ u8 oui1;
+ u8 oui2;
+ u16 type;
+ u8 data[0];
+} apollo_atm_aal5_snap_hdr1_type;
+
+typedef struct apollo_atm_aal5_clns_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u16 type;
+ u16 data;
+} apollo_atm_aal5_clns_hdr_type;
+
+typedef struct apollo_atm_aal5_ilmi_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 data[0];
+} apollo_atm_aal5_ilmi_hdr_type;
+
+typedef struct apollo_atm_aal5_mux_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 data[0];
+} apollo_atm_aal5_mux_hdr_type;
+
+typedef struct apollo_atm_oam_f4_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ /*
+ * gcf_vpi_vci_pt_clp is a bitfield defined as follows:
+ *
+ * bits 31 - 28 = GCF
+ * bits 27 - 20 = VPI
+ * bits 19 - 4 = VCI
+ * bits 3 - 1 = PT
+ * bit 0 = CLP
+ */
+ u32 gcf_vpi_vci_pt_clp;
+ u8 data[0];
+} apollo_atm_oam_f4_hdr_type;
+
+#define APOLLO_ATM_OAM_F4_HDR_PT_MASK 0xE
+#define APOLLO_ATM_OAM_F4_HDR_PT_SHIFT 1
+
+typedef struct apollo_atm_oam_f5_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 data[0];
+} apollo_atm_oam_f5_hdr_type;
+
+#define APOLLO_IRONBUS_EXT_LESS_PROTO 0xFFFF0FFF
+#define APOLLO_CHANNEL_HANDLE_MASK 0xFFFF
+#define APOLLO_PKT_TYPE_MASK 0x0038
+#define APOLLO_PKT_TYPE_SHIFT 3
+#define APOLLO_FLAG_CLP_MASK 0x0001
+#define APOLLO_FLAG_CLP_BIT 0
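+
+/*
+ * Illustrative sketch: decoding the flags field described above
+ * (hdr is any of the apollo_atm_*_hdr_type structs):
+ *
+ *   u16 flags = hdr->flags;
+ *   u8 pkt_type = (flags & APOLLO_PKT_TYPE_MASK) >> APOLLO_PKT_TYPE_SHIFT;
+ *   u8 clp = flags & APOLLO_FLAG_CLP_MASK;
+ *
+ *   pkt_type == APOLLO_ATM_PACKET_TYPE_AAL5 then selects the AAL5 path.
+ */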
+
+#define APOLLO_CHANNEL_HANDLE_RES_0 0x0000
+/*
+ * The 1 byte HEC field is removed by the line card.
+ */
+#define APOLLO_F4_RX_CELL_SIZE 52
+#define APOLLO_F5_RX_CELL_SIZE 52
+
+#define APOLLO_ATM_PACKET_TYPE_AAL5 0
+#define APOLLO_ATM_PACKET_TYPE_F4 1
+#define APOLLO_ATM_PACKET_TYPE_F5_SEG 2
+#define APOLLO_ATM_PACKET_TYPE_F5_E_TO_E 3
+#define APOLLO_ATM_PACKET_TYPE_4 4
+#define APOLLO_ATM_PACKET_TYPE_5 5
+#define APOLLO_ATM_PACKET_TYPE_6 6
+#define APOLLO_ATM_PACKET_RESERVED 7
+
+#define APOLLO_AAL5_MUX_IP_HDR_LEN 4
+#define APOLLO_AAL5_SNAP_HDR_LEN 12
+
+#define APOLLO_RCV_IRON_BUS_EXT_LEN 4
+#define APOLLO_TX_IRON_BUS_EXT_LEN 8
+
+/*
+ * MLPoA type definitions
+ */
+#define MLPOA_CISCO_HDR 0xFF03
+#define MLPOA_SNAP_HDR_LEN 4
+#define MLPOA_CISCO_HDR_LEN 2
+
+/************************************************************/
+/* PTA PLUS ATM ENCAPSULATIONS */
+
+/* RBE header 28 bytes*/
+typedef struct apollo_atm_aal5_llcsnap_rbe_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 oui[3];
+ u16 pid;
+ u16 pad;
+ /* enet header within */
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 type;
+ u8 data[0]; /* start of IP */
+} apollo_atm_aal5_llcsnap_rbe_hdr_type;
+
+/* PPPoEoA header 34 bytes*/
+typedef struct apollo_atm_aal5_llcsnap_pppoe_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 oui[3];
+ u16 pid;
+ u16 pad;
+ /* enet header within */
+ u8 daddr[6];
+ u8 saddr[6];
+ u16 type;
+ /* pppoe hdr at beginning of enet payload */
+ u16 vtc; /* version(4b), type(4b) and code(8b) fields */
+ u16 sid;
+ u16 len;
+ u8 ppp_header[0]; /* PPP header start, no ff03 field present */
+} apollo_atm_aal5_llcsnap_pppoe_hdr_type;
+
+
+/* PPPoA MUX 4 bytes*/
+typedef struct apollo_atm_aal5_mux_pppoa_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 ppp_header[0];
+} apollo_atm_aal5_mux_pppoa_hdr_type;
+
+
+/* PPPoA SNAP LLC 8 bytes */
+typedef struct apollo_atm_aal5_llcsnap_pppoa_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 nlpid;
+ u8 ppp_header[0];
+} apollo_atm_aal5_llcsnap_pppoa_hdr_type;
+
+/* MLPoA MUX (generic) */
+typedef struct apollo_atm_aal5_mux_mlpoa_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 ppp_header[0];
+} apollo_atm_aal5_mux_mlpoa_hdr_type;
+
+/* MLPoA SNAP LLC */
+typedef struct apollo_atm_aal5_llcsnap_mlpoa_hdr_type {
+ u16 channel_handle;
+ u16 flags;
+ u8 dsap;
+ u8 ssap;
+ u8 control;
+ u8 nlpid;
+ u8 ppp_header[0];
+} apollo_atm_aal5_llcsnap_mlpoa_hdr_type;
+
+
+#define PPPOA_SNAPLLC_HDR_LEN sizeof(apollo_atm_aal5_llcsnap_pppoa_hdr_type)
+#define PPPOA_MUX_HDR_LEN sizeof(apollo_atm_aal5_mux_pppoa_hdr_type)
+#define PPPOE_SNAPLLC_HDR_LEN sizeof(apollo_atm_aal5_llcsnap_pppoe_hdr_type)
+#define RBE_SNAPLLC_HDR_LEN sizeof(apollo_atm_aal5_llcsnap_rbe_hdr_type)
+
+/* End PTA PLUS ATM ENCAPSULATIONS */
+/****************************************************************/
+
+#define LLCSNAP_PID_DOT3_NOFCS 0x0007
+
+/*
+** the SNAP header
+*/
+
+/*
+ * Note that some of these definitions are split
+ * up along certain word or half word boundaries
+ * to help expediate the TMC code.
+ */
+#define LLC_SNAP_HDR_DSAP 0xAA
+#define LLC_SNAP_HDR_SSAP 0xAA
+#define LLC_SNAP_HDR_CONTROL 0x03
+#define LLC_SNAP_HDR_OUI_0 0x00
+#define LLC_SNAP_HDR_OUI_1 0x00
+#define LLC_SNAP_HDR_OUI_2 0x00
+#define LLC_SNAP_HDR_OUI_2_CDP 0x0C
+
+#define LLC_SNAP_HDR_DSAP_SSAP 0xAAAA
+#define LLC_SNAP_HDR_DSAP_SSAP_CTRL_OUI0 0xAAAA0300
+#define LLC_SNAP_HDR_CONTROL_OUI 0x03000000
+#define LLC_SNAP_HDR_OUI1_OUI2_CDP 0x000C2000
+
+
+
+/*
+** SRP
+*/
+
+/*
+ * The v2_gen_hdr is a 2-byte field that contains the following:
+ *
+ * [ ttl | ring_id | mode | priority | parity ]
+ * bits 8 1 3 3 1
+ */
+typedef struct srp_hdr_type {
+ u16 v2_gen_hdr;
+ u8 dest_addr[6];
+ u8 src_addr[6];
+ u16 protocol;
+ u8 data[0];
+} srp_hdr_type;
+
+#define SRP_HDR_LEN 16
+
+#define SRP_IB_CHANNEL_CONTROL 0x0000
+#define SRP_IB_CHANNEL_DATA_HI 0x0001
+#define SRP_IB_CHANNEL_DATA_LO 0x0002
+
+#define SRP_RING_ID_MASK 0x0080
+#define SRP_RING_ID_BIT 7
+
+#define SRP_MODE_BITS_MASK 0x0070
+#define SRP_MODE_BITS_SHIFT 4
+#define SRP_MODE_CONTROL_TOPOLOGY 4
+#define SRP_MODE_CONTROL_IPS 5
+#define SRP_MODE_DATA 7
+
+#define SRP_PRIORITY_BITS_MASK 0x000E
+#define SRP_PRIORITY_BITS_SHIFT 1
+#define SRP_PRIORITY_HIGH 7
+#define SRP_PRIORITY_PAK_PRIORITY 6
+
+/* this is for the tmc code */
+#define SRP_INV_PRIORITY_BITS_MASK 0xFFF1
+
+#define SRP_PROT_CONTROL_TOPOLOGY 0x2007
+#define SRP_PROT_CONTROL_IPS 0x2007
+
+/* this is for the tmc code */
+#define SRP_TRUE 1
+#define SRP_FALSE 0
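+
+/*
+ * Illustrative sketch: unpacking v2_gen_hdr
+ * ([ ttl | ring_id | mode | priority | parity ]) with the masks above:
+ *
+ *   u16 h = clib_net_to_host_u16(srp->v2_gen_hdr);
+ *   u8 mode = (h & SRP_MODE_BITS_MASK) >> SRP_MODE_BITS_SHIFT;
+ *   u8 prio = (h & SRP_PRIORITY_BITS_MASK) >> SRP_PRIORITY_BITS_SHIFT;
+ *   u8 ring = (h & SRP_RING_ID_MASK) >> SRP_RING_ID_BIT;
+ *
+ *   mode == SRP_MODE_DATA identifies a data packet.
+ */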
+
+/*
+** MPLS
+*/
+#define MPLS_EOS_BIT 0x00000100
+#define MPLS_EOS_SHIFT 8
+#define MPLS_LABEL_SIZE 4
+#define MAX_MPLS_LABEL_STACK 6
+#define MPLS_LABEL_MASK 0xfffff000
+#define MPLS_LABEL_SHIFT 12
+#define MPLS_TTL_MASK 0x000000ff
+#define MPLS_EXP_MASK 0x00000e00
+#define MPLS_EXP_SHIFT 9
+#define MPLS_EXP_TTL_MASK 0x00000eff
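+
+/*
+ * Illustrative sketch: decoding one 4-byte MPLS label stack entry
+ * (entry assumed already converted to host byte order):
+ *
+ *   u32 label = (entry & MPLS_LABEL_MASK) >> MPLS_LABEL_SHIFT;
+ *   u32 exp = (entry & MPLS_EXP_MASK) >> MPLS_EXP_SHIFT;
+ *   u32 eos = (entry & MPLS_EOS_BIT) >> MPLS_EOS_SHIFT;
+ *   u32 ttl = entry & MPLS_TTL_MASK;
+ *
+ * Walk at most MAX_MPLS_LABEL_STACK entries until eos is set.
+ */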
+
+
+
+typedef union _layer2 {
+ hdlc_hdr_type hdlc;
+ ppp_comp_hdr_type ppp;
+ ethernet_arpa_hdr_type eth_arpa;
+ ethernet_vlan_hdr_type eth_vlan;
+ ethernet_qinq_hdr_type eth_qinq;
+ ethernet_lacp_hdr_type eth_lacp;
+ ethernet_bpdu_hdr_type eth_bpdu;
+ ethernet_802p3_hdr_type eth_802p3;
+ ethernet_vlan_802p3_hdr_type eth_vlan_802p3;
+ ethernet_pppoe_arpa_hdr_type eth_pppoe_arpa; /* PTA plus */
+ ethernet_pppoe_vlan_hdr_type eth_pppoe_vlan; /* PTA plus */
+ ethernet_pppoe_qinq_hdr_type eth_pppoe_qinq; /* PTA plus */
+ frame_relay_hdr_type frame_relay;
+ fr_snap_hdr_type fr_snap;
+ mlfr_hdr_type mlfr;
+ mlpofr_hdr_type mlpofr;
+ frf12_hdr_type frf12;
+ apollo_atm_generic_hdr_type atm_generic;
+ apollo_atm_aal5_snap_hdr_type atm_aal5_snap;
+ apollo_atm_aal5_snap_hdr1_type atm_aal5_snap1;
+ apollo_atm_aal5_clns_hdr_type atm_aal5_clns;
+ apollo_atm_aal5_ilmi_hdr_type atm_aal5_ilmi;
+ apollo_atm_aal5_mux_hdr_type atm_aal5_mux;
+ apollo_atm_oam_f4_hdr_type atm_oam_f4;
+ apollo_atm_oam_f5_hdr_type atm_oam_f5;
+ apollo_atm_aal5_llcsnap_rbe_hdr_type atm_aal5_rbe_snapllc; /* PTA plus */
+ apollo_atm_aal5_llcsnap_pppoe_hdr_type atm_aal5_pppoe_snapllc; /* PTA plus */
+ apollo_atm_aal5_mux_pppoa_hdr_type atm_aal5_pppoa_mux; /* PTA plus */
+ apollo_atm_aal5_llcsnap_pppoa_hdr_type atm_aal5_pppoa_snapllc; /* PTA plus */
+ apollo_atm_aal5_mux_mlpoa_hdr_type mlpoa_generic;
+ apollo_atm_aal5_llcsnap_mlpoa_hdr_type mlpoa_snapllc;
+ srp_hdr_type srp;
+} layer2_t;
+
+/*
+ * Define the Common OAM cell format - F4 & F5 cells
+ * For F4 cells:
+ * VPI == User VPI
+ * VCI == (3 == Segment), (4 == End-to-End)
+ *
+ * For F5 cells:
+ * VPI == User VPI
+ * VCI == User VCI
+ * PT == (100 == Segment, 101 == End-to-End)
+ *
+ * OAM Cell Type & Function Type:
+ *
+ * OAM_TYPE = (0001 == Fault management)
+ * OAM_FUNC == (0000 == AIS, 0001 == RDI, 0100 == CC,
+ * 1000 == loopback)
+ *
+ * OAM_TYPE = (0010 == Performance management)
+ * OAM_FUNC == (0000 == Forward Monitoring(FM),
+ * 0001 == Backward monitoring(BR),
+ * 0010 == Monitoring & reporting (FM+BR))
+ *
+ * OAM_TYPE = (1000 == Activation/Deactivation)
+ * OAM_FUNC == (0000 == Performance Monitoring,
+ * 0001 == Continuity Check)
+ *
+ * OAM_TYPE = (1111 == System Management)
+ * OAM_FUNC == (0001 == Security - non-real-time,
+ * 0010 == Security - real-time)
+ *
+ */
+#define ATM_OAM_FAULT_MGMT 0x1 /* OAM Fault mgmt. code */
+#define ATM_OAM_PRFRM_MGMT 0x2 /* performance mgmt code */
+#define ATM_OAM_ACT_DEACT 0x8 /* OAM Activation/Deactivation
+ code */
+#define ATM_OAM_SYSTEM_MGMT 0xF /* System Management code */
+
+#define ATM_OAM_AIS_FUNC 0x0 /* AIS function type */
+#define ATM_OAM_RDI_FUNC 0x1 /* RDI function type */
+#define ATM_OAM_CC_FUNC 0x4 /* OAM CC FM function code */
+#define ATM_OAM_LOOP_FUNC 0x8 /* Loopback function type */
+
+#define ATM_OAM_F5_SEGMENT 0x4 /* Segment function */
+#define ATM_OAM_F5_ENDTOEND 0x5 /* End-to-End function */
+#define ATM_OAM_F4_SEGMENT 0x3 /* Segment function */
+#define ATM_OAM_F4_ENDTOEND 0x4 /* End-to-End function */
+#define ATM_OAM_F4_PTI_ZERO 0x0 /* PTI=0 for F4 OAM */
+
+typedef struct atm_oam_hdr_t_ {
+ unsigned oam_gfc:4; /* GFC */
+ unsigned oam_vpi:8; /* VPI */
+ unsigned oam_vci_ms:4; /* VCI (Most Significant Bits) */
+
+ unsigned oam_vci_ls:12; /* VCI (Least Significant Bits) */
+ unsigned oam_pt:3; /* Payload Type */
+ unsigned oam_clp:1; /* Cell Loss Priority */
+ u8 data[0];
+} atm_oam_hdr_t;
+
+typedef struct atm_oam_type_func_t_ {
+ u8 oam_type:4;
+ u8 oam_func:4;
+ u8 data[0];
+} atm_oam_type_func_t;
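+
+/*
+ * Illustrative sketch: classifying an OAM cell from the type/func
+ * octet that follows the cell header:
+ *
+ *   atm_oam_type_func_t *tf = (atm_oam_type_func_t *) oam_hdr->data;
+ *   if (tf->oam_type == ATM_OAM_FAULT_MGMT &&
+ *       tf->oam_func == ATM_OAM_LOOP_FUNC) {
+ *       // fault-management loopback cell
+ *   }
+ */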
+
+/*
+** IP Version 4 header
+*/
+
+/*
+ * version_hdr_len_words [7-4] IP Header Version
+ * [3-0] IP Header Length in 32-bit words
+ * tos Type of Service
+ * total_len_bytes Total IP datagram length in bytes
+ * (including IP header)
+ * identification Unique fragmentation identifier
+ * frag_flags_offset [15-13] Fragmentation flags
+ * [12-0] Fragmentation Offset
+ * ttl Time To Live
+ * protocol_id Protocol Identifier
+ * checksum 16-bit 1's complement IP Header checksum
+ * src_addr IP Source Address
+ * dest_addr IP Destination Address
+ */
+typedef struct ipv4_header {
+ u8 version_hdr_len_words;
+ u8 tos;
+ u16 total_len_bytes;
+ u16 identification;
+ u16 frag_flags_offset;
+ u8 ttl;
+ u8 protocol;
+ u16 checksum;
+ u32 src_addr;
+ u32 dest_addr;
+ u8 data[0];
+} ipv4_header;
+
+/*OPTIONS PACKET TYPE
+ * +-+-+-+-+-+-+-+-+
+ * |C| CL| OP |
+ * +-+-+-+-+-+-+-+-+
+ */
+typedef struct ipv4_options {
+ u8 copy :1 ;
+ u8 op_class :2 ;
+ u8 option :5 ;
+ u8 pad ;
+}ipv4_options;
+
+#define LOOSE_SOURCE_ROUTE 131
+#define STRICT_SOURCE_ROUTE 137
+#define IPV4_NO_OPTIONS_HDR_LEN (sizeof(ipv4_header))
+#define IPV4_VERSION 4
+#define IPV4_HEADER_LENGTH_WORDS 5
+#define IPV4_VERSION_HDR_LEN_FIELD ((u8) 0x45)
+#define IPV4_MIN_HEADER_LENGTH_BYTES 20
+#define IP_HDR_LEN sizeof(ipv4_header)
+#define IPV4_VERSION_VALUE_SHIFT 4
+
+#define IPV4_FRAG_OFFSET_MASK (0x1fff)
+#define IPV4_FRAG_MF_MASK (0x2000)
+#define IPV4_FRAG_MF_SHIFT (13)
+
+/* 0.0.0.0 */
+#define IP_BOOTP_SOURCE_ADDRESS 0
+/* 255.255.255.255 */
+#define IP_LIMITED_BROADCAST_ADDRESS 0xFFFFFFFF
+
+/*
+ * IPv4 header - version & length fields
+ */
+#define IP_VER_LEN 0x45
+#define IP_VER 0x4
+#define IP_MIN_LEN 0x5
+#define IP_VER_MASK 0xf0
+#define IP_LEN_MASK 0x0f
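+
+/*
+ * Illustrative sketch: splitting version_hdr_len_words with the
+ * masks above:
+ *
+ *   u8 version = (ip->version_hdr_len_words & IP_VER_MASK) >> 4;
+ *   u8 hdr_len_bytes = (ip->version_hdr_len_words & IP_LEN_MASK) << 2;
+ *
+ *   e.g. 0x45 gives version 4 and a 20-byte header.
+ */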
+
+/*
+ * IPv4 header - TOS field
+ */
+#define PS_IP_TOS_MASK 0xff
+#define IP_PRECEDENCE_SHIFT 5 /* shift value up to precedence bits */
+#define IP_DSCP_SHIFT 2 /* shift value up to dscp bits */
+
+#define IP_TOS_PRECEDENCE 0xe0 /* mask of precedence in tos byte */
+#define IP_TOS_NO_PRECEDENCE 0x1f
+#define IP_TOS_LOW_DELAY 8 /* values must be shifted 1 bit */
+#define IP_TOS_HIGH_TPUT 4 /* before using */
+#define IP_TOS_HIGH_RELY 2
+#define IP_TOS_LOW_COST 1
+#define IP_TOS_NORMAL 0
+#define IP_TOS_MASK 0x1e /* mask of tos in tos byte */
+#define IP_TOS_MBZ_MASK 0x01 /* mask for MBZ bit in tos byte */
+#define IP_TOS_DSCP 0xfc /* mask for dscp in tos byte */
+#define IP_TOS_NO_DSCP 0x03
+
+#define IP_TOS_METRIC_TYPES 8
+#define IP_TOS_SHIFT 1
+
+#define IP_TOS_PRECEDENCE_MASK (IP_TOS_PRECEDENCE | IP_TOS_MASK)
+
+/*
+ * IP TOS Precedence values (High order 3 bits)
+ */
+#define TOS_PREC_NET_CONTROL 0xe0
+#define TOS_PREC_INET_CONTROL 0xc0
+#define TOS_PREC_CRIT_ECP 0xa0
+#define TOS_PREC_FLASH_OVER 0x80
+#define TOS_PREC_FLASH 0x60
+#define TOS_PREC_IMMEDIATE 0x40
+#define TOS_PREC_PRIORITY 0x20
+#define TOS_PREC_ROUTINE 0x00
+#define TOS_PREC_ILLEGAL 0xff /* invalid precedence value */
+
+#define TOS_PREC_NET_CONTROL_NUM 7
+#define TOS_PREC_INET_CONTROL_NUM 6
+#define TOS_PREC_CRIT_ECP_NUM 5
+#define TOS_PREC_FLASH_OVER_NUM 4
+#define TOS_PREC_FLASH_NUM 3
+#define TOS_PREC_IMMEDIATE_NUM 2
+#define TOS_PREC_PRIORITY_NUM 1
+#define TOS_PREC_ROUTINE_NUM 0
+
+
+
+/*
+ * IPv4 header - flags and fragment offset fields
+ */
+#define IP_FRAG_OFFSET_MASK 0x1fff
+
+
+#define IP_FRAG_MORE_MASK 0x2000
+#define IP_FRAG_DF_MASK 0x4000
+#define IP_FRAG_UNDEF_MASK 0x8000
+#define IP_FRAG_NO_DF_SET 0x0000
+
+/* bit definitions for fragment flags */
+#define IP_FRAG_MORE_BIT 13
+#define IP_FRAG_DF_BIT 14
+#define IP_FRAG_UNDEF_BIT 15
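+
+/*
+ * Illustrative sketch: fragment tests on frag_flags_offset using the
+ * masks above:
+ *
+ *   u16 ffo = clib_net_to_host_u16(ip->frag_flags_offset);
+ *   int is_fragment = (ffo & (IP_FRAG_OFFSET_MASK | IP_FRAG_MORE_MASK)) != 0;
+ *   int is_first_frag = is_fragment && !(ffo & IP_FRAG_OFFSET_MASK);
+ *   u32 offset_bytes = (ffo & IP_FRAG_OFFSET_MASK) << 3; // 8-octet units
+ */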
+
+/*
+ * IPv4 header - TTL field
+ */
+#define TTL_DEFAULT 255
+#define TTL_1 1
+#define TTL_2 2
+#define TTL_255 255
+
+
+/*
+ * IPv4 header - protocol field
+ *
+ * ICMP_PROT 1 ICMP
+ * IGMP_PROT 2 group management
+ * GGP_PROT 3 GGP
+ * IPINIP_PROT 4 IPv4 in IPv4 encapsulation
+ * TCP_PROT 6 TCP
+ * EGP_PROT 8 EGP
+ * IGRP_PROT 9 IGRP
+ * UDP_PROT 17 UDP
+ * HMP_PROT 20 HMP
+ * RDP_PROT 27 RDP
+ * IPV6_INIP_PROT 41 IPV6 in IPv4 encapsulation
+ * RSVP_PROT 46 RSVP
+ * GRE_PROT 47 GRE
+ * ESP_PROT 50 ESP
+ * AHP_PROT 51 AHP
+ * SDNS0_PROT 53 SNDS
+ * NHRP_PROT 54 NHRP
+ * SDNS1_PROT 55 SDNS1
+ * HELLO_PROT 63 HELLO
+ * ND_PROT 77 ND
+ * EONIP_PROT 80 CLNS over IP
+ * VINES_PROT 83 Banyan Vines
+ * NEWIGRP_PROT 88 IGRP
+ * OSPF_PROT 89 OSPF
+ * FST_RSRB_PROT 90 RSRB
+ * FST_DLSW_PROT 91 DLSW
+ * NOSIP_PROT 94 KA9Q/NOS compatible IP over IP
+ * PIM_PROT 103 PIMv2
+ * PCP_PROT 108 PCP
+ * PGM_PROT 113 PGM
+ * MAX_PROT 113 maximum protocol number in the above list,
+ * used in creating case registry
+ */
+#define ICMP_PROT 1
+#define IGMP_PROT 2
+#define GGP_PROT 3
+#define IPINIP_PROT 4
+#define TCP_PROT 6
+#define EGP_PROT 8
+#define IGRP_PROT 9
+#define UDP_PROT 17
+#define HMP_PROT 20
+#define RDP_PROT 27
+#define IPV6_INIP_PROT 41
+#define RSVP_PROT 46
+#define GRE_PROT 47
+#define ESP_PROT 50
+#define AHP_PROT 51
+#define SDNS0_PROT 53
+#define NHRP_PROT 54
+#define SDNS1_PROT 55
+#define HELLO_PROT 63
+#define ND_PROT 77
+#define EONIP_PROT 80
+#define VINES_PROT 83
+#define NEWIGRP_PROT 88
+#define OSPF_PROT 89
+#define FST_RSRB_PROT 90
+#define FST_DLSW_PROT 91
+#define NOSIP_PROT 94
+#define PIM_PROT 103
+#define PCP_PROT 108
+#define PGM_PROT 113
+#define MAX_PROT 113
+
+/*Well Known Application ports */
+#define FTP_PORT 21 /* For control connection */
+/*
+ * TCP header
+ */
+typedef struct tcp_hdr_type {
+ u16 src_port;
+ u16 dest_port;
+ u32 seq_num;
+ u32 ack_num;
+ u8 hdr_len;
+ u8 flags;
+ u16 window_size;
+ u16 tcp_checksum;
+ u16 urgent_pointer;
+ u8 option_data[0];
+} tcp_hdr_type;
+
+#define TCP_FLAG_FIN 0x01
+#define TCP_FLAG_SYN 0x02
+#define TCP_FLAG_RST 0x04
+#define TCP_FLAG_PUSH 0x08
+#define TCP_FLAG_ACK 0x10
+#define TCP_FLAG_URG 0x20
+#define TCP_FLAG_ECE 0x40
+#define TCP_FLAG_CWR 0x80
+
+/*
+ * TCP Option
+ */
+typedef struct tcp_option_s {
+ u8 kind;
+ u8 length;
+ u8 data[0];
+} tcp_option_t;
+
+#define TCP_END_OPTIONS_LIST 0
+#define TCP_OPTION_NOP 1
+#define TCP_OPTION_MSS 2
+#define TCP_OPTION_WINDOW_SCALE 3
+#define TCP_OPTION_SACK_PERMITTED 4
+#define TCP_OPTION_SACK_DATA 5
+#define TCP_OPTION_ECHO 6
+#define TCP_OPTION_ECHO_REPLY 7
+#define TCP_OPTION_TSOPT 8
+/*
+ 9 2 Partial Order Connection Permitted. RFC 1693
+ 10 3 Partial Order Service Profile. RFC 1693
+ 11 6 CC, Connection Count. RFC 1644
+ 12 6 CC.NEW RFC 1644
+ 13 6 CC.ECHO RFC 1644
+ 14 3 TCP Alternate Checksum Request. RFC 1146
+ 15 Variable. TCP Alternate Checksum Data. RFC 1146
+ 16 Skeeter.
+ 17 Bubba.
+ 18 3 Trailer Checksum Option.
+*/
+#define TCP_OPTION_MD5_SIGNATURE 19
+/*
+ 20 SCPS Capabilities.
+ 21 Selective Negative Acknowledgements.
+ 22 Record Boundaries.
+ 23 Corruption experienced.
+ 24 SNAP.
+ 25
+ 26 TCP Compression Filter.
+*/
+#define TCP_OPTION_QUICK_START 27
+
+#define TCP_OPTION_NUM_MAX 27
+
+#ifdef TARGET_CISCO
+#define tcp_printf(format_str, params...) //printf(format_str, ## params)
+#else
+#define tcp_printf(format_str, params...) printf(format_str, ## params)
+#endif
+
+typedef struct udp_hdr_type {
+ u16 src_port;
+ u16 dest_port;
+ u16 udp_length;
+ u16 udp_checksum;
+} udp_hdr_type_t;
+
+#define TYPE_IPV6 0x86dd
+#define TYPE_IPV4 0x0800
+
+/*
+ * version_trafficclass_flowlabel [31:28] IP Header Version,
+ [27:20] traffic_class,
+ [19:0] flow_label[20]
+ * payload_length Length of packet in bytes excluding header size(s)
+ * next_header Identifies the type of header following the IPv6 header
+ * hop_limit Decremented by 1 by each forwarding node, packet discarded when zero
+ * src_addr IPv6 Source Address
+ * dst_addr IPv6 Destination Address
+ */
+typedef struct ipv6_header {
+ u32 version_trafficclass_flowlabel;
+ u16 payload_length;
+ u8 next_header;
+ u8 hop_limit;
+ u32 src_addr[4];
+ u32 dst_addr[4];
+ u8 data[0];
+} ipv6_header_t;
+
+#define IPV6_HDR_LEN 40
+#define IPV6_HDR_LEN_WORDS 10
+#define IPV6_FLABLE_MASK 0x000FFFFF
+#define IPV6_MIN_PATH_MTU (1280)
+
+#define IPV6_GET_IP_VER(ih) ((clib_net_to_host_u32((ih) \
+ ->version_trafficclass_flowlabel) >> 28) & 0xf)
+#define IPV6_GET_TOS(ih) ((clib_net_to_host_u32((ih) \
+ ->version_trafficclass_flowlabel) >> 20) & 0xff)
+#define IPV6_GET_FLOW_LABEL(ih) ((clib_net_to_host_u32((ih) \
+ ->version_trafficclass_flowlabel)) & 0xfffff)
+
+#define IPV6_VERSION_VALUE (6)
+#define IPV6_VERSION_VALUE_SHIFT (28)
+#define IPV6_TRAFFIC_CLASS_VALUE_SHIFT (20)
+#define IPV6_TRAFFIC_CLASS_VALUE_MASK (0xff)
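+
+/*
+ * Illustrative sketch: composing version_trafficclass_flowlabel from
+ * its three parts with the values above:
+ *
+ *   u32 vtf = (IPV6_VERSION_VALUE << IPV6_VERSION_VALUE_SHIFT) |
+ *             ((tc & IPV6_TRAFFIC_CLASS_VALUE_MASK)
+ *                  << IPV6_TRAFFIC_CLASS_VALUE_SHIFT) |
+ *             (flow_label & IPV6_FLABLE_MASK);
+ *   ip6->version_trafficclass_flowlabel = clib_host_to_net_u32(vtf);
+ */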
+
+#define IPV6_PROTO_HOPOPTS 0
+#define IPV6_PROTO_TCP 6
+#define IPV6_PROTO_UDP 17
+#define IPV6_PROTO_IPV6 41
+#define IPV6_PROTO_ROUTING 43
+#define IPV6_PROTO_FRAGMENT 44
+#define IPV6_PROTO_DESTOPTS 60
+#define IPV6_PROTO_ESP 50
+#define IPV6_PROTO_AH 51
+#define IPV6_PROTO_ICMPV6 58
+#define IPV6_PROTO_NONE 59
+
+/* standard v6 extension headers are 2 bytes:
+ * one byte next header
+ * one byte header length
+ */
+
+typedef struct ipv6_frag_header {
+ u8 next_header;
+ u8 reserved;
+ u16 frag_offset_res_m;
+ u32 identification;
+} ipv6_frag_header_t;
+
+#define IPV6_FRAG_HDR_LEN (sizeof(ipv6_frag_header_t))
+
+#define IPV6_FRAG_OFFSET_MASK (0xFFF8)
+#define IPV6_FRAG_OFFSET_SHIFT (3)
+#define IPV6_FRAG_MORE_FRAG_MASK (0x0001)
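+
+/*
+ * Illustrative sketch: unpacking frag_offset_res_m with the masks above:
+ *
+ *   u16 fo = clib_net_to_host_u16(fh->frag_offset_res_m);
+ *   u32 offset_units = (fo & IPV6_FRAG_OFFSET_MASK) >> IPV6_FRAG_OFFSET_SHIFT;
+ *   u32 offset_bytes = offset_units << 3; // offset is in 8-octet units
+ *   int more_frags = fo & IPV6_FRAG_MORE_FRAG_MASK;
+ */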
+
+#define IPV6_TOS_SHIFT 20
+#define IPV6_TOS_SHIFT_HLF_WD 4
+#define IPV6_NEXT_HDR_SHIFT 8
+
+typedef struct ipv6_routing_header {
+ u8 next_header;
+ u8 hdr_ext_len;
+ u8 routing_type;
+ u8 segments_left;
+ u8 data[0];
+} ipv6_routing_header_t;
+#define IPV6_ROUTING_HDR_LEN (sizeof(ipv6_routing_header_t))
+
+typedef struct ipv6_hop_header {
+ u8 next_header;
+ u8 hdr_ext_len;
+ u8 options[0];
+} ipv6_hop_header_t;
+#define IPV6_HOP_LEN (sizeof(ipv6_hop_header_t))
+
+typedef struct ipv6_dest_opt_header {
+ u8 next_header;
+ u8 hdr_ext_len;
+ u8 options[0];
+} ipv6_dest_opt_header_t;
+#define IPV6_DESTOPT_LEN (sizeof(ipv6_dest_opt_header_t))
+
+
+/* Definition of ICMP header */
+typedef struct icmp_v4_s {
+ u8 type;
+ u8 code;
+ u16 checksum;
+ u16 identifier;
+ u16 sequence;
+} icmp_v4_t;
+
+#define ICMPV4_HDR_SIZE (sizeof(icmp_v4_t))
+#define ICMPV4_ECHOREPLY 0 /* Type: echo reply */
+#define ICMPV4_ECHO 8 /* Type: echo request */
+
+#define ICMPV4_UNREACHABLE 3 /* Type: destination unreachable */
+#define ICMPV4_UNRNET 0 /* Code: Net unreachable */
+#define ICMPV4_UNRHOST 1 /* Code: host unreachable */
+#define ICMPV4_UNRPROT 2 /* Code: protocol unreachable */
+#define ICMPV4_UNRPORT 3 /* Code: port unreachable */
+#define ICMPV4_UNRFRAG 4 /* Code: frag req DF set */
+#define ICMPV4_UNRADMIN 13 /* Code: administratively prohib. */
+#define ICMPV4_SOURCEROUTE_FAILED 5 /* Code: source route failed */
+
+#define ICMPV4_SRC_ROUTE_FAIL 5 /* Code: Source Route Failed */
+#define ICMPV4_NO_ROUTE_DESTN_8 8 /* Code: No Route to Destn */
+#define ICMPV4_NO_ROUTE_DESTN_11 11 /* Code: No Route to Destn */
+#define ICMPV4_NO_ROUTE_DESTN_12 12 /* Code: No Route to Destn */
+
+#define ICMPV4_ADMIN_PROH_9 9 /* Code: Administratively Prohibited */
+#define ICMPV4_ADMIN_PROH_10 10 /* Code: Administratively Prohibited */
+#define ICMPV4_PREC_CUTOFF 15 /* Code: Precedence Cutoff */
+
+
+#define ICMPV4_TIMEEXCEEDED 11 /* Type: time exceeded */
+#define ICMPV4_TIMTTL 0 /* Code: ttl in transit code */
+
+#define ICMPV4_PARAMETER_PROBLEM 12 /* Type: Parameter Problem */
+#define ICMPV4_PARAM_ERROR 0 /* Code: Pointer to Error */
+#define ICMPV4_MISSING_OPTION_CODE 1 /* Code: missing option */
+#define ICMPV4_PARAM_BAD_LEN 2 /* Code: Bad Length */
+
+#define ICMPV4_CONVERSION_ERROR 31
+#define ICMPV4_SOURCE_QUENCH 4
+#define ICMPV4_REDIRECT 5
+#define ICMPV4_TIMESTAMP 13
+#define ICMPV4_TIMESTAMP_REPLY 14
+#define ICMPV4_INFO_REQUEST 15
+#define ICMPV4_INFO_REPLY 16
+#define ICMPV4_ADDR_MASK_REQUEST 17
+#define ICMPV4_ADDR_MASK_REPLY 18
+
+typedef struct icmp_v6_s {
+
+ u8 type;
+ u8 code;
+ u16 checksum;
+
+ u32 data[0];
+} icmp_v6_t;
+
+typedef struct pseudo_v6_header {
+ u32 src_addr[4];
+ u32 dst_addr[4];
+ u16 payload_length;
+ u16 next_header;
+} pseudo_v6_header_t;
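+
+/*
+ * Illustrative sketch (not part of the original header): populating the
+ * pseudo header above from an IPv6 header before an upper-layer (e.g.
+ * ICMPv6) checksum computation. The ones'-complement summation itself is
+ * assumed to be done elsewhere; the helper name is hypothetical, and the
+ * placement of the 8-bit next header value in the low byte of the u16
+ * field is an assumption about the intended wire layout.
+ */
+static inline void pseudo_v6_header_populate (pseudo_v6_header_t *ph,
+                                              ipv6_header_t *ih,
+                                              u8 next_header)
+{
+    int i;
+    for (i = 0; i < 4; i++) {
+        ph->src_addr[i] = ih->src_addr[i];
+        ph->dst_addr[i] = ih->dst_addr[i];
+    }
+    ph->payload_length = ih->payload_length;
+    ph->next_header = clib_host_to_net_u16((u16) next_header);
+}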
+
+
+#define ICMPV6_ECHO 128
+#define ICMPV6_ECHO_REPLY 129
+#define ICMPV6_PKT_TOO_BIG 2
+#define ICMPV6_TIMEEXCEEDED 3
+#define ICMPV6_TIMTTL 0
+#define ICMPV6_PARAMETER_PROBLEM 4
+#define ICMPV6_UNREACHABLE 1
+#define ICMPV6_NEIGHBOR_SOLICITAION 135
+#define ICMPV6_NEIGHBOR_ADVT 136
+/* ICMP V6 generated packet size */
+#define ICMPV6_ERR_SIZE 48
+#define ICMPV6_HDR_SIZE (sizeof(icmp_v6_t) +sizeof(u32))
+
+/* Code for Type 1 */
+#define ICMPV6_UNRDESTN 0 /* Code: No route to destination */
+#define ICMPV6_ADM_PROH 1 /* Code: Administratively prohibited */
+#define ICMPV6_SRC_ADD_SCOPE 2 /* Code: Source Address beyond scope */
+#define ICMPV6_UNRHOST 3 /* Code: Host Unreachable */
+#define ICMPV6_UNRPORT 4 /* Code: Port UnReachable */
+
+#define ICMPV6_UNRPROT 1 /* Type 4 - Code: unrecognized next header (protocol unreachable) */
+
+#define ICMPV6_PTB_CODE 0 /* Code: For PTB */
+#define ICMPV6_PARAM_CODE 0 /* Code: For Parameter Problem */
+#define ICMPV6_UNREC_HDR 1 /* Code: For Parameter Problem */
+#define ICMPV6_SRC_ADD_FAIL 5 /* Code: For Source address failed */
+#define ICMP_ECHO_REPLY_CODE 0
+#define DEFAULT_TTL_HOPLIMIT_VAL 64
+
+typedef struct pptp_hdr_type {
+
+ u16 flags_ver;
+ u16 proto_type; /* PPP = 0x880B */
+ u16 payload_len;
+ u16 call_id;
+ u32 seq_no;
+ u32 ack_no;
+
+} pptp_hdr_type_t;
+
+/*
+ * NAME
+ *
+ * tcp_findoption
+ *
+ * SYNOPSIS
+ *     u8* tcp_findoption (tcp_hdr_type *tcp, u8 option)
+ *
+ * PARAMETERS
+ * tcp - pointer to TCP header
+ * option - TCP option
+ *
+ * RETURNS
+ * This function returns a pointer to the option found,
+ * otherwise returns null.
+ *
+ *
+ * DESCRIPTION
+ *     This function searches for the option and returns a pointer to the
+ * matched option field containing option kind/length/data sub-fields.
+ *
+ */
+static inline u8* tcp_findoption (tcp_hdr_type *tcp, u8 option)
+{
+    u8 *data;
+ u8 len, optlen;
+
+ data = tcp->option_data;
+ len = ((tcp->hdr_len>>4) << 2) - sizeof(tcp_hdr_type);
+
+#define MAXTCPOPTIONBYTES 40
+#define MINTCPOPTIONLENGTH 2
+
+ while (len) {
+ if (PREDICT_TRUE(option == data[0])) {
+ return (data);
+ } else {
+ switch (data[0]) {
+ case TCP_END_OPTIONS_LIST:
+ return (NULL);
+ case TCP_OPTION_NOP:
+ len -= 1;
+ data += 1;
+ break;
+ default:
+ /* Sanity check the length. */
+ optlen = data[1];
+ if ((optlen < MINTCPOPTIONLENGTH) ||
+ (optlen > MAXTCPOPTIONBYTES) ||
+ (optlen > len)) {
+ return (NULL);
+ }
+ len -= optlen;
+ data += optlen;
+ break;
+ }
+ }
+ }
+
+ return (NULL);
+}
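+
+/*
+ * Usage sketch (illustrative, not part of the original file): probing for
+ * the TCP MSS option with tcp_findoption(). TCP_OPTION_MSS (kind 2) is
+ * assumed to be defined alongside TCP_OPTION_NOP; the kind/length/value
+ * layout follows RFC 793. The helper name is hypothetical.
+ */
+static inline u16 tcp_get_mss_or_zero (tcp_hdr_type *tcp)
+{
+    u8 *opt = tcp_findoption (tcp, TCP_OPTION_MSS);
+    /* A well-formed MSS option is kind(1) + len(1) + 2 bytes of value */
+    if (opt && opt[1] == 4)
+        return (u16)((opt[2] << 8) | opt[3]); /* assemble in host order,
+                                                 avoiding an unaligned load */
+    return 0;
+}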
+
+
+static inline u32 crc_calc (ipv4_header *ipv4)
+{
+ u16 *ipv4_word_ptr = (u16 *) ipv4;
+ u32 crc32;
+    /*
+     * Sum every 16-bit word of the 20-byte base header except word 5,
+     * the checksum field itself (options, if present, are not covered)
+     */
+ crc32 = (u32)clib_net_to_host_u16(*ipv4_word_ptr) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 1)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 2)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 3)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 4)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 6)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 7)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 8)) +
+ (u32)clib_net_to_host_u16(*(ipv4_word_ptr + 9));
+
+ /* Add in the carry of the original sum */
+ crc32 = (crc32 & 0xFFFF) + (crc32 >> 16);
+ /* Add in the carry of the final sum */
+ crc32 = (crc32 & 0xFFFF) + (crc32 >> 16);
+
+ return crc32;
+}
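+
+/*
+ * Usage sketch (illustrative, not part of the original file): despite the
+ * "crc" naming, crc_calc() produces the folded 16-bit ones'-complement
+ * sum of the IPv4 header words; the header checksum is the complement of
+ * that sum. The helper name is hypothetical.
+ */
+static inline u16 ipv4_compute_checksum (ipv4_header *ipv4)
+{
+    return (u16)(~crc_calc(ipv4) & 0xFFFF);
+}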
+
+#endif /* __TCP_HEADER_DEFINITIONS_H__ */
diff --git a/vnet/vnet/vcgn/vcgn_classify.c b/vnet/vnet/vcgn/vcgn_classify.c
new file mode 100644
index 00000000000..518f9102317
--- /dev/null
+++ b/vnet/vnet/vcgn/vcgn_classify.c
@@ -0,0 +1,1419 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/pg/pg.h>
+#include <vppinfra/error.h>
+#include <vppinfra/pool.h>
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+
+#include "cnat_db.h"
+#include "cnat_global.h"
+#include "cnat_cli.h"
+#include "cnat_config.h"
+#include "cnat_logging.h"
+#include "cnat_config_api.h"
+#include "cnat_show_api.h"
+#include "cnat_show_response.h"
+#include "cnat_ipv4_udp.h"
+#include "cnat_common_api.h"
+
+#include <arpa/inet.h>
+
+typedef struct {
+ u32 cached_next_index;
+
+ /* inside, outside interface handles */
+ u32 inside_sw_if_index;
+ u32 outside_sw_if_index;
+
+ /* convenience variables */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} vcgn_classify_main_t;
+
+typedef struct {
+ /* $$$$ fill in with per-pkt trace data */
+ u32 next_index;
+ u32 sw_if_index;
+ u32 orig_dst_address;
+ u16 orig_dst_port;
+} vcgn_classify_trace_t;
+
+#define FIND_MY_VRF_USING_I_VRF_ID \
+ my_vrfmap_found = 0; \
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({ \
+ if (my_vrfmap->i_vrf_id == i_vrf_id) { \
+ my_vrfmap_found = 1; \
+ my_vrfmap_temp = my_vrfmap; \
+ break; \
+ } \
+ }));
+
+
+/* packet trace format function */
+static u8 * format_swap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ vcgn_classify_trace_t * t = va_arg (*args, vcgn_classify_trace_t *);
+
+ s = format (s, "VCGN_CLASSIFY: dst %U dst_port %d sw_if_index %d next %d",
+ format_ip4_address, (ip4_header_t *) &t->orig_dst_address,
+ clib_net_to_host_u16(t->orig_dst_port),
+ t->sw_if_index, t->next_index);
+ return s;
+}
+
+vcgn_classify_main_t vcgn_classify_main;
+
+vlib_node_registration_t vcgn_classify_node;
+
+#define foreach_vcgn_classify_error \
+_(PACKETS_RECEIVED, "total packets received") \
+_(V4_PACKETS_PROCESSED, "ipv4 packets processed for vCGN") \
+_(V4_PACKETS_PUNTED, "ipv4 packets punted") \
+_(V6_PACKETS_PUNTED, "ipv6 packets punted") \
+_(MPLS_PACKETS_PUNTED, "mpls unicast packets punted") \
+_(ETH_PACKETS_PUNTED, "ethernet packets punted")
+
+
+typedef enum {
+#define _(sym,str) VCGN_CLASSIFY_ERROR_##sym,
+ foreach_vcgn_classify_error
+#undef _
+ VCGN_CLASSIFY_N_ERROR,
+} vcgn_classify_error_t;
+
+static char * vcgn_classify_error_strings[] = {
+#define _(sym,string) string,
+ foreach_vcgn_classify_error
+#undef _
+};
+
+/*
+ * To drop a pkt and increment one of the previous counters:
+ *
+ * set b0->error = error_node->errors[VCGN_CLASSIFY_ERROR_EXAMPLE];
+ * set next0 to a disposition index bound to "error-drop".
+ *
+ * To manually increment the specific counter VCGN_CLASSIFY_ERROR_EXAMPLE:
+ *
+ *   vlib_node_t *n = vlib_get_node (vm, vcgn_classify_node.index);
+ * u32 node_counter_base_index = n->error_heap_index;
+ * vlib_error_main_t * em = &vm->error_main;
+ * em->counters[node_counter_base_index + VCGN_CLASSIFY_ERROR_EXAMPLE] += 1;
+ *
+ */
+
+typedef enum {
+ VCGN_CLASSIFY_NEXT_IP4_INPUT,
+ VCGN_CLASSIFY_NEXT_IP6_INPUT,
+ VCGN_CLASSIFY_NEXT_MPLS_INPUT,
+ VCGN_CLASSIFY_NEXT_ETHERNET_INPUT,
+ VCGN_CLASSIFY_NEXT_UDP_INSIDE,
+ VCGN_CLASSIFY_NEXT_UDP_OUTSIDE,
+ VCGN_CLASSIFY_NEXT_TCP_INSIDE,
+ VCGN_CLASSIFY_NEXT_TCP_OUTSIDE,
+ VCGN_CLASSIFY_NEXT_ICMP_Q_INSIDE,
+ VCGN_CLASSIFY_NEXT_ICMP_Q_OUTSIDE,
+ VCGN_CLASSIFY_NEXT_ICMP_E_INSIDE,
+ VCGN_CLASSIFY_NEXT_ICMP_E_OUTSIDE,
+ VCGN_CLASSIFY_N_NEXT,
+} vcgn_classify_next_t;
+
+static uword
+vcgn_classify_node_fn (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ u32 n_left_from, * from, * to_next;
+ vcgn_classify_next_t next_index;
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ vlib_node_t *n = vlib_get_node (vm, vcgn_classify_node.index);
+ u32 node_counter_base_index = n->error_heap_index;
+ vlib_error_main_t * em = &vm->error_main;
+ u16 *l3_type;
+ int counter;
+
+ from = vlib_frame_vector_args (frame);
+ n_left_from = frame->n_vectors;
+ next_index = node->cached_next_index;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ #if 0
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ u32 sw_if_index0, sw_if_index1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE);
+ CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE);
+ }
+
+ /* speculatively enqueue b0 and b1 to the current next frame */
+ to_next[0] = bi0 = from[0];
+ to_next[1] = bi1 = from[1];
+ from += 2;
+ to_next += 2;
+ n_left_from -= 2;
+ n_left_to_next -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = vcm->cached_next_index;
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
+ next1 = vcm->cached_next_index;
+
+ /* $$$$ your message in this space. Process 2 x pkts */
+ em->counters[node_counter_base_index + VCGN_CLASSIFY_ERROR_PACKETS_RECEIVED] += 2;
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)))
+ {
+ if (b0->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ vcgn_classify_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ }
+ if (b1->flags & VLIB_BUFFER_IS_TRACED)
+ {
+ vcgn_classify_trace_t *t =
+ vlib_add_trace (vm, node, b1, sizeof (*t));
+ t->sw_if_index = sw_if_index1;
+ t->next_index = next1;
+ }
+ }
+
+ /* verify speculative enqueues, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+ #endif /* if 0 */
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ u32 sw_if_index0;
+ ip4_header_t * h0;
+ //ipv4_header *h0;
+ ethernet_header_t *eth0;
+ icmp_v4_t *icmp;
+ u8 icmp_type;
+ u8 ipv4_hdr_len;
+
+ /* speculatively enqueue b0 to the current next frame */
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ eth0 = (ethernet_header_t *) vlib_buffer_get_current(b0);
+ u16 *etype = &eth0->type;
+
+ /* vlan tag 0x8100 */
+ if (*etype == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) {
+            l3_type = (etype + 2); /* Skip the 2-byte VLAN TCI to reach the inner ethertype */
+ vlib_buffer_advance(b0, 18);
+ } else {
+ l3_type = etype;
+ vlib_buffer_advance(b0, 14);
+ }
+ /* Handling v4 pkts 0x800 */
+ if (*l3_type == clib_host_to_net_u16(ETHERNET_TYPE_IP4)) {
+
+ h0 = vlib_buffer_get_current (b0);
+
+ u8 protocol_type = h0->protocol;
+
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
+ next0 = VCGN_CLASSIFY_NEXT_IP4_INPUT;
+ counter = VCGN_CLASSIFY_ERROR_V4_PACKETS_PROCESSED;
+
+ if (protocol_type == 0x11) { /* UDP# 17 */
+ next0 = (sw_if_index0 == vcm->inside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_UDP_INSIDE : next0;
+
+ next0 = (sw_if_index0 == vcm->outside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_UDP_OUTSIDE : next0;
+ } else if (protocol_type == 0x06) { /* TCP# 6 */
+ next0 = (sw_if_index0 == vcm->inside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_TCP_INSIDE : next0;
+
+ next0 = (sw_if_index0 == vcm->outside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_TCP_OUTSIDE : next0;
+ } else if (protocol_type == 0x01) { /* ICMP # 1 */
+
+ ipv4_hdr_len = (h0->ip_version_and_header_length & 0xf) << 2;
+ icmp = (icmp_v4_t *)((u8*)h0 + ipv4_hdr_len);
+ icmp_type = icmp->type;
+
+ if ((icmp_type == ICMPV4_ECHO) ||
+ (icmp_type == ICMPV4_ECHOREPLY)) {
+ next0 = (sw_if_index0 == vcm->inside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_ICMP_Q_INSIDE : next0;
+
+ next0 = (sw_if_index0 == vcm->outside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_ICMP_Q_OUTSIDE : next0;
+ } else {
+ next0 = (sw_if_index0 == vcm->inside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_ICMP_E_INSIDE : next0;
+
+ next0 = (sw_if_index0 == vcm->outside_sw_if_index) ?
+ VCGN_CLASSIFY_NEXT_ICMP_E_OUTSIDE : next0;
+
+ }
+ } else {
+ /* cannot do NATting with this L4 protocol */
+ counter = VCGN_CLASSIFY_ERROR_V4_PACKETS_PUNTED;
+ }
+
+ if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE)
+ && (b0->flags & VLIB_BUFFER_IS_TRACED))) {
+ udp_header_t * u0 = (udp_header_t *)(h0+1);
+ vcgn_classify_trace_t *t =
+ vlib_add_trace (vm, node, b0, sizeof (*t));
+ t->sw_if_index = sw_if_index0;
+ t->next_index = next0;
+ t->orig_dst_address = h0->dst_address.as_u32;
+ t->orig_dst_port = u0->dst_port;
+ }
+
+ } else if (*l3_type == clib_host_to_net_u16(ETHERNET_TYPE_IP6)) {
+
+ /* IPv6 0x86DD */
+ next0 = VCGN_CLASSIFY_NEXT_IP6_INPUT;
+ counter = VCGN_CLASSIFY_ERROR_V6_PACKETS_PUNTED;
+
+ } else if (*l3_type ==
+ clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
+
+ /* MPLS unicast 0x8847 */
+ next0 = VCGN_CLASSIFY_NEXT_MPLS_INPUT;
+ counter = VCGN_CLASSIFY_ERROR_MPLS_PACKETS_PUNTED;
+ } else { /* Remaining all should be pushed to "ethernet-input" */
+
+ next0 = VCGN_CLASSIFY_NEXT_ETHERNET_INPUT;
+ counter = VCGN_CLASSIFY_ERROR_ETH_PACKETS_PUNTED;
+ }
+
+ em->counters[node_counter_base_index + counter] += 1;
+ em->counters[node_counter_base_index +
+ VCGN_CLASSIFY_ERROR_PACKETS_RECEIVED] += 1;
+
+ /* verify speculative enqueue, maybe switch current next frame */
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+ return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (vcgn_classify_node) = {
+ .function = vcgn_classify_node_fn,
+ .name = "vcgn-classify",
+ .vector_size = sizeof (u32),
+ .format_trace = format_swap_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(vcgn_classify_error_strings),
+ .error_strings = vcgn_classify_error_strings,
+
+ .n_next_nodes = VCGN_CLASSIFY_N_NEXT,
+
+ /* edit / add dispositions here */
+ .next_nodes = {
+ [VCGN_CLASSIFY_NEXT_IP4_INPUT] = "ip4-input",
+ [VCGN_CLASSIFY_NEXT_IP6_INPUT] = "ip6-input",
+ [VCGN_CLASSIFY_NEXT_MPLS_INPUT] = "mpls-gre-input",
+ [VCGN_CLASSIFY_NEXT_ETHERNET_INPUT] = "ethernet-input",
+ [VCGN_CLASSIFY_NEXT_UDP_INSIDE] = "vcgn-v4-udp-i2o",
+ [VCGN_CLASSIFY_NEXT_UDP_OUTSIDE] = "vcgn-v4-udp-o2i",
+ [VCGN_CLASSIFY_NEXT_TCP_INSIDE] = "vcgn-v4-tcp-i2o",
+ [VCGN_CLASSIFY_NEXT_TCP_OUTSIDE] = "vcgn-v4-tcp-o2i",
+ [VCGN_CLASSIFY_NEXT_ICMP_Q_INSIDE] = "vcgn-v4-icmp-q-i2o",
+ [VCGN_CLASSIFY_NEXT_ICMP_Q_OUTSIDE] = "vcgn-v4-icmp-q-o2i",
+ [VCGN_CLASSIFY_NEXT_ICMP_E_INSIDE] = "vcgn-v4-icmp-e-i2o",
+ [VCGN_CLASSIFY_NEXT_ICMP_E_OUTSIDE] = "vcgn-v4-icmp-e-o2i"
+ },
+};
+
+
+/* Init the classifier: set default interface indices and hook it into the rx path */
+
+clib_error_t *vcgn_classify_init (vlib_main_t *vm)
+{
+ vcgn_classify_main_t * mp = &vcgn_classify_main;
+
+ mp->vlib_main = vm;
+ mp->vnet_main = vnet_get_main();
+ mp->inside_sw_if_index = 1;
+ mp->outside_sw_if_index = 0;
+
+ dpdk_set_next_node (DPDK_RX_NEXT_IP4_INPUT, "vcgn-classify");
+
+ {
+ pg_node_t * pn;
+ pn = pg_get_node (vcgn_classify_node.index);
+ pn->unformat_edit = unformat_pg_ip4_header;
+ }
+ return 0;
+}
+
+VLIB_INIT_FUNCTION (vcgn_classify_init);
+
+/* Show command handlers */
+static clib_error_t *
+show_vcgn_stats_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ if (cnat_db_init_done) {
+ cnat_nat44_handle_show_stats(vm);
+ } else {
+ vlib_cli_output(vm, "vCGN is not configured !!\n");
+ }
+ return 0;
+}
+
+
+static clib_error_t *
+show_vcgn_config_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ cnat_nat44_handle_show_config(vm);
+ return 0;
+}
+
+static clib_error_t *
+show_vcgn_inside_translation_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ vnet_main_t * vnm = vnet_get_main();
+ */
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ spp_api_cnat_v4_show_inside_entry_req_t inside_req;
+ u8 *proto;
+ ip4_address_t inside_addr;
+ u32 start_port = 1;
+ u32 end_port = 65535;
+
+ inside_req.start_port = start_port;
+ inside_req.end_port = end_port;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "protocol %s", &proto)) {
+ if (!strncmp((char *) proto, "udp", 3)) {
+ inside_req.protocol = 1;
+ } else if (!strncmp((char *) proto, "tcp", 3)) {
+ inside_req.protocol = 2;
+ } else {
+ inside_req.protocol = 3;
+ }
+ } else if (unformat (input, "inside-addr %U",
+ unformat_ip4_address, &inside_addr)) {
+ inside_req.ipv4_addr = clib_net_to_host_u32(inside_addr.as_u32);
+ } else if (unformat(input, "start-port %u", &start_port)) {
+ inside_req.start_port = start_port;
+ } else if (unformat(input, "end-port %u", &end_port)) {
+ inside_req.end_port = end_port;
+ } else { break;}
+ }
+ inside_req.vrf_id = vcm->inside_sw_if_index;
+ inside_req.flags |= CNAT_TRANSLATION_ENTRY_DYNAMIC; /* as of now only dynamic */
+ inside_req.all_entries = 0; /* we can see it later */
+#if DEBUG
+ vlib_cli_output(vm, "proto %d, inside-addr 0x%x, start_port %u, "
+ "end_port %u, vrf 0x%x\n",
+ inside_req.protocol,
+ inside_req.ipv4_addr,
+ inside_req.start_port,
+ inside_req.end_port,
+ vcm->inside_sw_if_index);
+#endif
+ if (cnat_db_init_done) {
+ cnat_v4_show_inside_entry_req_t_handler(&inside_req, vm);
+ } else {
+ vlib_cli_output(vm, "vCGN is not configured !!\n");
+ }
+ return 0;
+}
+
+
+static clib_error_t *
+show_vcgn_outside_translation_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ void cnat_v4_show_outside_entry_req_t_handler
+ (spp_api_cnat_v4_show_outside_entry_req_t *mp, vlib_main_t *vm);
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ spp_api_cnat_v4_show_outside_entry_req_t outside_req;
+ u8 *proto;
+ ip4_address_t outside_addr;
+ u32 start_port = 1;
+ u32 end_port = 65535;
+
+ outside_req.start_port = start_port;
+ outside_req.end_port = end_port;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "protocol %s", &proto)) {
+ if (!strncmp((char *) proto, "udp", 3)) {
+ outside_req.protocol = 1;
+ } else if (!strncmp((char *) proto, "tcp", 3)) {
+ outside_req.protocol = 2;
+ } else {
+ outside_req.protocol = 3;
+ }
+ } else if (unformat (input, "outside-addr %U",
+ unformat_ip4_address, &outside_addr)) {
+ outside_req.ipv4_addr = clib_net_to_host_u32(outside_addr.as_u32);
+ } else if (unformat(input, "start-port %u", &start_port)) {
+ outside_req.start_port = start_port;
+ } else if (unformat(input, "end-port %u", &end_port)) {
+ outside_req.end_port = end_port;
+ } else { break;}
+ }
+ outside_req.vrf_id = vcm->outside_sw_if_index;
+ outside_req.flags |= CNAT_TRANSLATION_ENTRY_DYNAMIC; /* as of now only dynamic */
+#if DEBUG
+ vlib_cli_output(vm, "proto %d, outside-addr 0x%x, start_port %u, "
+ "end_port %u, vrf 0x%x\n",
+ outside_req.protocol,
+ outside_req.ipv4_addr,
+ outside_req.start_port,
+ outside_req.end_port,
+ vcm->outside_sw_if_index);
+#endif
+ if (cnat_db_init_done) {
+ cnat_v4_show_outside_entry_req_t_handler(&outside_req, vm);
+ } else {
+ vlib_cli_output(vm, "vCGN is not configured !!\n");
+ }
+ return 0;
+}
+
+
+/* Config command handlers */
+static clib_error_t *
+set_vcgn_inside_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vnet_main_t * vnm = vnet_get_main();
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ u32 inside_sw_if_index = 1;
+ u32 outside_sw_if_index = ~0;
+ void cnat_db_v2_init (void );
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "%U",
+ unformat_vnet_sw_interface, vnm, &inside_sw_if_index))
+ ;
+ else if (unformat(input, "outside %U",
+ unformat_vnet_sw_interface, vnm, &outside_sw_if_index))
+ ;
+ else break;
+ }
+ if (inside_sw_if_index == ~0 ||
+ outside_sw_if_index == ~0)
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ if (inside_sw_if_index == outside_sw_if_index)
+ return clib_error_return (0, "inside and outside interfaces can't be the same...");
+
+ vcm->inside_sw_if_index = inside_sw_if_index;
+ vcm->outside_sw_if_index = outside_sw_if_index;
+
+ cnat_db_v2_init();
+
+ /* Turn on the db scanner process */
+ cnat_scanner_db_process_turn_on(vm);
+ return 0;
+}
+
+static clib_error_t *
+set_vcgn_map_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ ip4_address_t lo, hi;
+ spp_api_cnat_v4_add_vrf_map_t map;
+ int i;
+
+ vnet_hw_interface_t *inside_hw_if_index = NULL;
+ vnet_hw_interface_t *outside_hw_if_index = NULL;
+
+ if (!unformat (input, "%U", unformat_ip4_address, &lo))
+ return clib_error_return (0, "unknown input `%U'",
+ format_unformat_error, input);
+
+ if (unformat (input, "- %U", unformat_ip4_address, &hi))
+ ;
+
+ /* $$$$ remember to set i_vrf, i_vrf_id as needed */
+
+ /* Fill the structure spp_api_cnat_v4_add_vrf_map_t & let this API handle it */
+ /* i_vrf_id & o_vrf_id are 32-bit & i_vrf, o_vrf are 16 bit */
+ map.i_vrf_id = vcm->inside_sw_if_index;
+ map.o_vrf_id = vcm->outside_sw_if_index;
+ map.i_vrf = vcm->inside_sw_if_index;
+ map.o_vrf = vcm->outside_sw_if_index;
+
+ map.start_addr[0] = clib_net_to_host_u32(lo.as_u32);
+ map.end_addr[0] = clib_net_to_host_u32(hi.as_u32);
+
+ for (i = 0; i < CNAT_MAX_VRFMAP_ENTRIES; i++) {
+ vrf_map_array[i] = VRF_MAP_ENTRY_EMPTY;
+ }
+ cnat_nat44_add_vrf_map_t_handler(&map, vm);
+
+#if 1
+ inside_hw_if_index = vnet_get_sup_hw_interface(vcm->vnet_main, vcm->inside_sw_if_index);
+ if (inside_hw_if_index) {
+ vnet_hw_interface_rx_redirect_to_node(vcm->vnet_main,
+ inside_hw_if_index->hw_if_index, vcgn_classify_node.index);
+ }
+ outside_hw_if_index = vnet_get_sup_hw_interface(vcm->vnet_main, vcm->outside_sw_if_index);
+ if (outside_hw_if_index) {
+ vnet_hw_interface_rx_redirect_to_node(vcm->vnet_main,
+ outside_hw_if_index->hw_if_index, vcgn_classify_node.index);
+ }
+#endif
+ return 0;
+}
+
+static clib_error_t *
+set_vcgn_tcp_timeout_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ vnet_main_t * vnm = vnet_get_main();
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ */
+ u32 act_timeout = 0;
+ u32 init_timeout = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "active %u", &act_timeout))
+ tcp_active_timeout = act_timeout;
+ else if (unformat(input, "init %u", &init_timeout))
+ tcp_initial_setup_timeout = init_timeout;
+ else break;
+ }
+ return 0;
+}
+
+static clib_error_t *
+set_vcgn_udp_timeout_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ vnet_main_t * vnm = vnet_get_main();
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ */
+ u32 act_timeout = 0;
+ u32 init_timeout = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "active %u", &act_timeout))
+ udp_act_session_timeout = act_timeout;
+ else if (unformat(input, "init %u", &init_timeout))
+ udp_init_session_timeout = init_timeout;
+ else break;
+ }
+ return 0;
+}
+
+
+static clib_error_t *
+set_vcgn_icmp_timeout_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ * vnet_main_t * vnm = vnet_get_main();
+ * vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ */
+ u32 timeout = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "%u", &timeout))
+ ;
+ else break;
+ }
+ icmp_session_timeout = timeout;
+ return 0;
+}
+
+
+static clib_error_t *
+set_vcgn_protocol_default_timeout_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ vnet_main_t * vnm = vnet_get_main();
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ */
+ u8 *protocol;
+ u8 reset = 1;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "%s", &protocol))
+ ;
+ else break;
+ }
+ cnat_nat44_set_protocol_timeout_value(0, 0, protocol, reset, vm);
+ return 0;
+}
+
+static clib_error_t *
+set_vcgn_dynamic_port_start_range_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ vnet_main_t * vnm = vnet_get_main();
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ */
+ u32 port = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "%u", &port))
+ ;
+ else break;
+ }
+    if (port > 65535) {
+ vlib_cli_output(vm, "Error !! Invalid port\n");
+ } else {
+ cnat_static_port_range = port;
+ vlib_cli_output(vm, "Dynamic Port Range Config Successful !!\n");
+ }
+ return 0;
+}
+
+static clib_error_t *
+set_vcgn_port_limit_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ /*
+ vnet_main_t * vnm = vnet_get_main();
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ */
+ u32 port = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat(input, "%u", &port))
+ ;
+ else break;
+ }
+    if (port > 65535) {
+ vlib_cli_output(vm, "Error !! Invalid port\n");
+ } else {
+ cnat_main_db_max_ports_per_user = port;
+ vlib_cli_output(vm, "Port Limit Config Successful !!\n");
+ }
+ return 0;
+}
+
+static inline void nfv9_init_pkt_sent_data(cnat_nfv9_logging_info_t *nfv9_info)
+{
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ nfv9_info->server_index;
+
+ /*
+ * Reset the pkts_since_last_template and sent_time
+ * so that template will be sent next time
+ */
+ server->last_template_sent_time = 0;
+ server->pkts_since_last_template = 0xffffffff;
+}
+
+static inline u16 nfv9_get_max_length_minus_max_record_size(u16 path_mtu)
+{
+ u16 max_length_minus_max_record_size;
+ if(!path_mtu) /* Use default */
+ path_mtu = NFV9_DEF_PATH_MTU;
+
+ max_length_minus_max_record_size = path_mtu -
+ CNAT_NFV9_DATAFLOW_RECORD_HEADER_LENGTH -
+ NFV9_PAD_VALUE -
+ CNAT_NFV9_MAX_SINGLE_RECORD_LENGTH; /* Note.. as of now this record
+ * requires max number of bytes. If you add more records,
+ * this needs to be re-checked */
+ if (max_length_minus_max_record_size < CNAT_NFV9_MIN_RECORD_SIZE) {
+ printf(
+ "Resetting max_length_minus_max_record_size from %d to %ld\n",
+ max_length_minus_max_record_size,
+ CNAT_NFV9_MIN_RECORD_SIZE);
+
+ max_length_minus_max_record_size = CNAT_NFV9_MIN_RECORD_SIZE;
+ }
+ return max_length_minus_max_record_size;
+}
+
+/* This function finds whether the netflow server indicated by
+ * new_server_info is already configured for some other instance.
+ * If yes, it returns the same pointer so that the info sent to the
+ * server stays consistent. If the server is not found, a new instance
+ * is created and returned. If an existing server is reused, its reference
+ * count is incremented (indicating the number of instances using the
+ * same server).
+ */
+ /* #define DEBUG_NF_SERVER_CONFIG 1 */
+static u16 nfv9_get_server_instance(
+ cnat_nfv9_logging_info_t *nfv9_info, nfv9_server_info_t *new_server_info)
+{
+
+ /* Check if the instance has a server already and if yes, does it match */
+ nfv9_server_info_t *server;
+ if(nfv9_info->server_index != EMPTY) {
+ server = nfv9_server_info_pool + nfv9_info->server_index;
+
+ if((server->ipv4_address == new_server_info->ipv4_address) &&
+ (server->port == new_server_info->port)) {
+ /* Same server.. just check if refresh rate/timeouts are reduced */
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ printf("\n Server match for %x and port %d\n",
+ new_server_info->ipv4_address, new_server_info->port);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ goto adjust_refresh_rate;
+ } else { /* The server is being changed */
+ server->ref_count--;
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ printf("\n Server change from %x, %d to %x, %d"
+ "Ref count %d\n",
+ server->ipv4_address,
+ server->port,
+ new_server_info->ipv4_address, new_server_info->port,
+ server->ref_count);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ if(!server->ref_count) {
+ /* Return this server to pool */
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ PLATFORM_DEBUG_PRINT("Deleting Server %x, %d at %d\n",
+ server->ipv4_address,
+ server->port,
+ nfv9_info->server_index);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ pool_put(nfv9_server_info_pool, server);
+ }
+ }
+ }
+
+ /* Now check if the server is already present in the pool */
+ u8 found = 0;
+ server = 0;
+ pool_foreach (server, nfv9_server_info_pool, ({
+ if ((server->ipv4_address == new_server_info->ipv4_address) &&
+ (server->port == new_server_info->port)) {
+ server->ref_count++;
+ nfv9_info->server_index = server - nfv9_server_info_pool;
+ found = 1;
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ printf("Re-using server %x, %d Ref count %d\n",
+ server->ipv4_address, server->port, server->ref_count);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ break;
+ }
+ }));
+
+ if(!found) {
+ /* Create a new one, initialize and return */
+ server = 0;
+ pool_get(nfv9_server_info_pool, server);
+ memcpy(server, new_server_info, sizeof(nfv9_server_info_t));
+ server->ref_count = 1;
+ nfv9_info->server_index = server - nfv9_server_info_pool;
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ printf("Create new server for at %d %x and port %d\n",
+ nfv9_info->server_index,
+ new_server_info->ipv4_address, new_server_info->port);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ return CNAT_SUCCESS;
+ }
+
+adjust_refresh_rate:
+ if(server->refresh_rate >
+ new_server_info->refresh_rate) {
+ server->refresh_rate =
+ new_server_info->refresh_rate;
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ printf("Reset refresh rate to %d\n",
+ server->refresh_rate);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ }
+
+ if(server->timeout_rate >
+ new_server_info->timeout_rate) {
+ server->timeout_rate =
+ new_server_info->timeout_rate;
+#ifdef DEBUG_NF_SERVER_CONFIG
+ if(my_instance_number == 1) {
+ printf("Reset timeout rate to %d\n",
+ server->timeout_rate);
+ }
+#endif /* #ifdef DEBUG_NF_SERVER_CONFIG */
+ }
+
+ return CNAT_SUCCESS;
+}
+static clib_error_t *
+set_vcgn_nfv9_logging_config_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vcgn_classify_main_t * vcm = &vcgn_classify_main;
+ spp_api_cnat_v4_config_nfv9_logging_t nfv9_conf;
+ ip4_address_t server_addr;
+ u32 ip_addr = 0;
+ u32 port;
+ u32 refresh_rate = 0;
+ u32 timeout = 0;
+ u32 pmtu = 0;
+ u8 enable = 1;
+/* vcgn changes start*/
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info = NULL;
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info_tmp = NULL;
+ cnat_vrfmap_t *my_vrfmap = 0, *my_vrfmap_temp;
+ u16 i_vrf;
+ u32 i_vrf_id;
+ u8 found;
+ u8 found_vrf;
+ /*
+ * Init NFv9 logging info as needed, this will be done only once
+ */
+ cnat_nfv9_logging_init();
+
+ i_vrf = vcm->inside_sw_if_index;
+ i_vrf_id = vcm->inside_sw_if_index;
+
+ found = 0;
+
+/* vcgn changes end*/
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (input, "server %U", unformat_ip4_address, &server_addr))
+ ip_addr = clib_net_to_host_u32(server_addr.as_u32);
+ else if (unformat(input, "port %u", &port))
+ ;
+ else if (unformat(input, "refresh-rate %u", &refresh_rate))
+ ;
+ else if (unformat(input, "timeout %u", &timeout))
+ ;
+ else if (unformat(input, "pmtu %u", &pmtu))
+ ;
+ else if (unformat(input, "del"))
+ enable = 0;
+ else break;
+ }
+
+
+ #if 0
+ vlib_cli_output(vm, "ip 0x%x, port %u, refresh %u, "
+ "timeout %u, pmtu %u enable %u\n",
+ ip_addr, port, refresh_rate,
+ timeout, pmtu, enable);
+ #endif
+ nfv9_conf.enable = enable;
+ nfv9_conf.ipv4_address = ip_addr;
+ nfv9_conf.i_vrf_id = vcm->inside_sw_if_index;
+ nfv9_conf.i_vrf = vcm->inside_sw_if_index;
+ nfv9_conf.port = port;
+ nfv9_conf.refresh_rate = refresh_rate;
+ nfv9_conf.timeout_rate = timeout;
+ nfv9_conf.path_mtu = pmtu;
+ nfv9_conf.nfv9_global_collector = 0;
+ nfv9_conf.session_logging = 0;
+
+ /*
+ * At this point the NFv9 global information should already be
+ * inited as we have called cnat_nfv9_logging_init()
+ */
+
+ if (nfv9_conf.nfv9_global_collector) {
+ if (cnat_nfv9_global_info.cnat_nfv9_global_collector_index != EMPTY) {
+ found = 1;
+ my_nfv9_logging_info = cnat_nfv9_logging_info_pool +
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index;
+ }
+ } else {
+ /* Do we already have a map for this VRF? */
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ if (my_nfv9_logging_info->i_vrf_id == i_vrf_id) {
+ found_vrf = 1;
+ printf("found_vrf %d\n", found_vrf);
+ nfv9_server_info_t *server = nfv9_server_info_pool +
+ my_nfv9_logging_info->server_index;
+ printf("server ip4 0x%x port %d\n", server->ipv4_address, server->port);
+ printf("nfv9_conf v4 0x%x port %d\n", nfv9_conf.ipv4_address, nfv9_conf.port);
+ if((server->ipv4_address == (nfv9_conf.ipv4_address)) && (server->port == (nfv9_conf.port))) {
+ found = 1;
+ my_nfv9_logging_info_tmp = my_nfv9_logging_info;
+ printf("found %d\n", found);
+ break;
+ }
+ }
+ }));
+ }
+
+ if ((nfv9_conf.ipv4_address == 0) ||
+ (nfv9_conf.port == 0)) {
+ vlib_cli_output(vm,
+ "Add NFv9 ivrf %d Logging Invalid values [IPv4 0x%x, PORT %d]\n",
+ i_vrf,
+ (nfv9_conf.ipv4_address),
+ (nfv9_conf.port));
+ goto done;
+ }
+
+ if (nfv9_conf.enable) {
+ if ((nfv9_conf.ipv4_address == 0) ||
+ (nfv9_conf.port == 0)) {
+ nfv9_conf.rc = CNAT_ERR_PARSER;
+ vlib_cli_output(vm,
+ "NFV9_logging i_vrf %d, Invalid [v4_addr 0x%x port %d]\n",
+ i_vrf,
+ (nfv9_conf.ipv4_address),
+ (nfv9_conf.port));
+ goto done;
+ }
+
+ nfv9_server_info_t new_server_info;
+ memset(&new_server_info, 0, sizeof(nfv9_server_info_t));
+ new_server_info.ipv4_address =
+ nfv9_conf.ipv4_address;
+ new_server_info.port =
+ (nfv9_conf.port);
+ new_server_info.refresh_rate =
+ (nfv9_conf.refresh_rate);
+ /*
+ * Store the timeout in seconds. User configures it in minutes
+ */
+ new_server_info.timeout_rate =
+ 60*(nfv9_conf.timeout_rate);
+ if (found && my_nfv9_logging_info) {
+ /*
+ * Entry already present, change it
+ */
+ my_nfv9_logging_info->max_length_minus_max_record_size =
+ nfv9_get_max_length_minus_max_record_size(
+ ((nfv9_conf.path_mtu)));
+ } else {
+ pool_get(cnat_nfv9_logging_info_pool, my_nfv9_logging_info);
+ memset(my_nfv9_logging_info, 0, sizeof(*my_nfv9_logging_info));
+ my_nfv9_logging_info->server_index = EMPTY;
+ my_nfv9_logging_info->nfv9_logging_next_index = EMPTY;
+ /*
+             * Clear the current and queued logging contexts.
+             * They get set correctly when logging first happens.
+ */
+ my_nfv9_logging_info->current_logging_context = NULL;
+ my_nfv9_logging_info->queued_logging_context = NULL;
+#if 0
+ my_nfv9_logging_info->f = NULL;
+ my_nfv9_logging_info->to_next = NULL;
+ output_node = vlib_get_node_by_name (vm, (u8 *) "ip4-input");
+ my_nfv9_logging_info->ip4_input_node_index = output_node->index;
+ printf("ip4_input_node_index %d\n", my_nfv9_logging_info->ip4_input_node_index);
+#endif
+ my_nfv9_logging_info->i_vrf = i_vrf;
+ my_nfv9_logging_info->i_vrf_id = i_vrf_id;
+ my_nfv9_logging_info->max_length_minus_max_record_size =
+ nfv9_get_max_length_minus_max_record_size(
+ nfv9_conf.path_mtu);
+
+ /* my_nfv9_logging_info will have a copy of logging_policy
+             * because it is quite possible that the nfv9 config arrives before
+ * the corresponding vrfmap is initialized. In such cases
+ * this copy will be used to update the vrfmap entry
+ */
+ my_nfv9_logging_info->logging_policy = nfv9_conf.session_logging;
+
+ if (nfv9_conf.nfv9_global_collector) {
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index =
+ my_nfv9_logging_info - cnat_nfv9_logging_info_pool;
+
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if (my_vrfmap->nfv9_logging_index == EMPTY) {
+ my_vrfmap->nfv9_logging_index =
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index;
+ }
+ }));
+ } else {
+ u32 my_vrfmap_found = 0;
+
+ FIND_MY_VRF_USING_I_VRF_ID
+ my_vrfmap = my_vrfmap_temp;
+ if (my_vrfmap_found) {
+ if(my_vrfmap->nfv9_logging_index == EMPTY) {
+ my_vrfmap->nfv9_logging_index =
+ my_nfv9_logging_info - cnat_nfv9_logging_info_pool;
+ // my_vrfmap->nf_logging_policy = mp->session_logging;
+ } else {
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info_temp = cnat_nfv9_logging_info_pool + my_vrfmap->nfv9_logging_index;
+ while(my_nfv9_logging_info_temp->nfv9_logging_next_index != EMPTY){
+ my_nfv9_logging_info_temp = cnat_nfv9_logging_info_pool + my_nfv9_logging_info_temp->nfv9_logging_next_index;
+ }
+ my_nfv9_logging_info_temp->nfv9_logging_next_index = my_nfv9_logging_info - cnat_nfv9_logging_info_pool;
+ }
+ }
+ }
+ }
+
+ /* Update logging policy */
+ my_nfv9_logging_info->logging_policy = nfv9_conf.session_logging;
+ if (nfv9_conf.nfv9_global_collector) {
+ if(PLATFORM_DBL_SUPPORT) {
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if (my_vrfmap->nfv9_logging_index ==
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index) {
+ my_vrfmap->nf_logging_policy = nfv9_conf.session_logging;
+ }
+ }));
+ } else {
+ nfv9_conf.rc = CNAT_ERR_NO_SESSION_DB;
+ }
+ } else {
+ if(PLATFORM_DBL_SUPPORT) {
+ u32 my_vrfmap_found = 0;
+ my_vrfmap_temp = NULL;
+ FIND_MY_VRF_USING_I_VRF_ID
+ my_vrfmap = my_vrfmap_temp;
+ if (my_vrfmap_found) {
+ // my_vrfmap->nf_logging_policy = mp->session_logging;
+ }
+ } else {
+ nfv9_conf.rc = CNAT_ERR_NO_SESSION_DB;
+ }
+ }
+ u8 nfv9_logging_policy = 0;
+ u32 my_vrfmap_found = 0;
+ my_vrfmap_temp = NULL;
+ FIND_MY_VRF_USING_I_VRF_ID
+ my_vrfmap = my_vrfmap_temp;
+ if (my_vrfmap_found) {
+ u32 index_curr = my_vrfmap->nfv9_logging_index;
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info_temp;
+ while(index_curr != EMPTY) {
+ my_nfv9_logging_info_temp = cnat_nfv9_logging_info_pool + index_curr;
+ nfv9_logging_policy = nfv9_logging_policy || my_nfv9_logging_info_temp->logging_policy;
+ index_curr = (cnat_nfv9_logging_info_pool + index_curr)->nfv9_logging_next_index;
+ }
+ my_vrfmap->nf_logging_policy = nfv9_logging_policy;
+ }
+ //vlib_cli_output(vm,"Netflow logging policy = %d\n", my_vrfmap->nf_logging_policy);
+ if(nfv9_get_server_instance(my_nfv9_logging_info, &new_server_info)
+ != CNAT_SUCCESS) {
+ vlib_cli_output(vm, "Error to get server instance");
+ nfv9_conf.rc = CNAT_ERR_PARSER;
+ goto done;
+ }
+ nfv9_init_pkt_sent_data(my_nfv9_logging_info);
+
+ vlib_cli_output(vm,"Adding NFv9 Logging Succeeded\n");
+ nfv9_configured = 1;
+
+ } else {
+ /*Delete path*/
+ if (found) {
+        /* An entry was found, so point my_nfv9_logging_info
+         * at the matched entry (my_nfv9_logging_info_tmp)
+         */
+ my_nfv9_logging_info = my_nfv9_logging_info_tmp;
+ if (i_vrf == INVALID_UIDX) {
+ /*
+ * We are deleting a global collector. Mark the collectors
+ * in those VRFs using the global collector
+ */
+ pool_foreach (my_vrfmap, cnat_map_by_vrf, ({
+ if (my_vrfmap->nfv9_logging_index ==
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index) {
+ my_vrfmap->nfv9_logging_index = EMPTY;
+ }
+ }));
+
+ cnat_nfv9_global_info.cnat_nfv9_global_collector_index = EMPTY;
+ } else {
+ u32 my_vrfmap_found = 0;
+ my_vrfmap_temp = NULL;
+ FIND_MY_VRF_USING_I_VRF_ID
+ my_vrfmap = my_vrfmap_temp;
+ if (my_vrfmap_found) {
+ // my_vrfmap->nfv9_logging_index = cnat_nfv9_global_info.cnat_nfv9_global_collector_index;
+ }
+ }
+ if (my_nfv9_logging_info->queued_logging_context ||
+ my_nfv9_logging_info->current_logging_context) {
+ /*
+ * If there is a pending context:
+ * Set the deleted flag to 1. This will ensure
+ * that the logging info structure gets freed after any
+ * pending packet get sent
+ */
+ my_nfv9_logging_info->deleted = 1;
+ } else {
+ /*
+ * No pending context, just free the logging info structure
+ */
+ u32 index = my_nfv9_logging_info - cnat_nfv9_logging_info_pool;
+ if(index == my_vrfmap->nfv9_logging_index) {
+                /* Deleting the first server */
+ my_vrfmap->nfv9_logging_index = my_nfv9_logging_info->nfv9_logging_next_index;
+ /* if(my_nfv9_logging_info->nfv9_logging_next_index != EMPTY){
+ my_vrfmap->nf_logging_policy = (cnat_nfv9_logging_info_pool + my_nfv9_logging_info->nfv9_logging_next_index)->logging_policy;
+ } else {
+ my_vrfmap->nf_logging_policy = EMPTY;
+ }*/
+ } else {
+ u32 index_curr = my_vrfmap->nfv9_logging_index;
+ u32 index_prev = EMPTY;
+ while(index_curr != EMPTY) {
+ index_prev = index_curr;
+ index_curr = (cnat_nfv9_logging_info_pool + index_curr)->nfv9_logging_next_index;
+ if(index == index_curr)
+ {
+ (cnat_nfv9_logging_info_pool + index_prev)->nfv9_logging_next_index = (cnat_nfv9_logging_info_pool + index_curr)->nfv9_logging_next_index;
+ break;
+ }
+ }
+ }
+ nfv9_delete_server_info(my_nfv9_logging_info);
+ pool_put(cnat_nfv9_logging_info_pool, my_nfv9_logging_info);
+ }
+
+ vlib_cli_output(vm, "Deleting NFv9 Logging Succeeded\n");
+ /*
+ * Search across all vrf and check if nfv9 logging is configured.
+ */
+ nfv9_configured = 0;
+ pool_foreach (my_nfv9_logging_info, cnat_nfv9_logging_info_pool, ({
+ nfv9_configured = 1;
+ break;
+ }));
+ } else {
+ nfv9_conf.rc = CNAT_NO_CONFIG;
+ vlib_cli_output(vm, "Add NFv9 Logging Failed (2) Non Existent vrf %d\n",
+ i_vrf);
+
+ }
+ u8 nfv9_logging_policy = 0;
+ u32 my_vrfmap_found = 0;
+ my_vrfmap_temp = NULL;
+ FIND_MY_VRF_USING_I_VRF_ID
+ my_vrfmap = my_vrfmap_temp;
+ if (my_vrfmap_found) {
+ u32 index_curr = my_vrfmap->nfv9_logging_index;
+ cnat_nfv9_logging_info_t *my_nfv9_logging_info_temp;
+ while(index_curr != EMPTY) {
+ my_nfv9_logging_info_temp = cnat_nfv9_logging_info_pool + index_curr;
+ nfv9_logging_policy = nfv9_logging_policy || my_nfv9_logging_info_temp->logging_policy;
+ index_curr = (cnat_nfv9_logging_info_pool + index_curr)->nfv9_logging_next_index;
+ }
+ my_vrfmap->nf_logging_policy = nfv9_logging_policy;
+            printf("After deleting the netflow server, netflow logging policy = %d\n",
+                   my_vrfmap->nf_logging_policy);
+        }
+ }
+
+done:
+ return 0;
+}
+
+/* config CLIs */
+VLIB_CLI_COMMAND (set_vcgn_map_command) = {
+ .path = "set vcgn map",
+ .short_help = "set vcgn map <lo-address> [- <hi-address>]",
+ .function = set_vcgn_map_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_inside_command) = {
+ .path = "set vcgn inside",
+ .short_help = "set vcgn inside <inside intfc> outside <outside intfc>",
+ .function = set_vcgn_inside_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_tcp_timeout_command) = {
+ .path = "set vcgn tcp timeout",
+ .short_help = "set vcgn tcp timeout active <1-65535> init <1-65535>",
+ .function = set_vcgn_tcp_timeout_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_udp_timeout_command) = {
+ .path = "set vcgn udp timeout",
+ .short_help = "set vcgn udp timeout active <1-65535> init <1-65535>",
+ .function = set_vcgn_udp_timeout_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_icmp_timeout_command) = {
+ .path = "set vcgn icmp timeout",
+ .short_help = "set vcgn icmp timeout <1-65535>",
+ .function = set_vcgn_icmp_timeout_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_protocol_default_timeout_command) = {
+ .path = "set vcgn default timeout",
+ .short_help = "set vcgn default timeout protocol <tcp/udp/icmp>",
+ .function = set_vcgn_protocol_default_timeout_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_dynamic_port_start_range_command) = {
+ .path = "set vcgn dynamic port start",
+ .short_help = "set vcgn dynamic port start <1-65535>",
+ .function = set_vcgn_dynamic_port_start_range_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_port_limit_command) = {
+ .path = "set vcgn port limit",
+ .short_help = "set vcgn port limit <1-65535>",
+ .function = set_vcgn_port_limit_command_fn,
+};
+
+VLIB_CLI_COMMAND (set_vcgn_nfv9_logging_config_command) = {
+    .path = "set vcgn nfv9",
+    .short_help = "set vcgn nfv9 [del] server <ip-addr> port <port> [refresh-rate <n>] [timeout <n>] [pmtu <n>]",
+    .function = set_vcgn_nfv9_logging_config_command_fn,
+};
+
+
+/* show CLIs */
+VLIB_CLI_COMMAND (show_vcgn_config_command) = {
+ .path = "show vcgn config",
+ .short_help = "show vcgn config",
+ .function = show_vcgn_config_command_fn,
+};
+
+VLIB_CLI_COMMAND (show_vcgn_stat_command) = {
+ .path = "show vcgn statistics",
+ .short_help = "show vcgn statistics",
+ .function = show_vcgn_stats_command_fn,
+};
+
+VLIB_CLI_COMMAND (show_vcgn_inside_translation_command) = {
+ .path = "show vcgn inside-translation",
+ .short_help = "show vcgn inside-translation protocol <tcp/udp/icmp> "
+ "inside-addr <ip-addr> [start-port <n>] [end-port <n>]",
+ .function = show_vcgn_inside_translation_command_fn,
+};
+
+VLIB_CLI_COMMAND (show_vcgn_outside_translation_command) = {
+ .path = "show vcgn outside-translation",
+ .short_help = "show vcgn outside-translation protocol <tcp/udp/icmp> "
+ "outside-addr <ip-addr> [start-port <n>] [end-port <n>]",
+ .function = show_vcgn_outside_translation_command_fn,
+};
+
+static clib_error_t *
+vcgn_init (vlib_main_t * vm)
+{
+ clib_error_t * error = 0;
+
+ if ((error = vlib_call_init_function
+ (vm, vcgn_classify_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_udp_inside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_udp_outside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_udp_inside_input_exc_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_db_scanner_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_tcp_inside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_tcp_inside_input_exc_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_tcp_outside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_icmp_q_inside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_icmp_q_inside_input_exc_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_icmp_q_outside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_icmp_e_inside_input_init)))
+ return error;
+ if ((error = vlib_call_init_function
+ (vm, cnat_ipv4_icmp_e_outside_input_init)))
+ return error;
+
+ return error;
+}
+
+VLIB_INIT_FUNCTION (vcgn_init);
diff --git a/vnet/vnet/vcgn/vcgn_db.h b/vnet/vnet/vcgn/vcgn_db.h
new file mode 100644
index 00000000000..cd7d835cba1
--- /dev/null
+++ b/vnet/vnet/vcgn/vcgn_db.h
@@ -0,0 +1,117 @@
+/*
+ *------------------------------------------------------------------
+ * vcgn_db.h - translation database definitions
+ *
+ * Copyright (c) 2007-2014 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __VCGN_DB_H__
+#define __VCGN_DB_H__
+
+#include "index_list.h"
+
+/*
+ * The key structure. All fields are in NETWORK byte order!
+ */
+typedef struct {
+ u32 ipv4;
+ u16 port;
+ u16 vrf; //bit0-12:vrf, bit13:unused, bit14-15:protocol
+} cnat_db_key_t;
+
+/* bit14-15:protocol in cnat_db_key_t */
+#define CNAT_INVALID_PROTO 0x0000
+#define CNAT_UDP 0x4000
+#define CNAT_TCP 0x8000
+#define CNAT_ICMP 0xc000
+#define CNAT_VRF_MASK 0x3fff
+#define CNAT_PRO_MASK 0xc000
+#define CNAT_PRO_SHIFT 14
+
+/*
+ * Maximum number of VRF entries supported
+ */
+#define CNAT_MAX_VRFMAP_ENTRIES (CNAT_VRF_MASK + 1)
+/*
+ * for hashing purposes, fetch the key in one instr.
+ */
+typedef union {
+ cnat_db_key_t k;
+ u64 key64;
+} cnat_key_t;
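+
+/*
+ * Illustrative sketch (not part of the original header): building a
+ * 64-bit lookup key with the protocol encoded in the top two bits of the
+ * vrf field, per the masks above. Address and port are expected in
+ * network byte order, matching the key structure comment. The helper
+ * name is hypothetical.
+ */
+static inline u64 cnat_make_key64 (u32 ipv4_net, u16 port_net,
+                                   u16 vrf, u16 proto_bits)
+{
+    cnat_key_t key;
+    key.k.ipv4 = ipv4_net;
+    key.k.port = port_net;
+    key.k.vrf  = (u16)((vrf & CNAT_VRF_MASK) | proto_bits); /* e.g. CNAT_TCP */
+    return key.key64;
+}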
+
+/*
+ * Main translation database entries. Currently 0x50 = 80 bytes in length.
+ * Given 20,000,000 entries, it saves nearly 1gb of SDRAM to pack the entries
+ * and pay the extra prefetch. So, that's what we do.
+ */
+
+typedef struct {
+ /* 0x00 */
+ index_slist_t out2in_hash; /* hash-and-chain, x2 */
+ index_slist_t in2out_hash;
+
+ /* 0x08 */
+ cnat_key_t out2in_key; /* network-to-user, outside-to-inside key */
+
+ /* 0x10 */
+ cnat_key_t in2out_key; /* user-to-network, inside-to-outside key */
+
+ /* 0x18 */
+ index_dlist_t user_ports; /* per-user translation list */
+
+ /* 0x20 */
+ u32 user_index; /* index of user that owns this entry */
+
+ /* 0x24 */
+ u16 vrfmap_index; /* index of vrfmap */
+
+ /* 0x26 */
+ u16 flags; /* Always need flags... */
+#define CNAT_DB_FLAG_PORT_PAIR (1<<0)
+#define CNAT_DB_FLAG_TCP_ACTIVE (1<<1)
+#define CNAT_DB_FLAG_ENTRY_FREE (1<<2)
+#define CNAT_DB_FLAG_UDP_ACTIVE (1<<3)
+#define CNAT_DB_FLAG_STATIC_PORT (1<<4)
+#define CNAT_DB_FLAG_ALG_ENTRY (1<<5)
+
+ /* 0x28 */
+ u32 dst_ipv4; /* pointer to ipv4 dst list, used in evil mode */
+
+ /* 0x2C */
+ u32 out2in_pkts; /* pkt counters */
+
+ /* 0x30 */
+ u32 in2out_pkts;
+
+ /* 0x34 */
+ u32 entry_expires; /* timestamp used to expire translations */
+
+ /* 0x38 */
+ union { /* used by FTP ALG, pkt len delta due to FTP PORT cmd */
+ u16 delta;
+ i8 alg_dlt[2]; /* two delta values, 0 for previous, 1 for current */
+ u16 il; /* Used to indicate if interleaved mode is used
+ in case of RTSP ALG */
+ } alg;
+
+    /* 0x48 */
+ u32 tcp_seq_num; /* last tcp (FTP) seq # that has pkt len change due to PORT */
+
+ cnat_timeout_t destn_key;
+
+ /* 0x4C... last byte -- 72 total */
+} cnat_main_db_entry_t;
+#endif
diff --git a/vnet/vnet/vnet.h b/vnet/vnet/vnet.h
new file mode 100644
index 00000000000..2378c2420b8
--- /dev/null
+++ b/vnet/vnet/vnet.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * vnet.h: general networking definitions
+ *
+ * Copyright (c) 2011 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vnet_vnet_h
+#define included_vnet_vnet_h
+
+#include <vppinfra/types.h>
+
+typedef enum {
+ VNET_UNICAST,
+ VNET_MULTICAST,
+ VNET_N_CAST,
+} vnet_cast_t;
+
+#include <vnet/unix/pcap.h>
+#include <vnet/buffer.h>
+#include <vnet/config.h>
+#include <vnet/interface.h>
+#include <vnet/rewrite.h>
+#include <vnet/api_errno.h>
+
+typedef struct vnet_main_t {
+ u32 local_interface_hw_if_index;
+ u32 local_interface_sw_if_index;
+
+ vnet_interface_main_t interface_main;
+
+ /* set up by constructors */
+ vnet_device_class_t * device_class_registrations;
+ vnet_hw_interface_class_t * hw_interface_class_registrations;
+ _vnet_interface_function_list_elt_t * hw_interface_add_del_functions;
+ _vnet_interface_function_list_elt_t * hw_interface_link_up_down_functions;
+ _vnet_interface_function_list_elt_t * sw_interface_add_del_functions;
+ _vnet_interface_function_list_elt_t * sw_interface_admin_up_down_functions;
+
+ /*
+ * Last "api" error, preserved so we can issue reasonable diagnostics
+ * at or near the top of the food chain
+ */
+ vnet_api_error_t api_errno;
+
+ vlib_main_t * vlib_main;
+} vnet_main_t;
+
+vnet_main_t vnet_main;
+vnet_main_t **vnet_mains;
+
+#include <vnet/interface_funcs.h>
+#include <vnet/global_funcs.h>
+
+#if DPDK > 0
+#include <vnet/devices/dpdk/threads.h>
+#include <vnet/dpdk_replication.h>
+#endif
+
+#endif /* included_vnet_vnet_h */
diff --git a/vnet/vnet/vxlan/decap.c b/vnet/vnet/vxlan/decap.c
new file mode 100644
index 00000000000..7789bed9310
--- /dev/null
+++ b/vnet/vnet/vxlan/decap.c
@@ -0,0 +1,429 @@
+/*
+ * decap.c: vxlan tunnel decap packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/vxlan/vxlan.h>
+
+vlib_node_registration_t vxlan_input_node;
+
+typedef struct {
+ u32 next_index;
+ u32 tunnel_index;
+ u32 error;
+ u32 vni;
+} vxlan_rx_trace_t;
+
+static u8 * format_vxlan_rx_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ vxlan_rx_trace_t * t = va_arg (*args, vxlan_rx_trace_t *);
+
+ if (t->tunnel_index != ~0)
+ {
+ s = format (s, "VXLAN: tunnel %d vni %d next %d error %d",
+ t->tunnel_index, t->vni, t->next_index, t->error);
+ }
+ else
+ {
+ s = format (s, "VXLAN: no tunnel for vni %d next %d error %d",
+ t->vni, t->next_index, t->error);
+ }
+ return s;
+}
+
+static uword
+vxlan_input (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ vxlan_main_t * vxm = &vxlan_main;
+ vnet_main_t * vnm = vxm->vnet_main;
+ vnet_interface_main_t * im = &vnm->interface_main;
+ u32 last_tunnel_index = ~0;
+ vxlan_tunnel_key_t last_key;
+ u32 pkts_decapsulated = 0;
+ u32 cpu_index = os_get_cpu_number();
+ u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
+
+ last_key.as_u64 = ~0;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+ stats_sw_if_index = node->runtime_data[0];
+ stats_n_packets = stats_n_bytes = 0;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 next0, next1;
+ ip4_header_t * ip0, * ip1;
+ vxlan_header_t * vxlan0, * vxlan1;
+ uword * p0, * p1;
+ u32 tunnel_index0, tunnel_index1;
+ vxlan_tunnel_t * t0, * t1;
+ vxlan_tunnel_key_t key0, key1;
+ u32 error0, error1;
+ u32 sw_if_index0, sw_if_index1, len0, len1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ /* udp leaves current_data pointing at the vxlan header */
+ vxlan0 = vlib_buffer_get_current (b0);
+ vxlan1 = vlib_buffer_get_current (b1);
+
+ vlib_buffer_advance
+ (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+ vlib_buffer_advance
+ (b1, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+ ip0 = vlib_buffer_get_current (b0);
+ ip1 = vlib_buffer_get_current (b1);
+
+ /* pop (ip, udp, vxlan) */
+ vlib_buffer_advance
+ (b0, sizeof(*ip0)+sizeof(udp_header_t)+sizeof(*vxlan0));
+ vlib_buffer_advance
+ (b1, sizeof(*ip1)+sizeof(udp_header_t)+sizeof(*vxlan1));
+
+ tunnel_index0 = ~0;
+ error0 = 0;
+
+ tunnel_index1 = ~0;
+ error1 = 0;
+
+ key0.src = ip0->src_address.as_u32;
+ key0.vni = vxlan0->vni_reserved;
+
+ if (PREDICT_FALSE (key0.as_u64 != last_key.as_u64))
+ {
+ p0 = hash_get (vxm->vxlan_tunnel_by_key, key0.as_u64);
+
+ if (p0 == 0)
+ {
+ error0 = VXLAN_ERROR_NO_SUCH_TUNNEL;
+ next0 = VXLAN_INPUT_NEXT_DROP;
+ goto trace0;
+ }
+
+ last_key.as_u64 = key0.as_u64;
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (vxm->tunnels, tunnel_index0);
+
+ next0 = t0->decap_next_index;
+ sw_if_index0 = t0->sw_if_index;
+ len0 = vlib_buffer_length_in_chain (vm, b0);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ /* Set input sw_if_index to VXLAN tunnel for learning */
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = sw_if_index0;
+
+ pkts_decapsulated ++;
+ stats_n_packets += 1;
+ stats_n_bytes += len0;
+
+ /* Batch stats increment on the same vxlan tunnel so counter
+ is not incremented per packet */
+ if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index))
+ {
+ stats_n_packets -= 1;
+ stats_n_bytes -= len0;
+ if (stats_n_packets)
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
+ cpu_index, stats_sw_if_index,
+ stats_n_packets, stats_n_bytes);
+ stats_n_packets = 1;
+ stats_n_bytes = len0;
+ stats_sw_if_index = sw_if_index0;
+ }
+
+ trace0:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vxlan_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->vni = vnet_get_vni (vxlan0);
+ }
+
+ key1.src = ip1->src_address.as_u32;
+ key1.vni = vxlan1->vni_reserved;
+
+ if (PREDICT_FALSE (key1.as_u64 != last_key.as_u64))
+ {
+ p1 = hash_get (vxm->vxlan_tunnel_by_key, key1.as_u64);
+
+ if (p1 == 0)
+ {
+ error1 = VXLAN_ERROR_NO_SUCH_TUNNEL;
+ next1 = VXLAN_INPUT_NEXT_DROP;
+ goto trace1;
+ }
+
+ last_key.as_u64 = key1.as_u64;
+ tunnel_index1 = last_tunnel_index = p1[0];
+ }
+ else
+ tunnel_index1 = last_tunnel_index;
+
+ t1 = pool_elt_at_index (vxm->tunnels, tunnel_index1);
+
+ next1 = t1->decap_next_index;
+ sw_if_index1 = t1->sw_if_index;
+ len1 = vlib_buffer_length_in_chain (vm, b1);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b1);
+
+ /* Set input sw_if_index to VXLAN tunnel for learning */
+ vnet_buffer(b1)->sw_if_index[VLIB_RX] = sw_if_index1;
+
+ pkts_decapsulated ++;
+ stats_n_packets += 1;
+ stats_n_bytes += len1;
+
+ /* Batch stats increment on the same vxlan tunnel so counter
+ is not incremented per packet */
+ if (PREDICT_FALSE (sw_if_index1 != stats_sw_if_index))
+ {
+ stats_n_packets -= 1;
+ stats_n_bytes -= len1;
+ if (stats_n_packets)
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
+ cpu_index, stats_sw_if_index,
+ stats_n_packets, stats_n_bytes);
+ stats_n_packets = 1;
+ stats_n_bytes = len1;
+ stats_sw_if_index = sw_if_index1;
+ }
+
+ trace1:
+ b1->error = error1 ? node->errors[error1] : 0;
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vxlan_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b1, sizeof (*tr));
+ tr->next_index = next1;
+ tr->error = error1;
+ tr->tunnel_index = tunnel_index1;
+ tr->vni = vnet_get_vni (vxlan1);
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 next0;
+ ip4_header_t * ip0;
+ vxlan_header_t * vxlan0;
+ uword * p0;
+ u32 tunnel_index0;
+ vxlan_tunnel_t * t0;
+ vxlan_tunnel_key_t key0;
+ u32 error0;
+ u32 sw_if_index0, len0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ /* udp leaves current_data pointing at the vxlan header */
+ vxlan0 = vlib_buffer_get_current (b0);
+
+ vlib_buffer_advance
+ (b0, -(word)(sizeof(udp_header_t)+sizeof(ip4_header_t)));
+ ip0 = vlib_buffer_get_current (b0);
+
+ /* pop (ip, udp, vxlan) */
+ vlib_buffer_advance
+ (b0, sizeof(*ip0)+sizeof(udp_header_t)+sizeof(*vxlan0));
+
+ tunnel_index0 = ~0;
+ error0 = 0;
+
+ key0.src = ip0->src_address.as_u32;
+ key0.vni = vxlan0->vni_reserved;
+
+ if (PREDICT_FALSE (key0.as_u64 != last_key.as_u64))
+ {
+ p0 = hash_get (vxm->vxlan_tunnel_by_key, key0.as_u64);
+
+ if (p0 == 0)
+ {
+ error0 = VXLAN_ERROR_NO_SUCH_TUNNEL;
+ next0 = VXLAN_INPUT_NEXT_DROP;
+ goto trace00;
+ }
+
+ last_key.as_u64 = key0.as_u64;
+ tunnel_index0 = last_tunnel_index = p0[0];
+ }
+ else
+ tunnel_index0 = last_tunnel_index;
+
+ t0 = pool_elt_at_index (vxm->tunnels, tunnel_index0);
+
+ next0 = t0->decap_next_index;
+ sw_if_index0 = t0->sw_if_index;
+ len0 = vlib_buffer_length_in_chain (vm, b0);
+
+ /* Required to make the l2 tag push / pop code work on l2 subifs */
+ vnet_update_l2_len (b0);
+
+ /* Set input sw_if_index to VXLAN tunnel for learning */
+ vnet_buffer(b0)->sw_if_index[VLIB_RX] = sw_if_index0;
+
+ pkts_decapsulated ++;
+ stats_n_packets += 1;
+ stats_n_bytes += len0;
+
+ /* Batch stats increment on the same vxlan tunnel so counter
+ is not incremented per packet */
+ if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index))
+ {
+ stats_n_packets -= 1;
+ stats_n_bytes -= len0;
+ if (stats_n_packets)
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
+ cpu_index, stats_sw_if_index,
+ stats_n_packets, stats_n_bytes);
+ stats_n_packets = 1;
+ stats_n_bytes = len0;
+ stats_sw_if_index = sw_if_index0;
+ }
+
+ trace00:
+ b0->error = error0 ? node->errors[error0] : 0;
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vxlan_rx_trace_t *tr
+ = vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->next_index = next0;
+ tr->error = error0;
+ tr->tunnel_index = tunnel_index0;
+ tr->vni = vnet_get_vni (vxlan0);
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+  /* Do we still need this now that tunnel rx stats are kept? */
+ vlib_node_increment_counter (vm, vxlan_input_node.index,
+ VXLAN_ERROR_DECAPSULATED,
+ pkts_decapsulated);
+
+ /* Increment any remaining batch stats */
+ if (stats_n_packets)
+ {
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
+ cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+ node->runtime_data[0] = stats_sw_if_index;
+ }
+
+ return from_frame->n_vectors;
+}
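
The loop above batches interface counters: packets and bytes accumulate while
consecutive packets hit the same sw_if_index, and the combined counter is
touched only when the interface changes (plus one final flush after the loop).
A standalone sketch of the same idiom; pkt[], n_pkts and flush_counter() are
hypothetical stand-ins for the buffer walk and
vlib_increment_combined_counter():

    u32 batch_sw_if_index = ~0;
    u32 batch_packets = 0, batch_bytes = 0;
    int i;

    for (i = 0; i < n_pkts; i++)
      {
        if (pkt[i].sw_if_index != batch_sw_if_index)
          {
            if (batch_packets)      /* flush the previous run */
              flush_counter (batch_sw_if_index, batch_packets, batch_bytes);
            batch_sw_if_index = pkt[i].sw_if_index;
            batch_packets = 0;
            batch_bytes = 0;
          }
        batch_packets += 1;
        batch_bytes += pkt[i].len;
      }
    if (batch_packets)              /* flush the final run */
      flush_counter (batch_sw_if_index, batch_packets, batch_bytes);
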
+
+static char * vxlan_error_strings[] = {
+#define vxlan_error(n,s) s,
+#include <vnet/vxlan/vxlan_error.def>
+#undef vxlan_error
+};
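
vxlan_error_strings is built with the X-macro pattern: each vxlan_error(n,s)
line in vxlan_error.def contributes its string here, while the matching enum
in vxlan.h turns the same lines into VXLAN_ERROR_* symbols. Given the two
entries in vxlan_error.def, the array expands to roughly:

    static char * vxlan_error_strings[] = {
      "good packets decapsulated",
      "no such tunnel packets",
    };
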
+
+VLIB_REGISTER_NODE (vxlan_input_node) = {
+ .function = vxlan_input,
+ .name = "vxlan-input",
+ /* Takes a vector of packets. */
+ .vector_size = sizeof (u32),
+
+ .n_errors = VXLAN_N_ERROR,
+ .error_strings = vxlan_error_strings,
+
+ .n_next_nodes = VXLAN_INPUT_N_NEXT,
+ .next_nodes = {
+#define _(s,n) [VXLAN_INPUT_NEXT_##s] = n,
+ foreach_vxlan_input_next
+#undef _
+ },
+
+//temp .format_buffer = format_vxlan_header,
+ .format_trace = format_vxlan_rx_trace,
+ // $$$$ .unformat_buffer = unformat_vxlan_header,
+};
diff --git a/vnet/vnet/vxlan/encap.c b/vnet/vnet/vxlan/encap.c
new file mode 100644
index 00000000000..4b475f83886
--- /dev/null
+++ b/vnet/vnet/vxlan/encap.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/vxlan/vxlan.h>
+
+/* Statistics (not all errors) */
+#define foreach_vxlan_encap_error \
+_(ENCAPSULATED, "good packets encapsulated") \
+_(DEL_TUNNEL, "deleted tunnel packets")
+
+static char * vxlan_encap_error_strings[] = {
+#define _(sym,string) string,
+ foreach_vxlan_encap_error
+#undef _
+};
+
+typedef enum {
+#define _(sym,str) VXLAN_ENCAP_ERROR_##sym,
+ foreach_vxlan_encap_error
+#undef _
+ VXLAN_ENCAP_N_ERROR,
+} vxlan_encap_error_t;
+
+typedef enum {
+ VXLAN_ENCAP_NEXT_IP4_LOOKUP,
+ VXLAN_ENCAP_NEXT_DROP,
+ VXLAN_ENCAP_N_NEXT,
+} vxlan_encap_next_t;
+
+typedef struct {
+ u32 tunnel_index;
+ u32 vni;
+} vxlan_encap_trace_t;
+
+u8 * format_vxlan_encap_trace (u8 * s, va_list * args)
+{
+ CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+ CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+ vxlan_encap_trace_t * t
+ = va_arg (*args, vxlan_encap_trace_t *);
+
+ s = format (s, "VXLAN-ENCAP: tunnel %d vni %d", t->tunnel_index, t->vni);
+ return s;
+}
+
+#define foreach_fixed_header_offset \
+ _(0) _(1) _(2) _(3)
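
foreach_fixed_header_offset enumerates the four u64 words (offsets 0..3) that
cover the first 32 of the 36 rewrite octets; each use below redefines _() and
expands to four assignments, e.g.:

    copy_dst0[0] = copy_src0[0];
    copy_dst0[1] = copy_src0[1];
    copy_dst0[2] = copy_src0[2];
    copy_dst0[3] = copy_src0[3];

The remaining 4 octets are handled by the trailing u32 copy.
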
+
+static uword
+vxlan_encap (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * from_frame)
+{
+ u32 n_left_from, next_index, * from, * to_next;
+ vxlan_main_t * vxm = &vxlan_main;
+ vnet_main_t * vnm = vxm->vnet_main;
+ vnet_interface_main_t * im = &vnm->interface_main;
+ u32 pkts_encapsulated = 0;
+ u16 old_l0 = 0, old_l1 = 0;
+ u32 cpu_index = os_get_cpu_number();
+ u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
+
+ from = vlib_frame_vector_args (from_frame);
+ n_left_from = from_frame->n_vectors;
+
+ next_index = node->cached_next_index;
+ stats_sw_if_index = node->runtime_data[0];
+ stats_n_packets = stats_n_bytes = 0;
+
+ while (n_left_from > 0)
+ {
+ u32 n_left_to_next;
+
+ vlib_get_next_frame (vm, node, next_index,
+ to_next, n_left_to_next);
+
+ while (n_left_from >= 4 && n_left_to_next >= 2)
+ {
+ u32 bi0, bi1;
+ vlib_buffer_t * b0, * b1;
+ u32 flow_hash0, flow_hash1;
+ u32 next0 = VXLAN_ENCAP_NEXT_IP4_LOOKUP;
+ u32 next1 = VXLAN_ENCAP_NEXT_IP4_LOOKUP;
+ u32 sw_if_index0, sw_if_index1, len0, len1;
+ vnet_hw_interface_t * hi0, * hi1;
+ ip4_header_t * ip0, * ip1;
+ udp_header_t * udp0, * udp1;
+ u64 * copy_src0, * copy_dst0;
+ u64 * copy_src1, * copy_dst1;
+ u32 * copy_src_last0, * copy_dst_last0;
+ u32 * copy_src_last1, * copy_dst_last1;
+ vxlan_tunnel_t * t0, * t1;
+ u16 new_l0, new_l1;
+ ip_csum_t sum0, sum1;
+
+ /* Prefetch next iteration. */
+ {
+ vlib_buffer_t * p2, * p3;
+
+ p2 = vlib_get_buffer (vm, from[2]);
+ p3 = vlib_get_buffer (vm, from[3]);
+
+ vlib_prefetch_buffer_header (p2, LOAD);
+ vlib_prefetch_buffer_header (p3, LOAD);
+
+ CLIB_PREFETCH (p2->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ CLIB_PREFETCH (p3->data, 2*CLIB_CACHE_LINE_BYTES, LOAD);
+ }
+
+ bi0 = from[0];
+ bi1 = from[1];
+ to_next[0] = bi0;
+ to_next[1] = bi1;
+ from += 2;
+ to_next += 2;
+ n_left_to_next -= 2;
+ n_left_from -= 2;
+
+ b0 = vlib_get_buffer (vm, bi0);
+ b1 = vlib_get_buffer (vm, bi1);
+
+ flow_hash0 = vnet_l2_compute_flow_hash (b0);
+ flow_hash1 = vnet_l2_compute_flow_hash (b1);
+
+ /* 1-wide cache? */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_TX];
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+ hi1 = vnet_get_sup_hw_interface (vnm, sw_if_index1);
+
+ t0 = &vxm->tunnels[hi0->dev_instance];
+ t1 = &vxm->tunnels[hi1->dev_instance];
+
+ /* Check rewrite string and drop packet if tunnel is deleted */
+ if (PREDICT_FALSE(t0->rewrite == vxlan_dummy_rewrite))
+ {
+ next0 = VXLAN_ENCAP_NEXT_DROP;
+ b0->error = node->errors[VXLAN_ENCAP_ERROR_DEL_TUNNEL];
+ pkts_encapsulated --;
+ } /* Still go through normal encap with dummy rewrite */
+ if (PREDICT_FALSE(t1->rewrite == vxlan_dummy_rewrite))
+ {
+ next1 = VXLAN_ENCAP_NEXT_DROP;
+ b1->error = node->errors[VXLAN_ENCAP_ERROR_DEL_TUNNEL];
+ pkts_encapsulated --;
+ } /* Still go through normal encap with dummy rewrite */
+
+	  /* IP4 VXLAN header sizeof(ip4_vxlan_header_t) should be 36 octets */
+ ASSERT(vec_len(t0->rewrite) == 36);
+ ASSERT(vec_len(t1->rewrite) == 36);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+ vlib_buffer_advance (b1, -(word)_vec_len(t1->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ ip1 = vlib_buffer_get_current(b1);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+ copy_dst1 = (u64 *) ip1;
+ copy_src1 = (u64 *) t1->rewrite;
+
+ /* Copy first 32 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+ foreach_fixed_header_offset;
+#undef _
+#define _(offs) copy_dst1[offs] = copy_src1[offs];
+ foreach_fixed_header_offset;
+#undef _
+
+ /* Last 4 octets. Hopefully gcc will be our friend */
+ copy_dst_last0 = (u32 *)(&copy_dst0[4]);
+ copy_src_last0 = (u32 *)(&copy_src0[4]);
+ copy_dst_last1 = (u32 *)(&copy_dst1[4]);
+ copy_src_last1 = (u32 *)(&copy_src1[4]);
+
+ copy_dst_last0[0] = copy_src_last0[0];
+ copy_dst_last1[0] = copy_src_last1[0];
+
+ /* fix the <bleep>ing outer-IP checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ sum1 = ip1->checksum;
+ /* old_l1 always 0, see the rewrite setup */
+ new_l1 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1));
+
+ sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t,
+ length /* changed member */);
+ ip1->checksum = ip_csum_fold (sum1);
+ ip1->length = new_l1;
+
+ /* Fix UDP length */
+ udp0 = (udp_header_t *)(ip0+1);
+ new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+ - sizeof (*ip0));
+ udp1 = (udp_header_t *)(ip1+1);
+ new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)
+ - sizeof (*ip1));
+
+ udp0->length = new_l0;
+ udp0->src_port = flow_hash0;
+
+ udp1->length = new_l1;
+ udp1->src_port = flow_hash1;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ vnet_buffer(b1)->sw_if_index[VLIB_TX] = t1->encap_fib_index;
+ pkts_encapsulated += 2;
+
+ len0 = vlib_buffer_length_in_chain (vm, b0);
+          len1 = vlib_buffer_length_in_chain (vm, b1);
+ stats_n_packets += 2;
+ stats_n_bytes += len0 + len1;
+
+ /* Batch stats increment on the same vxlan tunnel so counter is not
+ incremented per packet. Note stats are still incremented for deleted
+ and admin-down tunnel where packets are dropped. It is not worthwhile
+ to check for this rare case and affect normal path performance. */
+	  if (PREDICT_FALSE ((sw_if_index0 != stats_sw_if_index) ||
+			     (sw_if_index1 != stats_sw_if_index)))
+ {
+ stats_n_packets -= 2;
+ stats_n_bytes -= len0 + len1;
+ if (sw_if_index0 == sw_if_index1)
+ {
+ if (stats_n_packets)
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+ cpu_index, stats_sw_if_index,
+ stats_n_packets, stats_n_bytes);
+ stats_sw_if_index = sw_if_index0;
+ stats_n_packets = 2;
+ stats_n_bytes = len0 + len1;
+ }
+ else
+ {
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+ cpu_index, sw_if_index0, 1, len0);
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+ cpu_index, sw_if_index1, 1, len1);
+ }
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vxlan_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - vxm->tunnels;
+ tr->vni = t0->vni;
+ }
+
+ if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vxlan_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b1, sizeof (*tr));
+ tr->tunnel_index = t1 - vxm->tunnels;
+ tr->vni = t1->vni;
+ }
+
+ vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, bi1, next0, next1);
+ }
+
+ while (n_left_from > 0 && n_left_to_next > 0)
+ {
+ u32 bi0;
+ vlib_buffer_t * b0;
+ u32 flow_hash0;
+ u32 next0 = VXLAN_ENCAP_NEXT_IP4_LOOKUP;
+ u32 sw_if_index0, len0;
+ vnet_hw_interface_t * hi0;
+ ip4_header_t * ip0;
+ udp_header_t * udp0;
+ u64 * copy_src0, * copy_dst0;
+ u32 * copy_src_last0, * copy_dst_last0;
+ vxlan_tunnel_t * t0;
+ u16 new_l0;
+ ip_csum_t sum0;
+
+ bi0 = from[0];
+ to_next[0] = bi0;
+ from += 1;
+ to_next += 1;
+ n_left_from -= 1;
+ n_left_to_next -= 1;
+
+ b0 = vlib_get_buffer (vm, bi0);
+
+ flow_hash0 = vnet_l2_compute_flow_hash(b0);
+
+ /* 1-wide cache? */
+ sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_TX];
+ hi0 = vnet_get_sup_hw_interface (vnm, sw_if_index0);
+
+ t0 = &vxm->tunnels[hi0->dev_instance];
+
+ /* Check rewrite string and drop packet if tunnel is deleted */
+ if (PREDICT_FALSE(t0->rewrite == vxlan_dummy_rewrite))
+ {
+ next0 = VXLAN_ENCAP_NEXT_DROP;
+ b0->error = node->errors[VXLAN_ENCAP_ERROR_DEL_TUNNEL];
+ pkts_encapsulated --;
+ } /* Still go through normal encap with dummy rewrite */
+
+	  /* IP4 VXLAN header sizeof(ip4_vxlan_header_t) should be 36 octets */
+ ASSERT(vec_len(t0->rewrite) == 36);
+
+ /* Apply the rewrite string. $$$$ vnet_rewrite? */
+ vlib_buffer_advance (b0, -(word)_vec_len(t0->rewrite));
+
+ ip0 = vlib_buffer_get_current(b0);
+ /* Copy the fixed header */
+ copy_dst0 = (u64 *) ip0;
+ copy_src0 = (u64 *) t0->rewrite;
+
+ /* Copy first 32 octets 8-bytes at a time */
+#define _(offs) copy_dst0[offs] = copy_src0[offs];
+ foreach_fixed_header_offset;
+#undef _
+ /* Last 4 octets. Hopefully gcc will be our friend */
+ copy_dst_last0 = (u32 *)(&copy_dst0[4]);
+ copy_src_last0 = (u32 *)(&copy_src0[4]);
+
+ copy_dst_last0[0] = copy_src_last0[0];
+
+ /* fix the <bleep>ing outer-IP checksum */
+ sum0 = ip0->checksum;
+ /* old_l0 always 0, see the rewrite setup */
+ new_l0 =
+ clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0));
+
+ sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t,
+ length /* changed member */);
+ ip0->checksum = ip_csum_fold (sum0);
+ ip0->length = new_l0;
+
+ /* Fix UDP length */
+ udp0 = (udp_header_t *)(ip0+1);
+ new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)
+ - sizeof (*ip0));
+
+ udp0->length = new_l0;
+ udp0->src_port = flow_hash0;
+
+ /* Reset to look up tunnel partner in the configured FIB */
+ vnet_buffer(b0)->sw_if_index[VLIB_TX] = t0->encap_fib_index;
+ pkts_encapsulated ++;
+
+ len0 = vlib_buffer_length_in_chain (vm, b0);
+ stats_n_packets += 1;
+ stats_n_bytes += len0;
+
+ /* Batch stats increment on the same vxlan tunnel so counter is not
+ incremented per packet. Note stats are still incremented for deleted
+ and admin-down tunnel where packets are dropped. It is not worthwhile
+ to check for this rare case and affect normal path performance. */
+ if (PREDICT_FALSE (sw_if_index0 != stats_sw_if_index))
+ {
+ stats_n_packets -= 1;
+ stats_n_bytes -= len0;
+ if (stats_n_packets)
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+ cpu_index, stats_sw_if_index,
+ stats_n_packets, stats_n_bytes);
+ stats_n_packets = 1;
+ stats_n_bytes = len0;
+ stats_sw_if_index = sw_if_index0;
+ }
+
+ if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+ {
+ vxlan_encap_trace_t *tr =
+ vlib_add_trace (vm, node, b0, sizeof (*tr));
+ tr->tunnel_index = t0 - vxm->tunnels;
+ tr->vni = t0->vni;
+ }
+ vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+ to_next, n_left_to_next,
+ bi0, next0);
+ }
+
+ vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+ }
+
+  /* Do we still need this now that tunnel tx stats are kept? */
+ vlib_node_increment_counter (vm, node->node_index,
+ VXLAN_ENCAP_ERROR_ENCAPSULATED,
+ pkts_encapsulated);
+
+ /* Increment any remaining batch stats */
+ if (stats_n_packets)
+ {
+ vlib_increment_combined_counter
+ (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
+ cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+ node->runtime_data[0] = stats_sw_if_index;
+ }
+
+ return from_frame->n_vectors;
+}
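
The outer IPv4 checksum is never recomputed from scratch here. The rewrite is
built with a length field of 0 and a checksum to match, so ip_csum_update()
only folds in the difference between the old length (always 0) and the real
one, per the one's-complement incremental update rule of RFC 1624:

    HC' = ~(~HC + ~m + m')

where HC is the old checksum, m the old 16-bit field and m' its new value.
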
+
+VLIB_REGISTER_NODE (vxlan_encap_node) = {
+ .function = vxlan_encap,
+ .name = "vxlan-encap",
+ .vector_size = sizeof (u32),
+ .format_trace = format_vxlan_encap_trace,
+ .type = VLIB_NODE_TYPE_INTERNAL,
+
+ .n_errors = ARRAY_LEN(vxlan_encap_error_strings),
+ .error_strings = vxlan_encap_error_strings,
+
+ .n_next_nodes = VXLAN_ENCAP_N_NEXT,
+
+ .next_nodes = {
+ [VXLAN_ENCAP_NEXT_IP4_LOOKUP] = "ip4-lookup",
+ [VXLAN_ENCAP_NEXT_DROP] = "error-drop",
+ },
+};
diff --git a/vnet/vnet/vxlan/vxlan.c b/vnet/vnet/vxlan/vxlan.c
new file mode 100644
index 00000000000..75cbc6c6499
--- /dev/null
+++ b/vnet/vnet/vxlan/vxlan.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vnet/vxlan/vxlan.h>
+
+vxlan_main_t vxlan_main;
+
+static u8 * format_decap_next (u8 * s, va_list * args)
+{
+ u32 next_index = va_arg (*args, u32);
+
+ switch (next_index)
+ {
+ case VXLAN_INPUT_NEXT_DROP:
+ return format (s, "drop");
+ case VXLAN_INPUT_NEXT_L2_INPUT:
+ return format (s, "l2");
+ case VXLAN_INPUT_NEXT_IP4_INPUT:
+ return format (s, "ip4");
+ case VXLAN_INPUT_NEXT_IP6_INPUT:
+ return format (s, "ip6");
+ default:
+ return format (s, "unknown %d", next_index);
+ }
+ return s;
+}
+
+u8 * format_vxlan_tunnel (u8 * s, va_list * args)
+{
+ vxlan_tunnel_t * t = va_arg (*args, vxlan_tunnel_t *);
+ vxlan_main_t * ngm = &vxlan_main;
+
+ s = format (s,
+ "[%d] %U (src) %U (dst) vni %d encap_fib_index %d",
+ t - ngm->tunnels,
+ format_ip4_address, &t->src,
+ format_ip4_address, &t->dst,
+ t->vni,
+ t->encap_fib_index);
+ s = format (s, " decap_next %U\n", format_decap_next, t->decap_next_index);
+ return s;
+}
+
+static u8 * format_vxlan_name (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ return format (s, "vxlan_tunnel%d", dev_instance);
+}
+
+static uword dummy_interface_tx (vlib_main_t * vm,
+ vlib_node_runtime_t * node,
+ vlib_frame_t * frame)
+{
+ clib_warning ("you shouldn't be here, leaking buffers...");
+ return frame->n_vectors;
+}
+
+VNET_DEVICE_CLASS (vxlan_device_class,static) = {
+ .name = "VXLAN",
+ .format_device_name = format_vxlan_name,
+ .format_tx_trace = format_vxlan_encap_trace,
+ .tx_function = dummy_interface_tx,
+};
+
+static uword dummy_set_rewrite (vnet_main_t * vnm,
+ u32 sw_if_index,
+ u32 l3_type,
+ void * dst_address,
+ void * rewrite,
+ uword max_rewrite_bytes)
+{
+ return 0;
+}
+
+static u8 * format_vxlan_header_with_length (u8 * s, va_list * args)
+{
+ u32 dev_instance = va_arg (*args, u32);
+ s = format (s, "unimplemented dev %u", dev_instance);
+ return s;
+}
+
+VNET_HW_INTERFACE_CLASS (vxlan_hw_class) = {
+ .name = "VXLAN",
+ .format_header = format_vxlan_header_with_length,
+ .set_rewrite = dummy_set_rewrite,
+};
+
+#define foreach_copy_field \
+_(src.as_u32) \
+_(dst.as_u32) \
+_(vni) \
+_(encap_fib_index) \
+_(decap_next_index)
+
+static int vxlan_rewrite (vxlan_tunnel_t * t)
+{
+ u8 *rw = 0;
+ ip4_header_t * ip0;
+ ip4_vxlan_header_t * h0;
+ int len = sizeof (*h0);
+
+ vec_validate_aligned (rw, len-1, CLIB_CACHE_LINE_BYTES);
+
+ h0 = (ip4_vxlan_header_t *) rw;
+
+ /* Fixed portion of the (outer) ip4 header */
+ ip0 = &h0->ip4;
+ ip0->ip_version_and_header_length = 0x45;
+ ip0->ttl = 254;
+ ip0->protocol = IP_PROTOCOL_UDP;
+
+ /* we fix up the ip4 header length and checksum after-the-fact */
+ ip0->src_address.as_u32 = t->src.as_u32;
+ ip0->dst_address.as_u32 = t->dst.as_u32;
+ ip0->checksum = ip4_header_checksum (ip0);
+
+  /* UDP header; the src port is overwritten with the flow hash at encap time */
+ h0->udp.src_port = clib_host_to_net_u16 (4789);
+ h0->udp.dst_port = clib_host_to_net_u16 (UDP_DST_PORT_vxlan);
+
+ /* VXLAN header */
+ vnet_set_vni_and_flags(&h0->vxlan, t->vni);
+
+ t->rewrite = rw;
+ return (0);
+}
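
The finished rewrite is exactly sizeof(ip4_vxlan_header_t) = 20 (IPv4) +
8 (UDP) + 8 (VXLAN) = 36 octets, which is what the ASSERTs in the encap path
check at runtime. A sketch of a compile-time guard one could add, using
standard C11:

    _Static_assert (sizeof (ip4_vxlan_header_t) == 36,
                    "VXLAN rewrite header must be 36 octets");
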
+
+int vnet_vxlan_add_del_tunnel
+(vnet_vxlan_add_del_tunnel_args_t *a, u32 * sw_if_indexp)
+{
+ vxlan_main_t * vxm = &vxlan_main;
+ vxlan_tunnel_t *t = 0;
+ vnet_main_t * vnm = vxm->vnet_main;
+ vnet_hw_interface_t * hi;
+ uword * p;
+ u32 hw_if_index = ~0;
+ u32 sw_if_index = ~0;
+ int rv;
+ vxlan_tunnel_key_t key;
+
+ key.src = a->dst.as_u32; /* decap src in key is encap dst in config */
+ key.vni = clib_host_to_net_u32 (a->vni << 8);
+
+ p = hash_get (vxm->vxlan_tunnel_by_key, key.as_u64);
+
+ if (a->is_add)
+ {
+ /* adding a tunnel: tunnel must not already exist */
+ if (p)
+ return VNET_API_ERROR_TUNNEL_EXIST;
+
+ if (a->decap_next_index == ~0)
+ a->decap_next_index = VXLAN_INPUT_NEXT_L2_INPUT;
+
+ if (a->decap_next_index >= VXLAN_INPUT_N_NEXT)
+ return VNET_API_ERROR_INVALID_DECAP_NEXT;
+
+ pool_get_aligned (vxm->tunnels, t, CLIB_CACHE_LINE_BYTES);
+ memset (t, 0, sizeof (*t));
+
+ /* copy from arg structure */
+#define _(x) t->x = a->x;
+ foreach_copy_field;
+#undef _
+
+ rv = vxlan_rewrite (t);
+
+ if (rv)
+ {
+ pool_put (vxm->tunnels, t);
+ return rv;
+ }
+
+ hash_set (vxm->vxlan_tunnel_by_key, key.as_u64, t - vxm->tunnels);
+
+ if (vec_len (vxm->free_vxlan_tunnel_hw_if_indices) > 0)
+ {
+ vnet_interface_main_t * im = &vnm->interface_main;
+ hw_if_index = vxm->free_vxlan_tunnel_hw_if_indices
+ [vec_len (vxm->free_vxlan_tunnel_hw_if_indices)-1];
+ _vec_len (vxm->free_vxlan_tunnel_hw_if_indices) -= 1;
+
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->dev_instance = t - vxm->tunnels;
+ hi->hw_instance = hi->dev_instance;
+
+ /* clear old stats of freed tunnel before reuse */
+ sw_if_index = hi->sw_if_index;
+ vnet_interface_counter_lock(im);
+ vlib_zero_combined_counter
+ (&im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_TX], sw_if_index);
+ vlib_zero_combined_counter
+ (&im->combined_sw_if_counters[VNET_INTERFACE_COUNTER_RX], sw_if_index);
+ vlib_zero_simple_counter
+ (&im->sw_if_counters[VNET_INTERFACE_COUNTER_DROP], sw_if_index);
+ vnet_interface_counter_unlock(im);
+ }
+ else
+ {
+ hw_if_index = vnet_register_interface
+ (vnm, vxlan_device_class.index, t - vxm->tunnels,
+ vxlan_hw_class.index, t - vxm->tunnels);
+ hi = vnet_get_hw_interface (vnm, hw_if_index);
+ hi->output_node_index = vxlan_encap_node.index;
+ }
+
+ t->hw_if_index = hw_if_index;
+ t->sw_if_index = sw_if_index = hi->sw_if_index;
+
+ if (a->decap_next_index == VXLAN_INPUT_NEXT_L2_INPUT)
+ {
+ l2input_main_t * l2im = &l2input_main;
+ /* setup l2 input config with l2 feature and bd 0 to drop packet */
+ vec_validate (l2im->configs, sw_if_index);
+ l2im->configs[sw_if_index].feature_bitmap = L2INPUT_FEAT_DROP;
+ l2im->configs[sw_if_index].bd_index = 0;
+ }
+ vnet_sw_interface_set_flags (vnm, sw_if_index,
+ VNET_SW_INTERFACE_FLAG_ADMIN_UP);
+ }
+ else
+ {
+ /* deleting a tunnel: tunnel must exist */
+ if (!p)
+ return VNET_API_ERROR_NO_SUCH_ENTRY;
+
+ t = pool_elt_at_index (vxm->tunnels, p[0]);
+
+ vnet_sw_interface_set_flags (vnm, t->sw_if_index, 0 /* down */);
+ /* make sure tunnel is removed from l2 bd or xconnect */
+ set_int_l2_mode(vxm->vlib_main, vnm, MODE_L3, t->sw_if_index, 0, 0, 0, 0);
+ vec_add1 (vxm->free_vxlan_tunnel_hw_if_indices, t->hw_if_index);
+
+ hash_unset (vxm->vxlan_tunnel_by_key, key.as_u64);
+
+ vec_free (t->rewrite);
+ t->rewrite = vxlan_dummy_rewrite;
+ pool_put (vxm->tunnels, t);
+ }
+
+ if (sw_if_indexp)
+ *sw_if_indexp = sw_if_index;
+
+ return 0;
+}
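
vnet_vxlan_add_del_tunnel() is also the programmatic entry point; the CLI
handler below is a thin wrapper around it. A minimal sketch of creating a
tunnel from code (addresses and vni are illustrative):

    vnet_vxlan_add_del_tunnel_args_t a;
    u32 sw_if_index;
    int rv;

    memset (&a, 0, sizeof (a));
    a.is_add = 1;
    a.src.as_u32 = clib_host_to_net_u32 (0x0a000001); /* 10.0.0.1 */
    a.dst.as_u32 = clib_host_to_net_u32 (0x0a000002); /* 10.0.0.2 */
    a.vni = 13;
    a.encap_fib_index = 0;
    a.decap_next_index = ~0;    /* defaults to l2-input */
    rv = vnet_vxlan_add_del_tunnel (&a, &sw_if_index);
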
+
+static u32 fib_index_from_fib_id (u32 fib_id)
+{
+ ip4_main_t * im = &ip4_main;
+ uword * p;
+
+ p = hash_get (im->fib_index_by_table_id, fib_id);
+ if (!p)
+ return ~0;
+
+ return p[0];
+}
+
+static uword unformat_decap_next (unformat_input_t * input, va_list * args)
+{
+ u32 * result = va_arg (*args, u32 *);
+ u32 tmp;
+
+ if (unformat (input, "l2"))
+ *result = VXLAN_INPUT_NEXT_L2_INPUT;
+ else if (unformat (input, "drop"))
+ *result = VXLAN_INPUT_NEXT_DROP;
+ else if (unformat (input, "ip4"))
+ *result = VXLAN_INPUT_NEXT_IP4_INPUT;
+ else if (unformat (input, "ip6"))
+ *result = VXLAN_INPUT_NEXT_IP6_INPUT;
+ else if (unformat (input, "%d", &tmp))
+ *result = tmp;
+ else
+ return 0;
+ return 1;
+}
+
+static clib_error_t *
+vxlan_add_del_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ unformat_input_t _line_input, * line_input = &_line_input;
+ ip4_address_t src, dst;
+ u8 is_add = 1;
+ u8 src_set = 0;
+ u8 dst_set = 0;
+ u32 encap_fib_index = 0;
+ u32 decap_next_index = ~0;
+ u32 vni = 0;
+ u32 tmp;
+ int rv;
+ vnet_vxlan_add_del_tunnel_args_t _a, * a = &_a;
+
+ /* Get a line of input. */
+ if (! unformat_user (input, unformat_line_input, line_input))
+ return 0;
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) {
+ if (unformat (line_input, "del"))
+ is_add = 0;
+ else if (unformat (line_input, "src %U",
+ unformat_ip4_address, &src))
+ src_set = 1;
+ else if (unformat (line_input, "dst %U",
+ unformat_ip4_address, &dst))
+ dst_set = 1;
+ else if (unformat (line_input, "encap-vrf-id %d", &tmp))
+ {
+ encap_fib_index = fib_index_from_fib_id (tmp);
+ if (encap_fib_index == ~0)
+ return clib_error_return (0, "nonexistent encap-vrf-id %d", tmp);
+ }
+ else if (unformat (line_input, "decap-next %U", unformat_decap_next,
+ &decap_next_index))
+ ;
+ else if (unformat (line_input, "vni %d", &vni))
+ {
+ if (vni >> 24)
+ return clib_error_return (0, "vni %d out of range", vni);
+ }
+ else
+ return clib_error_return (0, "parse error: '%U'",
+ format_unformat_error, line_input);
+ }
+
+ unformat_free (line_input);
+
+ if (src_set == 0)
+ return clib_error_return (0, "tunnel src address not specified");
+
+ if (dst_set == 0)
+ return clib_error_return (0, "tunnel dst address not specified");
+
+ if (vni == 0)
+ return clib_error_return (0, "vni not specified");
+
+ memset (a, 0, sizeof (*a));
+
+ a->is_add = is_add;
+
+#define _(x) a->x = x;
+ foreach_copy_field;
+#undef _
+
+ rv = vnet_vxlan_add_del_tunnel (a, 0 /* hw_if_indexp */);
+
+ switch(rv)
+ {
+ case 0:
+ break;
+ case VNET_API_ERROR_INVALID_DECAP_NEXT:
+ return clib_error_return (0, "invalid decap-next...");
+
+ case VNET_API_ERROR_TUNNEL_EXIST:
+ return clib_error_return (0, "tunnel already exists...");
+
+ case VNET_API_ERROR_NO_SUCH_ENTRY:
+ return clib_error_return (0, "tunnel does not exist...");
+
+ default:
+ return clib_error_return
+ (0, "vnet_vxlan_add_del_tunnel returned %d", rv);
+ }
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (create_vxlan_tunnel_command, static) = {
+ .path = "create vxlan tunnel",
+ .short_help =
+ "create vxlan tunnel src <local-vtep-addr> dst <remote-vtep-addr> vni <nn>"
+ " [encap-vrf-id <nn>] [decap-next [l2|ip4|ip6] [del]\n",
+ .function = vxlan_add_del_tunnel_command_fn,
+};
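
For example, to create a tunnel to a remote VTEP and later remove it
(addresses and vni are illustrative):

    create vxlan tunnel src 10.0.0.1 dst 10.0.0.2 vni 13
    create vxlan tunnel src 10.0.0.1 dst 10.0.0.2 vni 13 del
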
+
+static clib_error_t *
+show_vxlan_tunnel_command_fn (vlib_main_t * vm,
+ unformat_input_t * input,
+ vlib_cli_command_t * cmd)
+{
+ vxlan_main_t * vxm = &vxlan_main;
+ vxlan_tunnel_t * t;
+
+ if (pool_elts (vxm->tunnels) == 0)
+ vlib_cli_output (vm, "No vxlan tunnels configured...");
+
+ pool_foreach (t, vxm->tunnels,
+ ({
+ vlib_cli_output (vm, "%U", format_vxlan_tunnel, t);
+ }));
+
+ return 0;
+}
+
+VLIB_CLI_COMMAND (show_vxlan_tunnel_command, static) = {
+ .path = "show vxlan tunnel",
+ .function = show_vxlan_tunnel_command_fn,
+};
+
+clib_error_t *vxlan_init (vlib_main_t *vm)
+{
+ vxlan_main_t * vxm = &vxlan_main;
+ ip4_vxlan_header_t * hdr;
+ ip4_header_t * ip;
+
+ vxm->vnet_main = vnet_get_main();
+ vxm->vlib_main = vm;
+
+ /* init dummy rewrite string for deleted vxlan tunnels */
+ _vec_len(vxlan_dummy_rewrite) = sizeof(ip4_vxlan_header_t);
+ hdr = (ip4_vxlan_header_t *) vxlan_dummy_rewrite;
+ ip = &hdr->ip4;
+  /* minimal rewrite setup, see vxlan_rewrite() above as reference */
+ ip->ip_version_and_header_length = 0x45;
+ ip->checksum = ip4_header_checksum (ip);
+
+ udp_register_dst_port (vm, UDP_DST_PORT_vxlan,
+ vxlan_input_node.index, 1 /* is_ip4 */);
+ return 0;
+}
+
+VLIB_INIT_FUNCTION(vxlan_init);
+
diff --git a/vnet/vnet/vxlan/vxlan.h b/vnet/vnet/vxlan/vxlan.h
new file mode 100644
index 00000000000..5c82a3dfd9d
--- /dev/null
+++ b/vnet/vnet/vxlan/vxlan.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vnet_vxlan_h
+#define included_vnet_vxlan_h
+
+#include <vppinfra/error.h>
+#include <vppinfra/hash.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/l2/l2_input.h>
+#include <vnet/l2/l2_bd.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/vxlan/vxlan_packet.h>
+#include <vnet/ip/ip4_packet.h>
+#include <vnet/ip/udp.h>
+
+typedef CLIB_PACKED (struct {
+ ip4_header_t ip4; /* 20 bytes */
+ udp_header_t udp; /* 8 bytes */
+ vxlan_header_t vxlan; /* 8 bytes */
+}) ip4_vxlan_header_t;
+
+typedef CLIB_PACKED(struct {
+ /*
+ * Key fields: ip src and vxlan vni on incoming VXLAN packet
+ * all fields in NET byte order
+ */
+ union {
+ struct {
+ u32 src;
+ u32 vni; /* shifted left 8 bits */
+ };
+ u64 as_u64;
+ };
+}) vxlan_tunnel_key_t;
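
Overlaying src and vni in a single u64 lets both the decap loop and
vnet_vxlan_add_del_tunnel() resolve a tunnel with one hash_get() on as_u64.
A sketch of building a lookup key, where ip_hdr and vxlan_hdr stand in for
headers already parsed off the wire:

    vxlan_tunnel_key_t key;
    key.src = ip_hdr->src_address.as_u32;   /* already net byte order */
    key.vni = vxlan_hdr->vni_reserved;      /* net byte order, vni << 8 */
    uword * p = hash_get (vxlan_main.vxlan_tunnel_by_key, key.as_u64);
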
+
+typedef struct {
+ /* Rewrite string. $$$$ embed vnet_rewrite header */
+ u8 * rewrite;
+
+ /* decap next index */
+ u32 decap_next_index;
+
+ /* tunnel src and dst addresses */
+ ip4_address_t src;
+ ip4_address_t dst;
+
+ /* vxlan VNI in HOST byte order */
+ u32 vni;
+
+  /* L3 FIB index */
+  u16 encap_fib_index;		/* tunnel partner IP lookup here */
+
+  /* vnet intfc hw/sw_if_index */
+  u32 hw_if_index;
+ u32 sw_if_index;
+} vxlan_tunnel_t;
+
+#define foreach_vxlan_input_next \
+_(DROP, "error-drop") \
+_(L2_INPUT, "l2-input") \
+_(IP4_INPUT, "ip4-input") \
+_(IP6_INPUT, "ip6-input")
+
+typedef enum {
+#define _(s,n) VXLAN_INPUT_NEXT_##s,
+ foreach_vxlan_input_next
+#undef _
+ VXLAN_INPUT_N_NEXT,
+} vxlan_input_next_t;
+
+typedef enum {
+#define vxlan_error(n,s) VXLAN_ERROR_##n,
+#include <vnet/vxlan/vxlan_error.def>
+#undef vxlan_error
+ VXLAN_N_ERROR,
+} vxlan_input_error_t;
+
+typedef struct {
+ /* vector of encap tunnel instances */
+ vxlan_tunnel_t *tunnels;
+
+ /* lookup tunnel by key */
+ uword * vxlan_tunnel_by_key;
+
+ /* Free vlib hw_if_indices */
+ u32 * free_vxlan_tunnel_hw_if_indices;
+
+ /* Dummy rewrite for deleted vxlan_tunnels with hw_if_indices as above */
+ u64 dummy_str [sizeof(ip4_vxlan_header_t)/sizeof(u64) + 2];
+#define vxlan_dummy_rewrite ((u8 *) &vxlan_main.dummy_str[1])
+
+ /* convenience */
+ vlib_main_t * vlib_main;
+ vnet_main_t * vnet_main;
+} vxlan_main_t;
+
+vxlan_main_t vxlan_main;
+
+vlib_node_registration_t vxlan_input_node;
+vlib_node_registration_t vxlan_encap_node;
+
+u8 * format_vxlan_encap_trace (u8 * s, va_list * args);
+
+typedef struct {
+ u8 is_add;
+ ip4_address_t src, dst;
+ u32 encap_fib_index;
+ u32 decap_next_index;
+ u32 vni;
+} vnet_vxlan_add_del_tunnel_args_t;
+
+int vnet_vxlan_add_del_tunnel
+(vnet_vxlan_add_del_tunnel_args_t *a, u32 * sw_if_indexp);
+
+#endif /* included_vnet_vxlan_h */
diff --git a/vnet/vnet/vxlan/vxlan_error.def b/vnet/vnet/vxlan/vxlan_error.def
new file mode 100644
index 00000000000..3ead986cca8
--- /dev/null
+++ b/vnet/vnet/vxlan/vxlan_error.def
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+vxlan_error (DECAPSULATED, "good packets decapsulated")
+vxlan_error (NO_SUCH_TUNNEL, "no such tunnel packets")
diff --git a/vnet/vnet/vxlan/vxlan_packet.h b/vnet/vnet/vxlan/vxlan_packet.h
new file mode 100644
index 00000000000..8a9a3b80532
--- /dev/null
+++ b/vnet/vnet/vxlan/vxlan_packet.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __included_vxlan_packet_h__
+#define __included_vxlan_packet_h__ 1
+
+/*
+ * From RFC 7348
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|I|R|R|R| Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * VXLAN Header: This is an 8-byte field that has:
+ *
+ * - Flags (8 bits): where the I flag MUST be set to 1 for a valid
+ * VXLAN Network ID (VNI). The other 7 bits (designated "R") are
+ * reserved fields and MUST be set to zero on transmission and
+ * ignored on receipt.
+ *
+ * - VXLAN Segment ID/VXLAN Network Identifier (VNI): this is a
+ * 24-bit value used to designate the individual VXLAN overlay
+ * network on which the communicating VMs are situated. VMs in
+ * different VXLAN overlay networks cannot communicate with each
+ * other.
+ *
+ * - Reserved fields (24 bits and 8 bits): MUST be set to zero on
+ * transmission and ignored on receipt.
+ *
+ */
+
+typedef struct {
+ u32 flags;
+ u32 vni_reserved;
+} vxlan_header_t;
+
+#define VXLAN_FLAGS_VALID_HOST_BYTE_ORDER (1<<27)
+#define VXLAN_FLAGS_VALID_NET_BYTE_ORDER (clib_host_to_net_u32(1<<27))
+
+static inline u32 vnet_get_vni (vxlan_header_t * h)
+{
+ u32 vni_reserved_host_byte_order;
+
+ vni_reserved_host_byte_order = clib_net_to_host_u32 (h->vni_reserved);
+ return vni_reserved_host_byte_order >> 8;
+}
+
+static inline void vnet_set_vni_and_flags (vxlan_header_t * h, u32 vni)
+{
+ h->vni_reserved = clib_host_to_net_u32 (vni<<8);
+ h->flags = VXLAN_FLAGS_VALID_NET_BYTE_ORDER;
+}
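
Worked example: for vni = 0x123456, vnet_set_vni_and_flags() stores host value
0x12345600 in network byte order, so the VNI travels on the wire as the bytes
12 34 56 followed by a zero reserved byte, and vnet_get_vni() recovers it by
converting back to host order and shifting right by 8. Likewise the I flag,
host value 1<<27 = 0x08000000, lands in the first wire byte as 0x08, matching
the RFC 7348 header diagram above.
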
+
+#endif /* __included_vxlan_packet_h__ */