aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins/kubeproxy/kp.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/kubeproxy/kp.h')
-rw-r--r--src/plugins/kubeproxy/kp.h473
1 files changed, 473 insertions, 0 deletions
diff --git a/src/plugins/kubeproxy/kp.h b/src/plugins/kubeproxy/kp.h
new file mode 100644
index 00000000000..243c002833f
--- /dev/null
+++ b/src/plugins/kubeproxy/kp.h
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2017 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * kp-plugin implements a MagLev-like load balancer.
+ * http://research.google.com/pubs/pub44824.html
+ *
+ * It hasn't been tested for interoperability with the original MagLev
+ * but intends to provide similar functionality.
+ * The kube-proxy receives traffic destined to VIP (Virtual IP)
+ * addresses from one or multiple (ECMP) routers.
+ * The kube-proxy tunnels the traffic toward many application servers
+ * ensuring session stickiness (i.e. that a single session is tunneled
+ * towards a single application server).
+ *
+ */
+
+#ifndef KP_PLUGIN_KP_KP_H_
+#define KP_PLUGIN_KP_KP_H_
+
+#include <vnet/util/refcount.h>
+#include <vnet/vnet.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/fib/fib_table.h>
+#include <vppinfra/bihash_8_8.h>
+
+#include <kubeproxy/kphash.h>
+
+#define KP_DEFAULT_PER_CPU_STICKY_BUCKETS (1 << 10) /* parenthesized so the value survives use inside larger expressions; see kp_main_t.per_cpu_sticky_buckets */
+#define KP_DEFAULT_FLOW_TIMEOUT 40 /* kp_main_t.flow_timeout is documented as seconds */
+#define KP_MAPPING_BUCKETS 1024 /* NOTE(review): presumably the bucket count for kp_main_t.mapping_by_pod - confirm */
+#define KP_MAPPING_MEMORY_SIZE (64 << 20) /* 64 MiB; NOTE(review): presumably the memory size for kp_main_t.mapping_by_pod - confirm */
+
+typedef enum { /* next-node indices; NOTE(review): presumably for the kp4/kp6 nodes - confirm */
+ KP_NEXT_DROP, /* value 0, the only real next index */
+ KP_N_NEXT, /* number of next indices; must stay last */
+} kp_next_t;
+
+typedef enum { /* next-node indices; NOTE(review): presumably for kp_nat4_in2out_node - confirm */
+ KP_NAT4_IN2OUT_NEXT_DROP, /* value 0 */
+ KP_NAT4_IN2OUT_NEXT_LOOKUP, /* value 1 */
+ KP_NAT4_IN2OUT_N_NEXT, /* number of next indices; must stay last */
+} kp_nat4_in2out_next_t;
+
+#define foreach_kp_nat_in2out_error /* X-macro list: one _(symbol, description) per error */ \
+_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \
+_(IN2OUT_PACKETS, "Good in2out packets processed") \
+_(NO_TRANSLATION, "No translation")
+
+typedef enum { /* one KP_NAT_IN2OUT_ERROR_<symbol> per list entry, in list order starting at 0 */
+#define _(sym,str) KP_NAT_IN2OUT_ERROR_##sym,
+ foreach_kp_nat_in2out_error
+#undef _
+ KP_NAT_IN2OUT_N_ERROR, /* number of error codes; must stay last */
+} kp_nat_in2out_error_t;
+
+/**
+ * kube-proxy supports three types of service; KP_SVR_N_TYPES is their count.
+ */
+typedef enum {
+ KP_SVR_TYPE_VIP_PORT, /* NOTE(review): presumably a service reached via VIP:port - confirm */
+ KP_SVR_TYPE_NODEIP_PORT, /* NOTE(review): presumably a service reached via NodeIP:node_port - confirm */
+ KP_SVR_TYPE_EXT_LB, /* NOTE(review): presumably a service behind an external load balancer - confirm */
+ KP_SVR_N_TYPES, /* number of service types; must stay last */
+} kp_svr_type_t;
+
+typedef enum { /* next-node indices; NOTE(review): presumably for the kp4/kp6 nodeport nodes - confirm */
+ KP_NODEPORT_NEXT_IP4_NAT4, /* value 0 */
+ KP_NODEPORT_NEXT_IP4_NAT6, /* value 1 */
+ KP_NODEPORT_NEXT_IP6_NAT4, /* value 2 */
+ KP_NODEPORT_NEXT_IP6_NAT6, /* value 3 */
+ KP_NODEPORT_NEXT_DROP, /* value 4 */
+ KP_NODEPORT_N_NEXT, /* number of next indices; must stay last */
+} kp_nodeport_next_t;
+
+/**
+ * Each VIP is configured with a set of PODs; this is the state of one POD.
+ */
+typedef struct {
+ /**
+ * Registration to FIB event.
+ */
+ fib_node_t fib_node;
+
+ /**
+ * Destination address used to transfer traffic towards that POD.
+ * The address is also used as the POD ID and as the pseudo-random
+ * seed for the load-balancing process.
+ */
+ ip46_address_t address;
+
+ /**
+ * PODs are indexed by address and VIP index.
+ * This means there will be duplicates if the same server
+ * address is used for multiple VIPs.
+ */
+ u32 vip_index;
+
+ /**
+ * Some per-POD flags.
+ * For now only KP_POD_FLAGS_USED is defined.
+ */
+ u8 flags;
+
+#define KP_POD_FLAGS_USED 0x1 /* bit value for kp_pod_t.flags */
+
+ /**
+ * Rotating timestamp of when KP_POD_FLAGS_USED flag was last set.
+ *
+ * POD removal is based on garbage collection and reference counting.
+ * When a POD is removed, there is a race between the configuration core
+ * and worker cores, which may still add a reference while it should not
+ * be used. This timestamp is used to avoid removing the POD while such a
+ * race condition may happen.
+ */
+ u32 last_used;
+
+ /**
+ * The FIB entry index for the next-hop
+ */
+ fib_node_index_t next_hop_fib_entry_index;
+
+ /**
+ * The child index on the FIB entry
+ */
+ u32 next_hop_child_index;
+
+ /**
+ * The next DPO in the graph to follow.
+ */
+ dpo_id_t dpo;
+
+} kp_pod_t;
+
+format_function_t format_kp_pod; /* NOTE(review): presumably formats a kp_pod_t; argument list not visible here - confirm */
+
+typedef struct { /* element type of kp_vip_t.new_flow_table */
+ u32 pod_index; /* POD index selected for this table slot; NOTE(review): presumably an index into kp_main.pods - confirm */
+} kp_new_flow_entry_t;
+
+#define kp_foreach_vip_counter /* X-macro list: _(symbol, description, value) */ \
+ _(NEXT_PACKET, "packet from existing sessions", 0) \
+ _(FIRST_PACKET, "first session packet", 1) \
+ _(UNTRACKED_PACKET, "untracked packet", 2) \
+ _(NO_SERVER, "no server configured", 3)
+
+typedef enum { /* KP_VIP_COUNTER_<symbol> = value for each list entry */
+#define _(a,b,c) KP_VIP_COUNTER_##a = c,
+ kp_foreach_vip_counter
+#undef _
+ KP_N_VIP_COUNTERS /* last listed value + 1 (4); sizes kp_main_t.vip_counters */
+} kp_vip_counter_t;
+
+/**
+ * kube-proxy supports IPv4 and IPv6 traffic
+ * and NAT4 and NAT6.
+ */
+typedef enum {
+ KP_VIP_TYPE_IP4_NAT44, /* kp_vip_is_ip4: true, kp_vip_is_nat4: true */
+ KP_VIP_TYPE_IP4_NAT46, /* kp_vip_is_ip4: true, kp_vip_is_nat4: false */
+ KP_VIP_TYPE_IP6_NAT64, /* kp_vip_is_ip4: false, kp_vip_is_nat4: true */
+ KP_VIP_TYPE_IP6_NAT66, /* kp_vip_is_ip4: false, kp_vip_is_nat4: false */
+ KP_VIP_N_TYPES, /* number of VIP types; sizes kp_main_t.ip_lookup_next_index */
+} kp_vip_type_t;
+
+format_function_t format_kp_vip_type; /* NOTE(review): presumably formats a kp_vip_type_t - confirm */
+unformat_function_t unformat_kp_vip_type; /* NOTE(review): presumably parses a kp_vip_type_t - confirm */
+
+/**
+ * Load balancing service is provided per VIP.
+ * In this data model, a VIP can be a whole prefix.
+ * But load balancing only
+ * occurs on a per-source-address/port basis. This means that if a given source
+ * reuses the same port for multiple destinations within the same VIP,
+ * they will be considered as a single flow.
+ */
+typedef struct {
+
+ //Runtime
+
+ /**
+ * Vector mapping (flow-hash & new_flow_table_mask) to POD index.
+ * This is used for new flows.
+ */
+ kp_new_flow_entry_t *new_flow_table;
+
+ /**
+ * New flows table length - 1
+ * (length MUST be a power of 2)
+ */
+ u32 new_flow_table_mask;
+
+ /**
+ * Last time garbage collection was run to free the PODs.
+ */
+ u32 last_garbage_collection;
+
+ //Not runtime
+
+ /**
+ * A Virtual IP represents a given service delivered
+ * by a set of PODs. It can be a single
+ * address or a prefix.
+ * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address
+ * (i.e. ::/96 prefix).
+ */
+ ip46_address_t prefix;
+
+ /**
+ * The VIP prefix length.
+ * In case of IPv4, plen = 96 + ip4_plen.
+ */
+ u8 plen;
+
+ /**
+ * Service port. network byte order
+ */
+ u16 port;
+
+ /**
+ * Pod's port corresponding to specific service. network byte order
+ */
+ u16 target_port;
+
+ /**
+ * Node's port, can access service via NodeIP:node_port. network byte order
+ */
+ u16 node_port;
+
+
+ /**
+ * The type of traffic for this VIP (one of the kp_vip_type_t values).
+ * NOTE(review): kp_vip_type_t declares no "undefined" value.
+ */
+ kp_vip_type_t type;
+
+ /**
+ * Flags related to this VIP.
+ * KP_VIP_FLAGS_USED means the VIP is active.
+ * When it is not set, the VIP is in the process of being removed.
+ * We cannot immediately remove a VIP because the VIP index still may be stored
+ * in the adjacency index.
+ */
+ u8 flags;
+#define KP_VIP_FLAGS_USED 0x1 /* bit value for kp_vip_t.flags */
+
+ /**
+ * Pool of POD indexes used for this VIP.
+ * This also includes PODs that have been removed (but are still referenced).
+ */
+ u32 *pod_indexes;
+
+} kp_vip_t;
+
+/*
+ * mapping from nodeport to vip_index (the value side of that mapping)
+ */
+typedef struct {
+
+ u32 vip_index; /* NOTE(review): presumably an index into kp_main.vips - confirm */
+
+} kp_nodeport_t;
+
+#define kp_vip_is_ip4(vip) ((vip)->type == KP_VIP_TYPE_IP4_NAT44 \
+ || (vip)->type == KP_VIP_TYPE_IP4_NAT46) /* nonzero for the two IP4_* VIP types; vip may be evaluated twice */
+#define kp_vip_is_nat4(vip) ((vip)->type == KP_VIP_TYPE_IP6_NAT64 \
+ || (vip)->type == KP_VIP_TYPE_IP4_NAT44) /* nonzero for the NAT64 and NAT44 VIP types; vip may be evaluated twice */
+format_function_t format_kp_vip; /* NOTE(review): presumably formats a kp_vip_t - confirm */
+format_function_t format_kp_vip_detailed; /* NOTE(review): presumably a more verbose variant of format_kp_vip - confirm */
+
+#define foreach_kp_nat_protocol /* X-macro list: _(NAME, value, name, "string") */ \
+ _(UDP, 0, udp, "udp") \
+ _(TCP, 1, tcp, "tcp")
+
+typedef enum { /* KP_NAT_PROTOCOL_<NAME> = value for each list entry */
+#define _(N, i, n, s) KP_NAT_PROTOCOL_##N = i,
+ foreach_kp_nat_protocol
+#undef _
+} kp_nat_protocol_t;
+
+/* Map an IP protocol number to its KP_NAT_PROTOCOL_* value.
+ * Returns ~0 (all bits set) for anything other than UDP or TCP. */
+always_inline u32
+kp_ip_proto_to_nat_proto (u8 ip_proto)
+{
+ const u32 no_match = ~0;
+
+ return (ip_proto == IP_PROTOCOL_TCP) ? KP_NAT_PROTOCOL_TCP
+ : (ip_proto == IP_PROTOCOL_UDP) ? KP_NAT_PROTOCOL_UDP : no_match;
+}
+
+/* Key for Pod's egress SNAT */
+typedef struct {
+ union
+ {
+ struct
+ {
+ ip4_address_t addr;
+ u16 port; /* NOTE(review): byte order is not stated here - confirm against users */
+ u16 protocol:3, /* 3-bit field; NOTE(review): presumably a kp_nat_protocol_t value - confirm */
+ fib_index:13; /* 13-bit field, so only values 0..8191 are representable */
+ };
+ u64 as_u64; /* overlays the fields above as a single 64-bit value */
+ };
+} kp_snat4_key_t;
+
+typedef struct /* NOTE(review): presumably the IPv6 counterpart of kp_snat4_key_t - confirm */
+{
+ ip6_address_t prefix;
+ u8 plen; /* NOTE(review): presumably the length of prefix in bits - confirm */
+ u32 vrf_id;
+ u32 fib_index;
+} kp_snat6_key_t;
+
+typedef struct { /* element type of the kp_main_t.snat_mappings pool */
+ kp_svr_type_t svr_type; /* which of the kp_svr_type_t service types this mapping belongs to */
+ ip46_address_t vip;
+ ip46_address_t node_ip;
+ ip46_address_t pod_ip;
+ u8 vip_is_ipv6; /* NOTE(review): presumably nonzero when vip holds an IPv6 address - confirm */
+ u8 node_ip_is_ipv6; /* NOTE(review): presumably nonzero when node_ip holds an IPv6 address - confirm */
+ u8 pod_ip_is_ipv6; /* NOTE(review): presumably nonzero when pod_ip holds an IPv6 address - confirm */
+ u16 port; /* Network byte order */
+ u16 node_port; /* Network byte order */
+ u16 target_port; /* Network byte order */
+ u32 vrf_id;
+ u32 fib_index;
+} kp_snat_mapping_t;
+
+typedef struct { /* per-CPU state; kp_main_t.per_cpu points at these */
+ /**
+ * Each CPU has its own sticky flow hash table.
+ * One single table is used for all VIPs.
+ */
+ kp_hash_t *sticky_ht; /* NOTE(review): kp_hash_t presumably comes from kubeproxy/kphash.h - confirm */
+
+} kp_per_cpu_t;
+
+typedef struct { /* plugin-wide state; a single instance, kp_main, is declared extern in this header */
+ /**
+ * Pool of all Virtual IPs
+ */
+ kp_vip_t *vips;
+
+ /**
+ * Pool of PODs.
+ * PODs are referenced by address and vip index.
+ * The first element (index 0) is special and used only to fill
+ * new_flow_tables when no POD has been configured.
+ */
+ kp_pod_t *pods;
+
+ /**
+ * Each POD has an associated reference counter.
+ * As pods[0] has a special meaning, its associated counter
+ * starts at 0 and is decremented instead, i.e. do not use it.
+ */
+ vlib_refcount_t pod_refcount;
+
+ /* hash lookup vip_index by key: {u16: nodeport} */
+ uword * nodeport_by_key;
+
+
+ /**
+ * Some global data is per-cpu
+ */
+ kp_per_cpu_t *per_cpu;
+
+ /**
+ * Node next index for IP adjacencies, for each of the traffic types.
+ */
+ u32 ip_lookup_next_index[KP_VIP_N_TYPES];
+
+ /**
+ * Number of buckets in the per-cpu sticky hash table.
+ */
+ u32 per_cpu_sticky_buckets;
+
+ /**
+ * Flow timeout in seconds.
+ */
+ u32 flow_timeout;
+
+ /**
+ * Per-VIP counters, one entry per kp_vip_counter_t value.
+ */
+ vlib_simple_counter_main_t vip_counters[KP_N_VIP_COUNTERS];
+
+ /**
+ * DPO types used to send packets from IP4/6 lookup to the KP node.
+ */
+ dpo_type_t dpo_nat4_type;
+ dpo_type_t dpo_nat6_type;
+
+ /**
+ * Node type for registering to fib changes.
+ */
+ fib_node_type_t fib_node_type;
+
+ /* Find a static mapping by pod IP : target_port */
+ clib_bihash_8_8_t mapping_by_pod;
+
+ /* Static mapping pool */
+ kp_snat_mapping_t * snat_mappings;
+
+ /**
+ * API dynamically registered base ID.
+ */
+ u16 msg_id_base;
+
+ volatile u32 *writer_lock; /* NOTE(review): presumably a lock word serializing configuration writers - confirm */
+
+ /* convenience */
+ vlib_main_t *vlib_main;
+ vnet_main_t *vnet_main;
+} kp_main_t;
+
+#define ip46_address_type(ip46) (ip46_address_is_ip4(ip46)?IP46_TYPE_IP4:IP46_TYPE_IP6) /* IP46_TYPE_IP4 when ip46_address_is_ip4() holds, else IP46_TYPE_IP6 */
+#define ip46_prefix_is_ip4(ip46, len) ((len) >= 96 && ip46_address_is_ip4(ip46)) /* IPv4 prefixes are stored with plen = 96 + ip4_plen, hence the >= 96 test */
+#define ip46_prefix_type(ip46, len) (ip46_prefix_is_ip4(ip46, len)?IP46_TYPE_IP4:IP46_TYPE_IP6) /* same choice as ip46_address_type, but for an (address, length) prefix */
+
+void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen); /* NOTE(review): presumably clears the bits of *prefix beyond plen - confirm */
+uword unformat_ip46_prefix (unformat_input_t * input, va_list * args); /* unformat-style parser; expected va_list contents not visible here */
+u8 *format_ip46_prefix (u8 * s, va_list * args); /* format-style printer; expected va_list contents not visible here */
+
+
+extern kp_main_t kp_main; /* the plugin-wide state; defined outside this header */
+extern vlib_node_registration_t kp4_node; /* node registrations below are defined outside this header */
+extern vlib_node_registration_t kp6_node;
+extern vlib_node_registration_t kp4_nodeport_node;
+extern vlib_node_registration_t kp6_nodeport_node;
+extern vlib_node_registration_t kp_nat4_in2out_node;
+
+/**
+ * Set the global kube-proxy parameters (sticky_buckets, flow_timeout).
+ * @return 0 on success. VNET_KP_ERR_XXX on error
+ */
+int kp_conf(u32 sticky_buckets, u32 flow_timeout); /* NOTE(review): presumably feeds kp_main_t.per_cpu_sticky_buckets / flow_timeout - confirm */
+
+int kp_vip_add(ip46_address_t *prefix, u8 plen, kp_vip_type_t type,
+ u32 new_length, u32 *vip_index, /* NOTE(review): new_length is presumably the new_flow_table length (power of 2); *vip_index presumably receives the new index - confirm */
+ u16 port, u16 target_port, u16 node_port); /* kp_vip_t documents these three ports as network byte order */
+int kp_vip_del(u32 vip_index); /* NOTE(review): return convention presumably as kp_conf (0 on success) - confirm */
+
+int kp_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index); /* NOTE(review): presumably stores the matching index in *vip_index - confirm */
+
+#define kp_vip_get_by_index(index) (pool_is_free_index(kp_main.vips, index)?NULL:pool_elt_at_index(kp_main.vips, index)) /* NULL when index is free in kp_main.vips, else the kp_vip_t pointer; index is expanded twice */
+
+int kp_vip_add_pods(u32 vip_index, ip46_address_t *addresses, u32 n); /* NOTE(review): n is presumably the number of entries in addresses - confirm */
+int kp_vip_del_pods(u32 vip_index, ip46_address_t *addresses, u32 n); /* NOTE(review): n is presumably the number of entries in addresses - confirm */
+
+u32 kp_hash_time_now(vlib_main_t * vm); /* NOTE(review): presumably the time base for the u32 timestamps in kp_pod_t / kp_vip_t - confirm */
+
+void kp_garbage_collection(void); /* takes no arguments; "(void)" makes this a real prototype so stray arguments are diagnosed. NOTE(review): presumably frees unreferenced PODs (see kp_vip_t.last_garbage_collection) - confirm */
+
+int kp_nat4_interface_add_del (u32 sw_if_index, int is_del); /* NOTE(review): presumably enables (is_del == 0) or disables (is_del != 0) NAT4 handling on the interface - confirm */
+
+format_function_t format_kp_main; /* NOTE(review): presumably formats the global kp_main state - confirm */
+
+#endif /* KP_PLUGIN_KP_KP_H_ */