/*
 * Copyright (c) 2017 Intel and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * kp-plugin implements a MagLev-like load balancer.
 * http://research.google.com/pubs/pub44824.html
 *
 * It hasn't been tested for interoperability with the original MagLev
 * but intends to provide similar functionality.
 * The kube-proxy receives traffic destined to VIP (Virtual IP)
 * addresses from one or multiple (ECMP) routers.
 * The kube-proxy tunnels the traffic toward many application servers
 * ensuring session stickiness (i.e. that a single session is tunneled
 * towards a single application server).
* */ #ifndef KP_PLUGIN_KP_KP_H_ #define KP_PLUGIN_KP_KP_H_ #include #include #include #include #include #include #include #define KP_DEFAULT_PER_CPU_STICKY_BUCKETS 1 << 10 #define KP_DEFAULT_FLOW_TIMEOUT 40 #define KP_MAPPING_BUCKETS 1024 #define KP_MAPPING_MEMORY_SIZE 64<<20 typedef enum { KP_NEXT_DROP, KP_N_NEXT, } kp_next_t; typedef enum { KP_NAT4_IN2OUT_NEXT_DROP, KP_NAT4_IN2OUT_NEXT_LOOKUP, KP_NAT4_IN2OUT_N_NEXT, } kp_nat4_in2out_next_t; #define foreach_kp_nat_in2out_error \ _(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ _(IN2OUT_PACKETS, "Good in2out packets processed") \ _(NO_TRANSLATION, "No translation") typedef enum { #define _(sym,str) KP_NAT_IN2OUT_ERROR_##sym, foreach_kp_nat_in2out_error #undef _ KP_NAT_IN2OUT_N_ERROR, } kp_nat_in2out_error_t; /** * kube-proxy supports three types of service */ typedef enum { KP_SVR_TYPE_VIP_PORT, KP_SVR_TYPE_NODEIP_PORT, KP_SVR_TYPE_EXT_LB, KP_SVR_N_TYPES, } kp_svr_type_t; typedef enum { KP_NODEPORT_NEXT_IP4_NAT4, KP_NODEPORT_NEXT_IP4_NAT6, KP_NODEPORT_NEXT_IP6_NAT4, KP_NODEPORT_NEXT_IP6_NAT6, KP_NODEPORT_NEXT_DROP, KP_NODEPORT_N_NEXT, } kp_nodeport_next_t; /** * Each VIP is configured with a set of PODs */ typedef struct { /** * Registration to FIB event. */ fib_node_t fib_node; /** * Destination address used to transfer traffic towards to that POD. * The address is also used pod ID and pseudo-random * seed for the load-balancing process. */ ip46_address_t address; /** * PODs are indexed by address and VIP Index. * Which means there will be duplicated if the same server * address is used for multiple VIPs. */ u32 vip_index; /** * Some per-POD flags. * For now only KP_POD_FLAGS_USED is defined. */ u8 flags; #define KP_POD_FLAGS_USED 0x1 /** * Rotating timestamp of when KP_POD_FLAGS_USED flag was last set. * * POD removal is based on garbage collection and reference counting. 
* When an POD is removed, there is a race between configuration core * and worker cores which may still add a reference while it should not * be used. This timestamp is used to not remove the POD while a race condition * may happen. */ u32 last_used; /** * The FIB entry index for the next-hop */ fib_node_index_t next_hop_fib_entry_index; /** * The child index on the FIB entry */ u32 next_hop_child_index; /** * The next DPO in the graph to follow. */ dpo_id_t dpo; } kp_pod_t; format_function_t format_kp_pod; typedef struct { u32 pod_index; } kp_new_flow_entry_t; #define kp_foreach_vip_counter \ _(NEXT_PACKET, "packet from existing sessions", 0) \ _(FIRST_PACKET, "first session packet", 1) \ _(UNTRACKED_PACKET, "untracked packet", 2) \ _(NO_SERVER, "no server configured", 3) typedef enum { #define _(a,b,c) KP_VIP_COUNTER_##a = c, kp_foreach_vip_counter #undef _ KP_N_VIP_COUNTERS } kp_vip_counter_t; /** * kube-proxy supports IPv4 and IPv6 traffic * and NAT4 and NAT6. */ typedef enum { KP_VIP_TYPE_IP4_NAT44, KP_VIP_TYPE_IP4_NAT46, KP_VIP_TYPE_IP6_NAT64, KP_VIP_TYPE_IP6_NAT66, KP_VIP_N_TYPES, } kp_vip_type_t; format_function_t format_kp_vip_type; unformat_function_t unformat_kp_vip_type; /** * Load balancing service is provided per VIP. * In this data model, a VIP can be a whole prefix. * But load balancing only * occurs on a per-source-address/port basis. Meaning that if a given source * reuses the same port for multiple destinations within the same VIP, * they will be considered as a single flow. */ typedef struct { //Runtime /** * Vector mapping (flow-hash & new_connect_table_mask) to POD index. * This is used for new flows. */ kp_new_flow_entry_t *new_flow_table; /** * New flows table length - 1 * (length MUST be a power of 2) */ u32 new_flow_table_mask; /** * last time garbage collection was run to free the PODs. */ u32 last_garbage_collection; //Not runtime /** * A Virtual IP represents a given service delivered * by a set of PODs. 
It can be a single * address or a prefix. * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address * (i.e. ::/96 prefix). */ ip46_address_t prefix; /** * The VIP prefix length. * In case of IPv4, plen = 96 + ip4_plen. */ u8 plen; /** * Service port. network byte order */ u16 port; /** * Pod's port corresponding to specific service. network byte order */ u16 target_port; /** * Node's port, can access service via NodeIP:node_port. network byte order */ u16 node_port; /** * The type of traffic for this. * KP_TYPE_UNDEFINED if unknown. */ kp_vip_type_t type; /** * Flags related to this VIP. * KP_VIP_FLAGS_USED means the VIP is active. * When it is not set, the VIP in the process of being removed. * We cannot immediately remove a VIP because the VIP index still may be stored * in the adjacency index. */ u8 flags; #define KP_VIP_FLAGS_USED 0x1 /** * Pool of POD indexes used for this VIP. * This also includes PODs that have been removed (but are still referenced). */ u32 *pod_indexes; } kp_vip_t; /* * mapping from nodeport to vip_index */ typedef struct { u32 vip_index; } kp_nodeport_t; #define kp_vip_is_ip4(vip) ((vip)->type == KP_VIP_TYPE_IP4_NAT44 \ || (vip)->type == KP_VIP_TYPE_IP4_NAT46) #define kp_vip_is_nat4(vip) ((vip)->type == KP_VIP_TYPE_IP6_NAT64 \