/*
 * Copyright (c) 2016-2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_tcp_packet_h
#define included_tcp_packet_h

#include <vnet/vnet.h>

/* TCP flags bit 0 first. */
#define foreach_tcp_flag                                \
  _ (FIN) /**< No more data from sender. */             \
  _ (SYN) /**< Synchronize sequence numbers. */         \
  _ (RST) /**< Reset the connection. */                 \
  _ (PSH) /**< Push function. */                        \
  _ (ACK) /**< Ack field significant. */                \
  _ (URG) /**< Urgent pointer field significant. */     \
  _ (ECE) /**< ECN-echo. Receiver got CE packet */      \
  _ (CWR) /**< Sender reduced congestion window */

enum
{
#define _(f) TCP_FLAG_BIT_##f,
  foreach_tcp_flag
#undef _
    TCP_N_FLAG_BITS,
};

enum
{
#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f,
  foreach_tcp_flag
#undef _
};

typedef struct _tcp_header
{
  union
  {
    struct
    {
      u16 src_port; /**< Source port. */
      u16 dst_port; /**< Destination port. */
    };
    struct
    {
      u16 src, dst;
    };
  };

  u32 seq_number;	/**< Sequence number of the first data octet in this
                         *   segment, except when SYN is present. If SYN
                         *   is present the seq number is is the ISN and the
                         *   first data octet is ISN+1 */
  u32 ack_number;	/**< Acknowledgement number if ACK is set. It contains
                         *   the value of the next sequence number the sender
                         *   of the segment is expecting to receive. */
  u8 data_offset_and_reserved;
  u8 flags;		/**< Flags: see the macro above */
  u16 window;		/**< Number of bytes sender is willing to receive. */

  u16 checksum;		/**< Checksum of TCP pseudo header and data. */
  u16 urgent_pointer;	/**< Seq number of the byte after the urgent data. */
} __attribute__ ((packed)) tcp_header_t;

/* Flag tests that return 0 or !0 */
#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4)
#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN)
#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN)
#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST)
#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH)
#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK)
#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG)
#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE)
#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR)

/* Flag tests that return 0 or 1 */
#define tcp_is_syn(_th) !!((_th)->flags & TCP_FLAG_SYN)
#define tcp_is_fin(_th) !!((_th)->flags & TCP_FLAG_FIN)

always_inline int
tcp_header_bytes (tcp_header_t * t)
{
  return tcp_doff (t) * sizeof (u32);
}

/*
 * TCP options.
 */

typedef enum tcp_option_type
{
  TCP_OPTION_EOL = 0,			/**< End of options. */
  TCP_OPTION_NOOP = 1,			/**< No operation. */
  TCP_OPTION_MSS = 2,			/**< Limit MSS. */
  TCP_OPTION_WINDOW_SCALE = 3,		/**< Window scale. */
  TCP_OPTION_SACK_PERMITTED = 4,	/**< Selective Ack permitted. */
  TCP_OPTION_SACK_BLOCK = 5,		/**< Selective Ack block. */
  TCP_OPTION_TIMESTAMP = 8,		/**< Timestamps. */
  TCP_OPTION_UTO = 28,			/**< User timeout. */
  TCP_OPTION_AO = 29,			/**< Authentication Option. */
} tcp_option_type_t;

#define foreach_tcp_options_flag                                        \
  _ (MSS)               /**< MSS advertised in SYN */                   \
  _ (TSTAMP)            /**< Timestamp capability advertised in SYN */  \
  _ (WSCALE)            /**< Wnd scale capability advertised in SYN */  \
  _ (SACK_PERMITTED)    /**< SACK capability advertised in SYN */       \
  _ (SACK)		/**< SACK present */

enum
{
#define _(f) TCP_OPTS_FLAG_BIT_##f,
  foreach_tcp_options_flag
#undef _
    TCP_OPTIONS_N_FLAG_BITS,
};

enum
{
#define _(f) TCP_OPTS_FLAG_##f = 1 << TCP_OPTS_FLAG_BIT_##f,
  foreach_tcp_options_flag
#undef _
};

typedef struct _sack_block
{
  u32 start;		/**< Start sequence number */
  u32 end;		/**< End sequence number (first outside) */
} sack_block_t;

typedef struct
{
  sack_block_t *sacks;	/**< SACK blocks */
  u32 tsval;		/**< Timestamp value */
  u32 tsecr;		/**< Echoed/reflected time stamp */
  u16 mss;		/**< Maximum segment size advertised */
  u8 flags;		/**< Option flags, see above */
  u8 wscale;		/**< Window scale advertised */
  u8 n_sack_blocks;	/**< Number of SACKs blocks */
} tcp_options_t;

/* Flag tests that return 0 or !0 */
#define tcp_opts_mss(_to) ((_to)->flags & TCP_OPTS_FLAG_MSS)
#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP)
#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE)
#define tcp_opts_sack(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK)
#define tcp_opts_sack_permitted(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED)

/* TCP option lengths */
#define TCP_OPTION_LEN_EOL              1
#define TCP_OPTION_LEN_NOOP             1
#define TCP_OPTION_LEN_MSS              4
#define TCP_OPTION_LEN_WINDOW_SCALE     3
#define TCP_OPTION_LEN_SACK_PERMITTED   2
#define TCP_OPTION_LEN_TIMESTAMP        10
#define TCP_OPTION_LEN_SACK_BLOCK        8

#define TCP_HDR_LEN_MAX			60
#define TCP_WND_MAX                     65535U
#define TCP_MAX_WND_SCALE               14	/* See RFC 1323 */
#define TCP_OPTS_ALIGN                  4
#define TCP_OPTS_MAX_SACK_BLOCKS        3
#define TCP_MAX_GSO_SZ 			65536

/* Modulo arithmetic for TCP sequence numbers */
#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0)
#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0)
#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0)
#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0)
#define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2))

/* Modulo arithmetic for timestamps */
#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0)

/**
 * Parse TCP header options.
 *
 * @param th TCP header
 * @param to TCP options data structure to be populated
 * @param is_syn set if packet is syn
 * @return -1 if parsing failed
 */
always_inline int
tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn)
{
  const u8 *data;
  u8 opt_len, opts_len, kind;
  int j;
  sack_block_t b;

  opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
  data = (const u8 *) (th + 1);

  /* Zero out all flags but those set in SYN */
  to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
		| TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);

  for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
    {
      kind = data[0];

      /* Get options length */
      if (kind == TCP_OPTION_EOL)
	break;
      else if (kind == TCP_OPTION_NOOP)
	{
	  opt_len = 1;
	  continue;
	}
      else
	{
	  /* broken options */
	  if (opts_len < 2)
	    return -1;
	  opt_len = data[1];

	  /* weird option length */
	  if (opt_len < 2 || opt_len > opts_len)
	    return -1;
	}

      /* Parse options */
      switch (kind)
	{
	case TCP_OPTION_MSS:
	  if (!is_syn)
	    break;
	  if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
	    {
	      to->flags |= TCP_OPTS_FLAG_MSS;
	      to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
	    }
	  break;
	case TCP_OPTION_WINDOW_SCALE:
	  if (!is_syn)
	    break;
	  if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
	    {
	      to->flags |= TCP_OPTS_FLAG_WSCALE;
	      to->wscale = data[2];
	      if (to->wscale > TCP_MAX_WND_SCALE)
		to->wscale = TCP_MAX_WND_SCALE;
	    }
	  break;
	case TCP_OPTION_TIMESTAMP:
	  if (is_syn)
	    to->flags |= TCP_OPTS_FLAG_TSTAMP;
	  if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
	      && opt_len == TCP_OPTION_LEN_TIMESTAMP)
	    {
	      to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
	      to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
	    }
	  break;
	case TCP_OPTION_SACK_PERMITTED:
	  if (!is_syn)
	    break;
	  if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
	    to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
	  break;
	case TCP_OPTION_SACK_BLOCK:
	  /* If SACK permitted was not advertised or a SYN, break */
	  if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
	    break;

	  /* If too short or not correctly formatted, break */
	  if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
	    break;

	  to->flags |= TCP_OPTS_FLAG_SACK;
	  to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
	  vec_reset_length (to->sacks);
	  for (j = 0; j < to->n_sack_blocks; j++)
	    {
	      b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
	      b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
	      vec_add1 (to->sacks, b);
	    }
	  break;
	default:
	  /* Nothing to see here */
	  continue;
	}
    }
  return 0;
}

/**
 * Write TCP options to segment.
 *
 * @param data	buffer where to write the options
 * @param opts	options to write
 * @return	length of options written
 */
always_inline u32
tcp_options_write (u8 * data, tcp_options_t * opts)
{
  u32 opts_len = 0;
  u32 buf, seq_len = 4;

  if (tcp_opts_mss (opts))
    {
      *data++ = TCP_OPTION_MSS;
      *data++ = TCP_OPTION_LEN_MSS;
      buf = clib_host_to_net_u16 (opts->mss);
      clib_memcpy_fast (data, &buf, sizeof (opts->mss));
      data += sizeof (opts->mss);
      opts_len += TCP_OPTION_LEN_MSS;
    }

  if (tcp_opts_wscale (opts))
    {
      *data++ = TCP_OPTION_WINDOW_SCALE;
      *data++ = TCP_OPTION_LEN_WINDOW_SCALE;
      *data++ = opts->wscale;
      opts_len += TCP_OPTION_LEN_WINDOW_SCALE;
    }

  if (tcp_opts_sack_permitted (opts))
    {
      *data++ = TCP_OPTION_SACK_PERMITTED;
      *data++ = TCP_OPTION_LEN_SACK_PERMITTED;
      opts_len += TCP_OPTION_LEN_SACK_PERMITTED;
    }

  if (tcp_opts_tstamp (opts))
    {
      *data++ = TCP_OPTION_TIMESTAMP;
      *data++ = TCP_OPTION_LEN_TIMESTAMP;
      buf = clib_host_to_net_u32 (opts->tsval);
      clib_memcpy_fast (data, &buf, sizeof (opts->tsval));
      data += sizeof (opts->tsval);
      buf = clib_host_to_net_u32 (opts->tsecr);
      clib_memcpy_fast (data, &buf, sizeof (opts->tsecr));
      data += sizeof (opts->tsecr);
      opts_len += TCP_OPTION_LEN_TIMESTAMP;
    }

  if (tcp_opts_sack (opts))
    {
      int i;

      if (opts->n_sack_blocks != 0)
	{
	  *data++ = TCP_OPTION_SACK_BLOCK;
	  *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
	  for (i = 0; i < opts->n_sack_blocks; i++)
	    {
	      buf = clib_host_to_net_u32 (opts->sacks[i].start);
	      clib_memcpy_fast (data, &buf, seq_len);
	      data += seq_len;
	      buf = clib_host_to_net_u32 (opts->sacks[i].end);
	      clib_memcpy_fast (data, &buf, seq_len);
	      data += seq_len;
	    }
	  opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
	}
    }

  /* Terminate TCP options by padding with NOPs to a u32 boundary. Avoid using
   * EOL because, it seems, it can break peers with broken option parsers that
   * rely on options ending on a u32 boundary.
   */
  while (opts_len % 4)
    {
      *data++ = TCP_OPTION_NOOP;
      opts_len += TCP_OPTION_LEN_NOOP;
    }
  return opts_len;
}

#endif /* included_tcp_packet_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */