aboutsummaryrefslogtreecommitdiffstats
path: root/csit.infra.etl/coverage_reconf_rls2402.py
blob: dc1f647ff18c3d9d15b287dfd886a3cbca8f6f30 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python3

# Copyright (c) 2023 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ETL script running on top of the s3://"""

from datetime import datetime, timedelta
from json import load
from os import environ
from pytz import utc

import awswrangler as wr
from awswrangler.exceptions import EmptyDataFrame
from awsglue.context import GlueContext
from boto3 import session
from pyspark.context import SparkContext
from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType


# Bucket that is listed for input files (see PATH below).
S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
# Bucket that receives the parquet output.
S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
# Glob of object prefixes that are searched for input files.
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
# Only objects ending with SUFFIX are listed ...
SUFFIX="info.json.gz"
# ... except those ending with one of these suffixes.
IGNORE_SUFFIX=[
    "suite.info.json.gz",
    "setup.info.json.gz",
    "teardown.info.json.gz",
    "suite.output.info.json.gz",
    "setup.output.info.json.gz",
    "teardown.output.info.json.gz"
]
# Read the clock exactly once: taking year, month and day from separate
# datetime.now() calls can combine fields from two different days when the
# script starts right at midnight.
# NOTE(review): the naive local date is labelled as UTC, which is only exact
# when the host clock runs in UTC - confirm for the execution environment.
_NOW=datetime.now()
# Listing window: [yesterday 00:00, today 00:00).
LAST_MODIFIED_END=utc.localize(
    datetime(_NOW.year, _NOW.month, _NOW.day)
)
LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1)


def flatten_frame(nested_sdf):
    """Flatten the nested struct columns of a Spark DataFrame.

    Struct columns are expanded until only non-struct leaf columns remain.
    Each leaf is selected by its dotted path and aliased to the same path
    joined with underscores.

    :param nested_sdf: Spark DataFrame.
    :type nested_sdf: DataFrame
    :returns: DataFrame with one column per leaf field.
    :rtype: DataFrame
    """
    flat_columns = []
    # Work list of (path of parent column names, frame projected at that path).
    pending = [((), nested_sdf)]
    while pending:
        prefix, frame = pending.pop()
        for name, dtype in frame.dtypes:
            path = prefix + (name,)
            if dtype.startswith("struct"):
                # Expand the struct one level and visit its fields later.
                pending.append((path, frame.select(name + ".*")))
                continue
            flat_columns.append(col(".".join(path)).alias("_".join(path)))
    return nested_sdf.select(flat_columns)


def process_json_to_dataframe(schema_name, paths):
    """Load the JSON files matching a schema into one flat Spark DataFrame.

    The schema is read from ``coverage_<schema_name>.json`` in the current
    working directory. Only paths containing ``schema_name`` are loaded; each
    loaded file gets ``job`` and ``build`` columns taken from its path.

    :param schema_name: Schema name.
    :type schema_name: string
    :param paths: S3 paths to process.
    :type paths: list
    :returns: Spark DataFrame.
    :rtype: DataFrame
    """
    # read the schema definition
    schema_file = f"coverage_{schema_name}.json"
    with open(schema_file, "r", encoding="UTF-8") as f_schema:
        schema = StructType.fromJson(load(f_schema))

    # start from an empty frame that already carries the schema
    sdf = spark.createDataFrame([], schema)

    # keep only the paths that belong to this schema
    matching = list(filter(lambda item: schema_name in item, paths))

    # load every file and append it to the result
    for path in matching:
        print(path)

        segments = path.split("/")
        reader = spark.read.option("multiline", "true").schema(schema)
        loaded = reader.json(path)
        # path elements 4 and 5 are stored as the job and build columns
        loaded = loaded.withColumn("job", lit(segments[4]))
        loaded = loaded.withColumn("build", lit(segments[5]))
        sdf = sdf.unionByName(loaded, allowMissingColumns=True)

    # columns that must be non-null for a row to be kept
    required = [
        "dut_type", "dut_version",
        "passed",
        "test_name_long", "test_name_short",
        "test_type",
        "version"
    ]

    # drop rows that are entirely null, then rows missing a required column
    sdf = sdf.dropna(how="all").dropna(how="any", subset=required)

    # flatten frame
    return flatten_frame(sdf)


# create SparkContext and GlueContext
spark_context = SparkContext.getOrCreate()
spark_context.setLogLevel("WARN")
glue_context = GlueContext(spark_context)
spark = glue_context.spark_session

# files of interest: objects under PATH ending with SUFFIX (minus
# IGNORE_SUFFIX) that were modified inside the LAST_MODIFIED_* window
paths = wr.s3.list_objects(
    path=PATH,
    suffix=SUFFIX,
    last_modified_begin=LAST_MODIFIED_BEGIN,
    last_modified_end=LAST_MODIFIED_END,
    ignore_suffix=IGNORE_SUFFIX,
    ignore_empty=True
)

# keep only results of the 2402 coverage report jobs
filtered_paths = [path for path in paths if "report-coverage-2402" in path]

out_sdf = process_json_to_dataframe("reconf", filtered_paths)
out_sdf.show(truncate=False)
out_sdf.printSchema()

# Read the clock once so the year/month/day partition values always describe
# the same day, even when the job runs across midnight.
run_date = datetime.now()
out_sdf = out_sdf \
    .withColumn("year", lit(run_date.year)) \
    .withColumn("month", lit(run_date.month)) \
    .withColumn("day", lit(run_date.day)) \
    .repartition(1)

# Prepare the inputs outside the try block so that only the upload itself is
# guarded by the EmptyDataFrame handler.
out_pdf = out_sdf.toPandas()
out_session = session.Session(
    aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
    region_name=environ["OUT_AWS_DEFAULT_REGION"]
)

try:
    wr.s3.to_parquet(
        df=out_pdf,
        path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
        dataset=True,
        partition_cols=["test_type", "year", "month", "day"],
        compression="snappy",
        use_threads=True,
        mode="overwrite_partitions",
        boto3_session=out_session
    )
except EmptyDataFrame:
    # Nothing to write is not an error for this job, but say so explicitly
    # instead of finishing silently.
    print("No rows to write for coverage_rls2402, skipping parquet upload.")
gured length */ u32 configured_event_queue_length; /* * Config parameters */ /** Session ssvm segment configs*/ uword session_baseva; uword session_va_space_size; u32 evt_qs_segment_size; u8 evt_qs_use_memfd_seg; /** Session table size parameters */ u32 configured_v4_session_table_buckets; u32 configured_v4_session_table_memory; u32 configured_v4_halfopen_table_buckets; u32 configured_v4_halfopen_table_memory; u32 configured_v6_session_table_buckets; u32 configured_v6_session_table_memory; u32 configured_v6_halfopen_table_buckets; u32 configured_v6_halfopen_table_memory; /** Transport table (preallocation) size parameters */ u32 local_endpoints_table_memory; u32 local_endpoints_table_buckets; /** Preallocate session config parameter */ u32 preallocated_sessions; #if SESSION_DEBUG /** * last event poll time by thread * Debug only. Will cause false cache-line sharing as-is */ f64 *last_event_poll_by_thread; #endif }; typedef struct session_dgram_pre_hdr_ { u32 data_length; u32 data_offset; } session_dgram_pre_hdr_t; /* *INDENT-OFF* */ typedef CLIB_PACKED (struct session_dgram_header_ { u32 data_length; u32 data_offset; ip46_address_t rmt_ip; ip46_address_t lcl_ip; u16 rmt_port; u16 lcl_port; u8 is_ip4; }) session_dgram_hdr_t; /* *INDENT-ON* */ #define SESSION_CONN_ID_LEN 37 #define SESSION_CONN_HDR_LEN 45 STATIC_ASSERT (sizeof (session_dgram_hdr_t) == (SESSION_CONN_ID_LEN + 8), "session conn id wrong length"); extern session_manager_main_t session_manager_main; extern vlib_node_registration_t session_queue_node; /* * Session manager function */ always_inline session_manager_main_t * vnet_get_session_manager_main () { return &session_manager_main; } always_inline u8 stream_session_is_valid (u32 si, u8 thread_index) { stream_session_t *s; s = pool_elt_at_index (session_manager_main.sessions[thread_index], si); if (s->thread_index != thread_index || s->session_index != si /* || s->server_rx_fifo->master_session_index != si || s->server_tx_fifo->master_session_index != 
si || s->server_rx_fifo->master_thread_index != thread_index || s->server_tx_fifo->master_thread_index != thread_index */ ) return 0; return 1; } stream_session_t *session_alloc (u32 thread_index); int session_alloc_fifos (segment_manager_t * sm, stream_session_t * s); void session_free (stream_session_t * s); always_inline stream_session_t * session_get (u32 si, u32 thread_index) { ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } always_inline stream_session_t * session_get_if_valid (u64 si, u32 thread_index) { if (thread_index >= vec_len (session_manager_main.sessions)) return 0; if (pool_is_free_index (session_manager_main.sessions[thread_index], si)) return 0; ASSERT (stream_session_is_valid (si, thread_index)); return pool_elt_at_index (session_manager_main.sessions[thread_index], si); } always_inline session_handle_t session_handle (stream_session_t * s) { return ((u64) s->thread_index << 32) | (u64) s->session_index; } always_inline u32 session_index_from_handle (session_handle_t handle) { return handle & 0xFFFFFFFF; } always_inline u32 session_thread_from_handle (session_handle_t handle) { return handle >> 32; } always_inline void session_parse_handle (session_handle_t handle, u32 * index, u32 * thread_index) { *index = session_index_from_handle (handle); *thread_index = session_thread_from_handle (handle); } always_inline stream_session_t * session_get_from_handle (session_handle_t handle) { session_manager_main_t *smm = &session_manager_main; u32 session_index, thread_index; session_parse_handle (handle, &session_index, &thread_index); return pool_elt_at_index (smm->sessions[thread_index], session_index); } always_inline stream_session_t * session_get_from_handle_if_valid (session_handle_t handle) { u32 session_index, thread_index; session_parse_handle (handle, &session_index, &thread_index); return session_get_if_valid (session_index, thread_index); } always_inline u8 
session_handle_is_local (session_handle_t handle) { if ((handle >> 32) == SESSION_LOCAL_HANDLE_PREFIX) return 1; return 0; } always_inline transport_proto_t session_type_transport_proto (session_type_t st) { return (st >> 1); } always_inline u8 session_type_is_ip4 (session_type_t st) { return (st & 1); } always_inline transport_proto_t session_get_transport_proto (stream_session_t * s) { return (s->session_type >> 1); } always_inline fib_protocol_t session_get_fib_proto (stream_session_t * s) { u8 is_ip4 = s->session_type & 1; return (is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6); } always_inline session_type_t session_type_from_proto_and_ip (transport_proto_t proto, u8 is_ip4) { return (proto << 1 | is_ip4); } always_inline u8 session_has_transport (stream_session_t * s) { return (session_get_transport_proto (s) != TRANSPORT_PROTO_NONE); } always_inline transport_service_type_t session_transport_service_type (stream_session_t * s) { transport_proto_t tp; tp = session_get_transport_proto (s); return transport_protocol_service_type (tp); } /** * Acquires a lock that blocks a session pool from expanding. * * This is typically used for safely peeking into other threads' * pools in order to clone elements. Lock should be dropped as soon * as possible by calling @ref session_pool_remove_peeker. * * NOTE: Avoid using pool_elt_at_index while the lock is held because * it may lead to free elt bitmap expansion/contraction! 
*/ always_inline void session_pool_add_peeker (u32 thread_index) { session_manager_main_t *smm = &session_manager_main; if (thread_index == vlib_get_thread_index ()) return; clib_rwlock_reader_lock (&smm->peekers_rw_locks[thread_index]); } always_inline void session_pool_remove_peeker (u32 thread_index) { session_manager_main_t *smm = &session_manager_main; if (thread_index == vlib_get_thread_index ()) return; clib_rwlock_reader_unlock (&smm->peekers_rw_locks[thread_index]); } /** * Get session from handle and 'lock' pool resize if not in same thread * * Caller should drop the peek 'lock' as soon as possible. */ always_inline stream_session_t * session_get_from_handle_safe (u64 handle) { session_manager_main_t *smm = &session_manager_main; u32 thread_index = session_thread_from_handle (handle); if (thread_index == vlib_get_thread_index ()) { return pool_elt_at_index (smm->sessions[thread_index], session_index_from_handle (handle)); } else { session_pool_add_peeker (thread_index); /* Don't use pool_elt_at index. 
See @ref session_pool_add_peeker */ return smm->sessions[thread_index] + session_index_from_handle (handle); } } always_inline u32 stream_session_max_rx_enqueue (transport_connection_t * tc) { stream_session_t *s = session_get (tc->s_index, tc->thread_index); return svm_fifo_max_enqueue (s->server_rx_fifo); } always_inline u32 stream_session_rx_fifo_size (transport_connection_t * tc) { stream_session_t *s = session_get (tc->s_index, tc->thread_index); return s->server_rx_fifo->nitems; } always_inline u32 session_get_index (stream_session_t * s) { return (s - session_manager_main.sessions[s->thread_index]); } always_inline stream_session_t * session_clone_safe (u32 session_index, u32 thread_index) { stream_session_t *old_s, *new_s; u32 current_thread_index = vlib_get_thread_index (); /* If during the memcpy pool is reallocated AND the memory allocator * decides to give the old chunk of memory to somebody in a hurry to * scribble something on it, we have a problem. So add this thread as * a session pool peeker. 
*/ session_pool_add_peeker (thread_index); new_s = session_alloc (current_thread_index); old_s = session_manager_main.sessions[thread_index] + session_index; clib_memcpy (new_s, old_s, sizeof (*new_s)); session_pool_remove_peeker (thread_index); new_s->thread_index = current_thread_index; new_s->session_index = session_get_index (new_s); return new_s; } transport_connection_t *session_get_transport (stream_session_t * s); u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc); int session_enqueue_stream_connection (transport_connection_t * tc, vlib_buffer_t * b, u32 offset, u8 queue_event, u8 is_in_order); int session_enqueue_dgram_connection (stream_session_t * s, session_dgram_hdr_t * hdr, vlib_buffer_t * b, u8 proto, u8 queue_event); int stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); int session_stream_connect_notify (transport_connection_t * tc, u8 is_fail); int session_dgram_connect_notify (transport_connection_t * tc, u32 old_thread_index, stream_session_t ** new_session); void stream_session_init_fifos_pointers (transport_connection_t * tc, u32 rx_pointer, u32 tx_pointer); void stream_session_accept_notify (transport_connection_t * tc); void stream_session_disconnect_notify (transport_connection_t * tc); void stream_session_delete_notify (transport_connection_t * tc); void stream_session_reset_notify (transport_connection_t * tc); int stream_session_accept (transport_connection_t * tc, u32 listener_index, u8 notify); int session_open (u32 app_index, session_endpoint_t * tep, u32 opaque); int stream_session_listen (stream_session_t * s, session_endpoint_t * tep); int stream_session_stop_listen (stream_session_t * s); void stream_session_disconnect (stream_session_t * s); void stream_session_disconnect_transport (stream_session_t * s); void stream_session_cleanup (stream_session_t * s); void 
session_send_session_evt_to_thread (u64 session_handle, fifo_event_type_t evt_type, u32 thread_index); ssvm_private_t *session_manager_get_evt_q_segment (void); u8 *format_stream_session (u8 * s, va_list * args); uword unformat_stream_session (unformat_input_t * input, va_list * args); uword unformat_transport_connection (unformat_input_t * input, va_list * args); void session_register_transport (transport_proto_t transport_proto, const transport_proto_vft_t * vft, u8 is_ip4, u32 output_node); clib_error_t *vnet_session_enable_disable (vlib_main_t * vm, u8 is_en); always_inline svm_queue_t * session_manager_get_vpp_event_queue (u32 thread_index) { return session_manager_main.vpp_event_queues[thread_index]; } int session_manager_flush_enqueue_events (u8 proto, u32 thread_index); int session_manager_flush_all_enqueue_events (u8 transport_proto); always_inline u64 listen_session_get_handle (stream_session_t * s) { ASSERT (s->session_state == SESSION_STATE_LISTENING); return session_handle (s); } always_inline stream_session_t * listen_session_get_from_handle (session_handle_t handle) { return session_get_from_handle (handle); } always_inline void listen_session_parse_handle (session_handle_t handle, u32 * index, u32 * thread_index) { session_parse_handle (handle, index, thread_index); } always_inline stream_session_t * listen_session_new (u8 thread_index, session_type_t type) { stream_session_t *s; s = session_alloc (thread_index); s->session_type = type; s->session_state = SESSION_STATE_LISTENING; return s; } always_inline stream_session_t * listen_session_get (u32 index) { return session_get (index, 0); } always_inline void listen_session_del (stream_session_t * s) { session_free (s); } transport_connection_t *listen_session_get_transport (stream_session_t * s); int listen_session_get_local_session_endpoint (stream_session_t * listener, session_endpoint_t * sep); always_inline u8 session_manager_is_enabled () { return session_manager_main.is_enabled == 1; } #define 
session_cli_return_if_not_enabled() \ do { \ if (!session_manager_main.is_enabled) \ return clib_error_return(0, "session layer is not enabled"); \ } while (0) void session_node_enable_disable (u8 is_en); #endif /* __included_session_h__ */ /* * fd.io coding-style-patch-verification: ON * * Local Variables: * eval: (c-set-style "gnu") * End: */