From 8e131d171691ea86bd3f93cd9cae963d7e9d6f08 Mon Sep 17 00:00:00 2001 From: pmikus Date: Thu, 12 Jan 2023 13:17:50 +0000 Subject: feat(etl): rls2302 + hoststack Signed-off-by: pmikus Change-Id: I511265b8d38793f74d23f73e434fa241d969b560 --- csit.infra.etl/coverage_device_rls2210.py | 170 --------------- csit.infra.etl/coverage_device_rls2302.py | 170 +++++++++++++++ csit.infra.etl/coverage_hoststack.json | 223 ++++++++++++++++++++ csit.infra.etl/coverage_hoststack_rls2302.py | 171 ++++++++++++++++ csit.infra.etl/coverage_mrr_rls2210.py | 170 --------------- csit.infra.etl/coverage_mrr_rls2302.py | 170 +++++++++++++++ csit.infra.etl/coverage_ndrpdr_rls2210.py | 170 --------------- csit.infra.etl/coverage_ndrpdr_rls2302.py | 170 +++++++++++++++ csit.infra.etl/coverage_reconf.json | 223 ++++++++++++++++++++ csit.infra.etl/coverage_reconf_rls2210.json | 223 -------------------- csit.infra.etl/coverage_reconf_rls2210.py | 171 ---------------- csit.infra.etl/coverage_reconf_rls2302.py | 171 ++++++++++++++++ csit.infra.etl/coverage_soak_rls2210.py | 170 --------------- csit.infra.etl/coverage_soak_rls2302.py | 170 +++++++++++++++ csit.infra.etl/iterative_hoststack.json | 285 ++++++++++++++++++++++++++ csit.infra.etl/iterative_hoststack_rls2302.py | 171 ++++++++++++++++ csit.infra.etl/iterative_mrr_rls2210.py | 170 --------------- csit.infra.etl/iterative_mrr_rls2302.py | 170 +++++++++++++++ csit.infra.etl/iterative_ndrpdr_rls2210.py | 170 --------------- csit.infra.etl/iterative_ndrpdr_rls2302.py | 170 +++++++++++++++ csit.infra.etl/iterative_reconf.json | 223 ++++++++++++++++++++ csit.infra.etl/iterative_reconf_rls2210.json | 223 -------------------- csit.infra.etl/iterative_reconf_rls2210.py | 171 ---------------- csit.infra.etl/iterative_reconf_rls2302.py | 171 ++++++++++++++++ csit.infra.etl/iterative_soak_rls2210.py | 170 --------------- csit.infra.etl/iterative_soak_rls2302.py | 170 +++++++++++++++ csit.infra.etl/local.py | 2 +- csit.infra.etl/stats.py | 2 +- csit.infra.etl/trending_hoststack.json | 285 ++++++++++++++++++++++++++ csit.infra.etl/trending_hoststack.py | 171 ++++++++++++++++ csit.infra.etl/trending_mrr.py | 2 +- csit.infra.etl/trending_ndrpdr.py | 2 +- csit.infra.etl/trending_reconf.py | 2 +- csit.infra.etl/trending_soak.py | 2 +- 34 files changed, 3290 insertions(+), 1984 deletions(-) delete mode 100644 csit.infra.etl/coverage_device_rls2210.py create mode 100644 csit.infra.etl/coverage_device_rls2302.py create mode 100644 csit.infra.etl/coverage_hoststack.json create mode 100644 csit.infra.etl/coverage_hoststack_rls2302.py delete mode 100644 csit.infra.etl/coverage_mrr_rls2210.py create mode 100644 csit.infra.etl/coverage_mrr_rls2302.py delete mode 100644 csit.infra.etl/coverage_ndrpdr_rls2210.py create mode 100644 csit.infra.etl/coverage_ndrpdr_rls2302.py create mode 100644 csit.infra.etl/coverage_reconf.json delete mode 100644 csit.infra.etl/coverage_reconf_rls2210.json delete mode 100644 csit.infra.etl/coverage_reconf_rls2210.py create mode 100644 csit.infra.etl/coverage_reconf_rls2302.py delete mode 100644 csit.infra.etl/coverage_soak_rls2210.py create mode 100644 csit.infra.etl/coverage_soak_rls2302.py create mode 100644 csit.infra.etl/iterative_hoststack.json create mode 100644 csit.infra.etl/iterative_hoststack_rls2302.py delete mode 100644 csit.infra.etl/iterative_mrr_rls2210.py create mode 100644 csit.infra.etl/iterative_mrr_rls2302.py delete mode 100644 csit.infra.etl/iterative_ndrpdr_rls2210.py create mode 100644 csit.infra.etl/iterative_ndrpdr_rls2302.py create 
mode 100644 csit.infra.etl/iterative_reconf.json delete mode 100644 csit.infra.etl/iterative_reconf_rls2210.json delete mode 100644 csit.infra.etl/iterative_reconf_rls2210.py create mode 100644 csit.infra.etl/iterative_reconf_rls2302.py delete mode 100644 csit.infra.etl/iterative_soak_rls2210.py create mode 100644 csit.infra.etl/iterative_soak_rls2302.py create mode 100644 csit.infra.etl/trending_hoststack.json create mode 100644 csit.infra.etl/trending_hoststack.py (limited to 'csit.infra.etl') diff --git a/csit.infra.etl/coverage_device_rls2210.py b/csit.infra.etl/coverage_device_rls2210.py deleted file mode 100644 index 0822d13a9a..0000000000 --- a/csit.infra.etl/coverage_device_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-coverage-2210" in path] - -out_sdf = process_json_to_dataframe("device", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/coverage_device_rls2302.py b/csit.infra.etl/coverage_device_rls2302.py new file mode 100644 index 0000000000..26c49d9b71 --- /dev/null +++ b/csit.infra.etl/coverage_device_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
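Each ETL script in this change, the removed rls2210 ones and the added rls2302 ones alike, limits the S3 listing to objects uploaded during the previous day. A minimal sketch of that window computation, runnable outside Glue; the printed values are only an example.

from datetime import datetime, timedelta
from pytz import utc

# Midnight UTC of the current day is the exclusive upper bound of the window,
# and the lower bound is exactly one day earlier, so wr.s3.list_objects() only
# returns result files uploaded during the previous day.
now = datetime.now()
last_modified_end = utc.localize(
    datetime.strptime(f"{now.year}-{now.month}-{now.day}", "%Y-%m-%d")
)
last_modified_begin = last_modified_end - timedelta(1)

# e.g. when run on 2023-01-12:
#   2023-01-11 00:00:00+00:00 -> 2023-01-12 00:00:00+00:00
print(last_modified_begin, "->", last_modified_end)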
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-coverage-2302" in path] + +out_sdf = process_json_to_dataframe("device", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/coverage_hoststack.json b/csit.infra.etl/coverage_hoststack.json new file mode 100644 index 0000000000..fdd6eab6c0 --- /dev/null +++ b/csit.infra.etl/coverage_hoststack.json @@ -0,0 +1,223 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "job", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "build", + "nullable": false, + "type": "integer" + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dut_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dut_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "result", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "aggregate_rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "bandwidth", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + 
"type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "loss", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "packets", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "integer" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "time", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "start_time", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passed", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "test_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_long", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_short", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tags", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "version", + "nullable": true, + "type": "string" + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/csit.infra.etl/coverage_hoststack_rls2302.py b/csit.infra.etl/coverage_hoststack_rls2302.py new file mode 100644 index 0000000000..4ec286f679 --- /dev/null +++ b/csit.infra.etl/coverage_hoststack_rls2302.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-coverage-2302" in path] + +out_sdf = process_json_to_dataframe("hoststack", filtered_paths) +out_sdf.show(truncate=False) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/coverage_mrr_rls2210.py b/csit.infra.etl/coverage_mrr_rls2210.py deleted file mode 100644 index 9c9e1c9603..0000000000 --- a/csit.infra.etl/coverage_mrr_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-coverage-2210" in path] - -out_sdf = process_json_to_dataframe("mrr", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/coverage_mrr_rls2302.py b/csit.infra.etl/coverage_mrr_rls2302.py new file mode 100644 index 0000000000..7ed4c93e79 --- /dev/null +++ b/csit.infra.etl/coverage_mrr_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
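The flatten_frame() helper repeated in every script unwinds nested struct columns via an explicit stack and renames each leaf column to the underscore-joined path of its parents. A hypothetical example of its effect; the sample row is invented for illustration.

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

# One invented row with a nested "result" struct, similar in shape to the
# hoststack schema above.
nested = spark.createDataFrame([
    Row(job="csit-vpp-perf-report-coverage-2302-2n-icx",
        result=Row(rate=Row(unit="cps", value=1.5e6)))
])

flat = flatten_frame(nested)   # helper defined in the scripts above
print(flat.columns)            # ['job', 'result_rate_unit', 'result_rate_value']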
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-coverage-2302" in path] + +out_sdf = process_json_to_dataframe("mrr", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/coverage_ndrpdr_rls2210.py b/csit.infra.etl/coverage_ndrpdr_rls2210.py deleted file mode 100644 index c0bd1d3c19..0000000000 --- a/csit.infra.etl/coverage_ndrpdr_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-coverage-2210" in path] - -out_sdf = process_json_to_dataframe("ndrpdr", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/coverage_ndrpdr_rls2302.py b/csit.infra.etl/coverage_ndrpdr_rls2302.py new file mode 100644 index 0000000000..fd0f496b12 --- /dev/null +++ b/csit.infra.etl/coverage_ndrpdr_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
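Every loaded info.json gets job and build columns taken from fixed positions in its S3 key. The key below is hypothetical, inferred from the PATH glob, and is shown only to make the split("/") indices concrete.

# Hypothetical key under s3://fdio-logs-s3-cloudfront-index/ (layout assumed).
path = (
    "s3://fdio-logs-s3-cloudfront-index/vex-yul-rot-jenkins-1/"
    "csit-vpp-perf-report-coverage-2302-2n-icx/55/archives/test.info.json.gz"
)
parts = path.split("/")
# parts[0]='s3:', parts[1]='', parts[2]=bucket, parts[3]=Jenkins instance,
# parts[4]=Jenkins job name, parts[5]=build number
print(parts[4])   # csit-vpp-perf-report-coverage-2302-2n-icx
print(parts[5])   # 55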
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-coverage-2302" in path] + +out_sdf = process_json_to_dataframe("ndrpdr", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/coverage_reconf.json b/csit.infra.etl/coverage_reconf.json new file mode 100644 index 0000000000..fdd6eab6c0 --- /dev/null +++ b/csit.infra.etl/coverage_reconf.json @@ -0,0 +1,223 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "job", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "build", + "nullable": false, + "type": "integer" + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dut_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dut_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "result", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "aggregate_rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "bandwidth", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": 
"double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "loss", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "packets", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "integer" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "time", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "start_time", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passed", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "test_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_long", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_short", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tags", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "version", + "nullable": true, + "type": "string" + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/csit.infra.etl/coverage_reconf_rls2210.json b/csit.infra.etl/coverage_reconf_rls2210.json deleted file mode 100644 index fdd6eab6c0..0000000000 --- a/csit.infra.etl/coverage_reconf_rls2210.json +++ /dev/null @@ -1,223 +0,0 @@ -{ - "fields": [ - { - "metadata": {}, - "name": "job", - "nullable": false, - "type": "string" - }, - { - "metadata": {}, - "name": "build", - "nullable": false, - "type": "integer" - }, - { - "metadata": {}, - "name": "duration", - "nullable": true, - "type": "double" - }, - { - "metadata": {}, - "name": "dut_type", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "dut_version", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "tg_type", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "tg_version", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "result", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "aggregate_rate", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "bandwidth", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "value", - "nullable": true, - "type": "double" - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "rate", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - 
"name": "value", - "nullable": true, - "type": "double" - } - ], - "type": "struct" - } - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "loss", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "packets", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "value", - "nullable": true, - "type": "integer" - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "time", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "value", - "nullable": true, - "type": "double" - } - ], - "type": "struct" - } - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "type", - "nullable": true, - "type": "string" - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "start_time", - "nullable": true, - "type": "timestamp" - }, - { - "metadata": {}, - "name": "passed", - "nullable": true, - "type": "boolean" - }, - { - "metadata": {}, - "name": "test_id", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "test_name_long", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "test_name_short", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "test_type", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "tags", - "nullable": true, - "type": { - "containsNull": true, - "elementType": "string", - "type": "array" - } - }, - { - "metadata": {}, - "name": "version", - "nullable": true, - "type": "string" - } - ], - "type": "struct" -} \ No newline at end of file diff --git a/csit.infra.etl/coverage_reconf_rls2210.py b/csit.infra.etl/coverage_reconf_rls2210.py deleted file mode 100644 index b2a540d9e4..0000000000 --- a/csit.infra.etl/coverage_reconf_rls2210.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-coverage-2210" in path] - -out_sdf = process_json_to_dataframe("reconf", filtered_paths) -out_sdf.show(truncate=False) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/coverage_reconf_rls2302.py b/csit.infra.etl/coverage_reconf_rls2302.py new file mode 100644 index 0000000000..76e8c6f9ce --- /dev/null +++ b/csit.infra.etl/coverage_reconf_rls2302.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
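process_json_to_dataframe() accumulates every per-build frame into the schema-seeded empty frame by column name, then drops rows missing any of the critical columns. A self-contained sketch of that pattern with two toy frames:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

a = spark.createDataFrame([("job-a", 1, True)],
                          "job string, build int, passed boolean")
b = spark.createDataFrame([("job-b", 2)],
                          "job string, build int")          # no "passed" column

# Columns missing from one side become nulls instead of raising an error ...
merged = a.unionByName(b, allowMissingColumns=True)
# ... and rows that lack a critical column are dropped afterwards.
merged = merged.na.drop(how="any", subset=["passed"])
merged.show()   # only the ("job-a", 1, true) row survives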
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-coverage-2302" in path] + +out_sdf = process_json_to_dataframe("reconf", filtered_paths) +out_sdf.show(truncate=False) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/coverage_soak_rls2210.py b/csit.infra.etl/coverage_soak_rls2210.py deleted file mode 100644 index 9ef9a36c68..0000000000 --- a/csit.infra.etl/coverage_soak_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-coverage-2210" in path] - -out_sdf = process_json_to_dataframe("soak", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/coverage_soak_rls2302.py b/csit.infra.etl/coverage_soak_rls2302.py new file mode 100644 index 0000000000..7cd171470d --- /dev/null +++ b/csit.infra.etl/coverage_soak_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
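The LAST_MODIFIED_BEGIN / LAST_MODIFIED_END constants in the coverage soak script below bound the S3 listing to roughly the previous day of uploads: END is today at 00:00 UTC, BEGIN is 24 hours earlier. A rough standard-library approximation of that window, assuming the intent is simply "the previous UTC day" (the scripts themselves format datetime.now() and reparse it with pytz):

    from datetime import datetime, time, timedelta, timezone

    # Midnight UTC today, then one day back.
    last_modified_end = datetime.combine(
        datetime.now(timezone.utc).date(), time.min, tzinfo=timezone.utc
    )
    last_modified_begin = last_modified_end - timedelta(days=1)
    print(last_modified_begin.isoformat(), last_modified_end.isoformat())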
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"coverage_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-coverage-2302" in path] + +out_sdf = process_json_to_dataframe("soak", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/iterative_hoststack.json b/csit.infra.etl/iterative_hoststack.json new file mode 100644 index 0000000000..a3365cdba0 --- /dev/null +++ b/csit.infra.etl/iterative_hoststack.json @@ -0,0 +1,285 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "job", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "build", + "nullable": false, + "type": "integer" + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dut_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dut_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "hosts", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "tg_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "result", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "bandwidth", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + 
"metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "latency", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "failed_requests", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "completed_requests", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "retransmits", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "start_time", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passed", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "telemetry", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "test_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_long", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_short", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "message", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "version", + "nullable": true, + "type": "string" + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/csit.infra.etl/iterative_hoststack_rls2302.py b/csit.infra.etl/iterative_hoststack_rls2302.py new file mode 100644 index 0000000000..6ecd723f34 --- /dev/null +++ b/csit.infra.etl/iterative_hoststack_rls2302.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-iterative-2302" in path] + +out_sdf = process_json_to_dataframe("hoststack", filtered_paths) +out_sdf.show(truncate=False) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/iterative_mrr_rls2210.py b/csit.infra.etl/iterative_mrr_rls2210.py deleted file mode 100644 index b7a8dbcbfa..0000000000 --- a/csit.infra.etl/iterative_mrr_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-iterative-2210" in path] - -out_sdf = process_json_to_dataframe("mrr", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/iterative_mrr_rls2302.py b/csit.infra.etl/iterative_mrr_rls2302.py new file mode 100644 index 0000000000..6ee7971aee --- /dev/null +++ b/csit.infra.etl/iterative_mrr_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
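The iterative MRR script below, like its siblings, derives the job and build columns purely from the position of segments in the S3 key. A small standalone check of that indexing, with a hypothetical key following the vex-yul-rot-jenkins-1 layout:

    # Hypothetical key: s3://<logs-bucket>/vex-yul-rot-jenkins-1/<jenkins-job>/<build>/...
    path = (
        "s3://fdio-logs-s3-cloudfront-index/vex-yul-rot-jenkins-1/"
        "csit-vpp-perf-report-iterative-2302-2n-icx/42/tests/suite/test.info.json.gz"
    )
    parts = path.split("/")
    print(parts[4])  # csit-vpp-perf-report-iterative-2302-2n-icx -> "job" column
    print(parts[5])  # 42 -> "build" column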
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-iterative-2302" in path] + +out_sdf = process_json_to_dataframe("mrr", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/iterative_ndrpdr_rls2210.py b/csit.infra.etl/iterative_ndrpdr_rls2210.py deleted file mode 100644 index 70ab8158a8..0000000000 --- a/csit.infra.etl/iterative_ndrpdr_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-iterative-2210" in path] - -out_sdf = process_json_to_dataframe("ndrpdr", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/iterative_ndrpdr_rls2302.py b/csit.infra.etl/iterative_ndrpdr_rls2302.py new file mode 100644 index 0000000000..81e6f489c2 --- /dev/null +++ b/csit.infra.etl/iterative_ndrpdr_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
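Each of these jobs reads its expected column layout from one of the JSON schema files in this directory (for the ndrpdr script below, iterative_ndrpdr.json) and turns it into a Spark StructType. A toy version of that round trip with a two-field schema in the same format as the iterative_*.json and coverage_*.json files:

    from json import loads
    from pyspark.sql.types import StructType

    schema_json = """
    {
      "type": "struct",
      "fields": [
        {"metadata": {}, "name": "job", "nullable": false, "type": "string"},
        {"metadata": {}, "name": "build", "nullable": false, "type": "integer"}
      ]
    }
    """
    schema = StructType.fromJson(loads(schema_json))
    print(schema.simpleString())  # struct<job:string,build:int>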
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-iterative-2302" in path] + +out_sdf = process_json_to_dataframe("ndrpdr", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/iterative_reconf.json b/csit.infra.etl/iterative_reconf.json new file mode 100644 index 0000000000..fdd6eab6c0 --- /dev/null +++ b/csit.infra.etl/iterative_reconf.json @@ -0,0 +1,223 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "job", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "build", + "nullable": false, + "type": "integer" + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dut_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dut_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "result", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "aggregate_rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "bandwidth", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + 
"type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "loss", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "packets", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "integer" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "time", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "start_time", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passed", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "test_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_long", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_short", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tags", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "version", + "nullable": true, + "type": "string" + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/csit.infra.etl/iterative_reconf_rls2210.json b/csit.infra.etl/iterative_reconf_rls2210.json deleted file mode 100644 index fdd6eab6c0..0000000000 --- a/csit.infra.etl/iterative_reconf_rls2210.json +++ /dev/null @@ -1,223 +0,0 @@ -{ - "fields": [ - { - "metadata": {}, - "name": "job", - "nullable": false, - "type": "string" - }, - { - "metadata": {}, - "name": "build", - "nullable": false, - "type": "integer" - }, - { - "metadata": {}, - "name": "duration", - "nullable": true, - "type": "double" - }, - { - "metadata": {}, - "name": "dut_type", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "dut_version", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "tg_type", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "tg_version", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "result", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "aggregate_rate", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "bandwidth", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "value", - "nullable": true, - "type": "double" - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "rate", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - 
"metadata": {}, - "name": "value", - "nullable": true, - "type": "double" - } - ], - "type": "struct" - } - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "loss", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "packets", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "value", - "nullable": true, - "type": "integer" - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "time", - "nullable": true, - "type": { - "fields": [ - { - "metadata": {}, - "name": "unit", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "value", - "nullable": true, - "type": "double" - } - ], - "type": "struct" - } - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "type", - "nullable": true, - "type": "string" - } - ], - "type": "struct" - } - }, - { - "metadata": {}, - "name": "start_time", - "nullable": true, - "type": "timestamp" - }, - { - "metadata": {}, - "name": "passed", - "nullable": true, - "type": "boolean" - }, - { - "metadata": {}, - "name": "test_id", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "test_name_long", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "test_name_short", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "test_type", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "tags", - "nullable": true, - "type": { - "containsNull": true, - "elementType": "string", - "type": "array" - } - }, - { - "metadata": {}, - "name": "version", - "nullable": true, - "type": "string" - } - ], - "type": "struct" -} \ No newline at end of file diff --git a/csit.infra.etl/iterative_reconf_rls2210.py b/csit.infra.etl/iterative_reconf_rls2210.py deleted file mode 100644 index 836cf81de4..0000000000 --- a/csit.infra.etl/iterative_reconf_rls2210.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-iterative-2210" in path] - -out_sdf = process_json_to_dataframe("reconf", filtered_paths) -out_sdf.show(truncate=False) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/iterative_reconf_rls2302.py b/csit.infra.etl/iterative_reconf_rls2302.py new file mode 100644 index 0000000000..47f27a22a1 --- /dev/null +++ b/csit.infra.etl/iterative_reconf_rls2302.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
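All of the rls2302 jobs write into the same dataset prefix and rely on awswrangler's dataset mode to lay the data out by the partition columns. For the reconf script below, the resulting keys are grouped roughly as sketched here (illustrative key only; the actual file names are generated by awswrangler):

    # dataset=True + partition_cols=["test_type", "year", "month", "day"] yields
    # Hive-style key prefixes under the configured parquet path, e.g.:
    bucket = "fdio-docs-s3-cloudfront-index"
    prefix = "csit/parquet/iterative_rls2302"
    example_key = (
        f"s3://{bucket}/{prefix}/"
        "test_type=reconf/year=2023/month=1/day=12/<generated>.snappy.parquet"
    )
    print(example_key)
    # mode="overwrite_partitions" then replaces only the partitions written in
    # the current run, leaving earlier days untouched.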
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-iterative-2302" in path] + +out_sdf = process_json_to_dataframe("reconf", filtered_paths) +out_sdf.show(truncate=False) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/iterative_soak_rls2210.py b/csit.infra.etl/iterative_soak_rls2210.py deleted file mode 100644 index b74d7b44dc..0000000000 --- a/csit.infra.etl/iterative_soak_rls2210.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2022 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""ETL script running on top of the s3://""" - -from datetime import datetime, timedelta -from json import load -from os import environ -from pytz import utc - -import awswrangler as wr -from awswrangler.exceptions import EmptyDataFrame -from awsglue.context import GlueContext -from boto3 import session -from pyspark.context import SparkContext -from pyspark.sql.functions import col, lit, regexp_replace -from pyspark.sql.types import StructType - - -S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" -S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" -PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" -SUFFIX="info.json.gz" -IGNORE_SUFFIX=[ - "suite.info.json.gz", - "setup.info.json.gz", - "teardown.info.json.gz", - "suite.output.info.json.gz", - "setup.output.info.json.gz", - "teardown.output.info.json.gz" -] -LAST_MODIFIED_END=utc.localize( - datetime.strptime( - f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", - "%Y-%m-%d" - ) -) -LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) - - -def flatten_frame(nested_sdf): - """Unnest Spark DataFrame in case there nested structered columns. - - :param nested_sdf: Spark DataFrame. - :type nested_sdf: DataFrame - :returns: Unnest DataFrame. - :rtype: DataFrame - """ - stack = [((), nested_sdf)] - columns = [] - while len(stack) > 0: - parents, sdf = stack.pop() - for column_name, column_type in sdf.dtypes: - if column_type[:6] == "struct": - projected_sdf = sdf.select(column_name + ".*") - stack.append((parents + (column_name,), projected_sdf)) - else: - columns.append( - col(".".join(parents + (column_name,))) \ - .alias("_".join(parents + (column_name,))) - ) - return nested_sdf.select(columns) - - -def process_json_to_dataframe(schema_name, paths): - """Processes JSON to Spark DataFrame. - - :param schema_name: Schema name. - :type schema_name: string - :param paths: S3 paths to process. - :type paths: list - :returns: Spark DataFrame. 
- :rtype: DataFrame - """ - drop_subset = [ - "dut_type", "dut_version", - "passed", - "test_name_long", "test_name_short", - "test_type", - "version" - ] - - # load schemas - with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: - schema = StructType.fromJson(load(f_schema)) - - # create empty DF out of schemas - sdf = spark.createDataFrame([], schema) - - # filter list - filtered = [path for path in paths if schema_name in path] - - # select - for path in filtered: - print(path) - - sdf_loaded = spark \ - .read \ - .option("multiline", "true") \ - .schema(schema) \ - .json(path) \ - .withColumn("job", lit(path.split("/")[4])) \ - .withColumn("build", lit(path.split("/")[5])) - sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) - - # drop rows with all nulls and drop rows with null in critical frames - sdf = sdf.na.drop(how="all") - sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) - - # flatten frame - sdf = flatten_frame(sdf) - - return sdf - - -# create SparkContext and GlueContext -spark_context = SparkContext.getOrCreate() -spark_context.setLogLevel("WARN") -glue_context = GlueContext(spark_context) -spark = glue_context.spark_session - -# files of interest -paths = wr.s3.list_objects( - path=PATH, - suffix=SUFFIX, - last_modified_begin=LAST_MODIFIED_BEGIN, - last_modified_end=LAST_MODIFIED_END, - ignore_suffix=IGNORE_SUFFIX, - ignore_empty=True -) - -filtered_paths = [path for path in paths if "report-iterative-2210" in path] - -out_sdf = process_json_to_dataframe("soak", filtered_paths) -out_sdf.printSchema() -out_sdf = out_sdf \ - .withColumn("year", lit(datetime.now().year)) \ - .withColumn("month", lit(datetime.now().month)) \ - .withColumn("day", lit(datetime.now().day)) \ - .repartition(1) - -try: - wr.s3.to_parquet( - df=out_sdf.toPandas(), - path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2210", - dataset=True, - partition_cols=["test_type", "year", "month", "day"], - compression="snappy", - use_threads=True, - mode="overwrite_partitions", - boto3_session=session.Session( - aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], - aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], - region_name=environ["OUT_AWS_DEFAULT_REGION"] - ) - ) -except EmptyDataFrame: - pass diff --git a/csit.infra.etl/iterative_soak_rls2302.py b/csit.infra.etl/iterative_soak_rls2302.py new file mode 100644 index 0000000000..ac24b6ba3a --- /dev/null +++ b/csit.infra.etl/iterative_soak_rls2302.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "report-iterative-2302" in path] + +out_sdf = process_json_to_dataframe("soak", filtered_paths) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2302", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/local.py b/csit.infra.etl/local.py index 79e18d1c64..e942cebbba 100644 --- a/csit.infra.etl/local.py +++ b/csit.infra.etl/local.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: diff --git a/csit.infra.etl/stats.py b/csit.infra.etl/stats.py index ab8bcafdeb..5d44caa25d 100644 --- a/csit.infra.etl/stats.py +++ b/csit.infra.etl/stats.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at: diff --git a/csit.infra.etl/trending_hoststack.json b/csit.infra.etl/trending_hoststack.json new file mode 100644 index 0000000000..a3365cdba0 --- /dev/null +++ b/csit.infra.etl/trending_hoststack.json @@ -0,0 +1,285 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "job", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "build", + "nullable": false, + "type": "integer" + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dut_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dut_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "hosts", + "nullable": true, + "type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "tg_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "tg_version", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "result", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "bandwidth", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "rate", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "latency", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "failed_requests", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "completed_requests", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "retransmits", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "duration", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "unit", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "value", + "nullable": true, + "type": "double" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "start_time", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passed", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "telemetry", + "nullable": true, + 
"type": { + "containsNull": true, + "elementType": "string", + "type": "array" + } + }, + { + "metadata": {}, + "name": "test_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_long", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_name_short", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "test_type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "message", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "version", + "nullable": true, + "type": "string" + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/csit.infra.etl/trending_hoststack.py b/csit.infra.etl/trending_hoststack.py new file mode 100644 index 0000000000..85cab5a179 --- /dev/null +++ b/csit.infra.etl/trending_hoststack.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2023 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ETL script running on top of the s3://""" + +from datetime import datetime, timedelta +from json import load +from os import environ +from pytz import utc + +import awswrangler as wr +from awswrangler.exceptions import EmptyDataFrame +from awsglue.context import GlueContext +from boto3 import session +from pyspark.context import SparkContext +from pyspark.sql.functions import col, lit, regexp_replace +from pyspark.sql.types import StructType + + +S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index" +S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index" +PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*" +SUFFIX="info.json.gz" +IGNORE_SUFFIX=[ + "suite.info.json.gz", + "setup.info.json.gz", + "teardown.info.json.gz", + "suite.output.info.json.gz", + "setup.output.info.json.gz", + "teardown.output.info.json.gz" +] +LAST_MODIFIED_END=utc.localize( + datetime.strptime( + f"{datetime.now().year}-{datetime.now().month}-{datetime.now().day}", + "%Y-%m-%d" + ) +) +LAST_MODIFIED_BEGIN=LAST_MODIFIED_END - timedelta(1) + + +def flatten_frame(nested_sdf): + """Unnest Spark DataFrame in case there nested structered columns. + + :param nested_sdf: Spark DataFrame. + :type nested_sdf: DataFrame + :returns: Unnest DataFrame. + :rtype: DataFrame + """ + stack = [((), nested_sdf)] + columns = [] + while len(stack) > 0: + parents, sdf = stack.pop() + for column_name, column_type in sdf.dtypes: + if column_type[:6] == "struct": + projected_sdf = sdf.select(column_name + ".*") + stack.append((parents + (column_name,), projected_sdf)) + else: + columns.append( + col(".".join(parents + (column_name,))) \ + .alias("_".join(parents + (column_name,))) + ) + return nested_sdf.select(columns) + + +def process_json_to_dataframe(schema_name, paths): + """Processes JSON to Spark DataFrame. + + :param schema_name: Schema name. + :type schema_name: string + :param paths: S3 paths to process. + :type paths: list + :returns: Spark DataFrame. 
+ :rtype: DataFrame + """ + drop_subset = [ + "dut_type", "dut_version", + "passed", + "test_name_long", "test_name_short", + "test_type", + "version" + ] + + # load schemas + with open(f"trending_{schema_name}.json", "r", encoding="UTF-8") as f_schema: + schema = StructType.fromJson(load(f_schema)) + + # create empty DF out of schemas + sdf = spark.createDataFrame([], schema) + + # filter list + filtered = [path for path in paths if schema_name in path] + + # select + for path in filtered: + print(path) + + sdf_loaded = spark \ + .read \ + .option("multiline", "true") \ + .schema(schema) \ + .json(path) \ + .withColumn("job", lit(path.split("/")[4])) \ + .withColumn("build", lit(path.split("/")[5])) + sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True) + + # drop rows with all nulls and drop rows with null in critical frames + sdf = sdf.na.drop(how="all") + sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset) + + # flatten frame + sdf = flatten_frame(sdf) + + return sdf + + +# create SparkContext and GlueContext +spark_context = SparkContext.getOrCreate() +spark_context.setLogLevel("WARN") +glue_context = GlueContext(spark_context) +spark = glue_context.spark_session + +# files of interest +paths = wr.s3.list_objects( + path=PATH, + suffix=SUFFIX, + last_modified_begin=LAST_MODIFIED_BEGIN, + last_modified_end=LAST_MODIFIED_END, + ignore_suffix=IGNORE_SUFFIX, + ignore_empty=True +) + +filtered_paths = [path for path in paths if "daily" in path or "weekly" in path] + +out_sdf = process_json_to_dataframe("hoststack", filtered_paths) +out_sdf.show(truncate=False) +out_sdf.printSchema() +out_sdf = out_sdf \ + .withColumn("year", lit(datetime.now().year)) \ + .withColumn("month", lit(datetime.now().month)) \ + .withColumn("day", lit(datetime.now().day)) \ + .repartition(1) + +try: + wr.s3.to_parquet( + df=out_sdf.toPandas(), + path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/trending", + dataset=True, + partition_cols=["test_type", "year", "month", "day"], + compression="snappy", + use_threads=True, + mode="overwrite_partitions", + boto3_session=session.Session( + aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"], + region_name=environ["OUT_AWS_DEFAULT_REGION"] + ) + ) +except EmptyDataFrame: + pass diff --git a/csit.infra.etl/trending_mrr.py b/csit.infra.etl/trending_mrr.py index 1ba8c69b1b..a00c5fb4e1 100644 --- a/csit.infra.etl/trending_mrr.py +++ b/csit.infra.etl/trending_mrr.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: diff --git a/csit.infra.etl/trending_ndrpdr.py b/csit.infra.etl/trending_ndrpdr.py index d3c51ba757..e35d27b0bf 100644 --- a/csit.infra.etl/trending_ndrpdr.py +++ b/csit.infra.etl/trending_ndrpdr.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at: diff --git a/csit.infra.etl/trending_reconf.py b/csit.infra.etl/trending_reconf.py index e30808c457..94e6199e89 100644 --- a/csit.infra.etl/trending_reconf.py +++ b/csit.infra.etl/trending_reconf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: diff --git a/csit.infra.etl/trending_soak.py b/csit.infra.etl/trending_soak.py index e54cf9f18a..40da521884 100644 --- a/csit.infra.etl/trending_soak.py +++ b/csit.infra.etl/trending_soak.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2022 Cisco and/or its affiliates. +# Copyright (c) 2023 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: -- cgit 1.2.3-korg
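
Note on the new rls2302/hoststack ETL scripts and schema files added above: the following is a minimal, illustrative sketch (not part of this change) of how a schema JSON such as trending_hoststack.json and the flatten_frame() logic fit together. It assumes only a local pyspark installation and that trending_hoststack.json from this patch is present in the working directory; no AWS Glue, awswrangler or S3 access is required.

#!/usr/bin/env python3
# Illustrative sketch only: load one of the new schema files into a Spark
# StructType and flatten its nested struct columns the same way the
# flatten_frame() helper in the rls2302 ETL scripts does.

from json import load

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType

spark = SparkSession.builder.master("local[1]").appName("schema-check").getOrCreate()

with open("trending_hoststack.json", "r", encoding="UTF-8") as f_schema:
    schema = StructType.fromJson(load(f_schema))

# An empty DataFrame is enough to exercise the schema and the flattening logic.
sdf = spark.createDataFrame([], schema)

# Stack-based unnesting, mirroring flatten_frame(): dotted paths select the
# nested struct fields, underscore-joined names become the flat column names.
stack = [((), sdf)]
columns = []
while stack:
    parents, frame = stack.pop()
    for column_name, column_type in frame.dtypes:
        if column_type.startswith("struct"):
            stack.append((parents + (column_name,), frame.select(column_name + ".*")))
        else:
            columns.append(
                col(".".join(parents + (column_name,)))
                .alias("_".join(parents + (column_name,)))
            )

# Prints flat columns such as result_bandwidth_unit and result_bandwidth_value.
sdf.select(columns).printSchema()

The dotted paths (e.g. result.bandwidth.value) address the nested struct fields defined in the schema, while the underscore-joined aliases (result_bandwidth_value) become the flat column names that end up in the partitioned Parquet output written by the ETL jobs.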