author    pmikus <peter.mikus@protonmail.ch>    2024-05-22 14:30:49 +0200
committer Peter Mikus <peter.mikus@protonmail.ch>    2024-05-22 12:56:34 +0000
commit    2ebf9e56d0dd200fa09979505a2da070b39da63f (patch)
tree      6e6ca12c8458faa1c6559bf7afd1bbfc8b9abdd6 /csit.infra.etl
parent    e07a1a535c97e8b7e26e78e9ec3d8c6593407f70 (diff)
feat(etl): Release pipelines
Signed-off-by: pmikus <peter.mikus@protonmail.ch>
Change-Id: I4ce20267b4747bf1901b6175e0ec5936b583a510
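
The change applied to every pipeline below is the same three-part update: the release filters and output paths move from 2402 to 2406, the S3 bucket names become overridable through the environment, and the writer's boto3 session is created up front, falling back to default credential resolution when the OUT_AWS_* variables are absent (several scripts also drop a debug out_sdf.show() call). A minimal sketch of that shared pattern, with imports assumed to match the ETL modules (from os import environ, from boto3 import session):

    from os import environ
    from boto3 import session

    # Bucket names are now overridable; the fdio-* values remain the defaults.
    S3_LOGS_BUCKET = environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
    S3_DOCS_BUCKET = environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")

    # Prefer explicit credentials; fall back to boto3's default credential
    # chain (config files, instance profile, ...) when any variable is unset.
    try:
        boto3_session = session.Session(
            aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
            region_name=environ["OUT_AWS_DEFAULT_REGION"]
        )
    except KeyError:
        boto3_session = session.Session()

Each script then passes boto3_session to wr.s3.to_parquet() instead of constructing the session inline at the call site.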
Diffstat (limited to 'csit.infra.etl')
-rw-r--r--  csit.infra.etl/coverage_device_rls2406.py (renamed from csit.infra.etl/coverage_device_rls2402.py)  26
-rw-r--r--  csit.infra.etl/coverage_hoststack_rls2406.py (renamed from csit.infra.etl/coverage_hoststack_rls2402.py)  27
-rw-r--r--  csit.infra.etl/coverage_mrr_rls2406.py (renamed from csit.infra.etl/coverage_mrr_rls2402.py)  26
-rw-r--r--  csit.infra.etl/coverage_ndrpdr_rls2406.py (renamed from csit.infra.etl/coverage_ndrpdr_rls2402.py)  26
-rw-r--r--  csit.infra.etl/coverage_reconf_rls2406.py (renamed from csit.infra.etl/coverage_reconf_rls2402.py)  27
-rw-r--r--  csit.infra.etl/coverage_soak_rls2406.py (renamed from csit.infra.etl/coverage_soak_rls2402.py)  26
-rw-r--r--  csit.infra.etl/iterative_hoststack_rls2406.py (renamed from csit.infra.etl/iterative_hoststack_rls2402.py)  27
-rw-r--r--  csit.infra.etl/iterative_mrr_rls2406.py (renamed from csit.infra.etl/iterative_mrr_rls2402.py)  26
-rw-r--r--  csit.infra.etl/iterative_ndrpdr_rls2406.py (renamed from csit.infra.etl/iterative_ndrpdr_rls2402.py)  26
-rw-r--r--  csit.infra.etl/iterative_reconf_rls2406.py (renamed from csit.infra.etl/iterative_reconf_rls2402.py)  26
-rw-r--r--  csit.infra.etl/iterative_soak_rls2406.py (renamed from csit.infra.etl/iterative_soak_rls2402.py)  26
-rw-r--r--  csit.infra.etl/stats.py  22
-rw-r--r--  csit.infra.etl/trending_hoststack.py  23
-rw-r--r--  csit.infra.etl/trending_mrr.py  22
-rw-r--r--  csit.infra.etl/trending_ndrpdr.py  22
-rw-r--r--  csit.infra.etl/trending_reconf.py  22
-rw-r--r--  csit.infra.etl/trending_soak.py  23
17 files changed, 260 insertions(+), 163 deletions(-)
diff --git a/csit.infra.etl/coverage_device_rls2402.py b/csit.infra.etl/coverage_device_rls2406.py
index 2db808164f..04f4135851 100644
--- a/csit.infra.etl/coverage_device_rls2402.py
+++ b/csit.infra.etl/coverage_device_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-vpp-device-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-coverage-2402" in path]
+filtered_paths = [path for path in paths if "report-coverage-2406" in path]
out_sdf = process_json_to_dataframe("device", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/coverage_hoststack_rls2402.py b/csit.infra.etl/coverage_hoststack_rls2406.py
index 27eb9e8cc6..75edd983b8 100644
--- a/csit.infra.etl/coverage_hoststack_rls2402.py
+++ b/csit.infra.etl/coverage_hoststack_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,10 +141,9 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-coverage-2402" in path]
+filtered_paths = [path for path in paths if "report-coverage-2406" in path]
out_sdf = process_json_to_dataframe("hoststack", filtered_paths)
-out_sdf.show(truncate=False)
out_sdf.printSchema()
out_sdf = out_sdf \
.withColumn("year", lit(datetime.now().year)) \
@@ -153,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/coverage_mrr_rls2402.py b/csit.infra.etl/coverage_mrr_rls2406.py
index e68e4f0366..b84c077308 100644
--- a/csit.infra.etl/coverage_mrr_rls2402.py
+++ b/csit.infra.etl/coverage_mrr_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-coverage-2402" in path]
+filtered_paths = [path for path in paths if "report-coverage-2406" in path]
out_sdf = process_json_to_dataframe("mrr", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/coverage_ndrpdr_rls2402.py b/csit.infra.etl/coverage_ndrpdr_rls2406.py
index 730e3ea748..ee0f878833 100644
--- a/csit.infra.etl/coverage_ndrpdr_rls2402.py
+++ b/csit.infra.etl/coverage_ndrpdr_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-coverage-2402" in path]
+filtered_paths = [path for path in paths if "report-coverage-2406" in path]
out_sdf = process_json_to_dataframe("ndrpdr", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/coverage_reconf_rls2402.py b/csit.infra.etl/coverage_reconf_rls2406.py
index dc1f647ff1..33dbac72d2 100644
--- a/csit.infra.etl/coverage_reconf_rls2402.py
+++ b/csit.infra.etl/coverage_reconf_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,10 +141,9 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-coverage-2402" in path]
+filtered_paths = [path for path in paths if "report-coverage-2406" in path]
out_sdf = process_json_to_dataframe("reconf", filtered_paths)
-out_sdf.show(truncate=False)
out_sdf.printSchema()
out_sdf = out_sdf \
.withColumn("year", lit(datetime.now().year)) \
@@ -153,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/coverage_soak_rls2402.py b/csit.infra.etl/coverage_soak_rls2406.py
index 7d87afd952..3b13c16229 100644
--- a/csit.infra.etl/coverage_soak_rls2402.py
+++ b/csit.infra.etl/coverage_soak_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-coverage-2402" in path]
+filtered_paths = [path for path in paths if "report-coverage-2406" in path]
out_sdf = process_json_to_dataframe("soak", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/coverage_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/iterative_hoststack_rls2402.py b/csit.infra.etl/iterative_hoststack_rls2406.py
index 1c74126c47..ebeade5571 100644
--- a/csit.infra.etl/iterative_hoststack_rls2402.py
+++ b/csit.infra.etl/iterative_hoststack_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,10 +141,9 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-iterative-2402" in path]
+filtered_paths = [path for path in paths if "report-iterative-2406" in path]
out_sdf = process_json_to_dataframe("hoststack", filtered_paths)
-out_sdf.show(truncate=False)
out_sdf.printSchema()
out_sdf = out_sdf \
.withColumn("year", lit(datetime.now().year)) \
@@ -153,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/iterative_mrr_rls2402.py b/csit.infra.etl/iterative_mrr_rls2406.py
index e779dbdc36..9abb3434dc 100644
--- a/csit.infra.etl/iterative_mrr_rls2402.py
+++ b/csit.infra.etl/iterative_mrr_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-iterative-2402" in path]
+filtered_paths = [path for path in paths if "report-iterative-2406" in path]
out_sdf = process_json_to_dataframe("mrr", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/iterative_ndrpdr_rls2402.py b/csit.infra.etl/iterative_ndrpdr_rls2406.py
index 9231176e10..21a6c46cd1 100644
--- a/csit.infra.etl/iterative_ndrpdr_rls2402.py
+++ b/csit.infra.etl/iterative_ndrpdr_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-iterative-2402" in path]
+filtered_paths = [path for path in paths if "report-iterative-2406" in path]
out_sdf = process_json_to_dataframe("ndrpdr", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/iterative_reconf_rls2402.py b/csit.infra.etl/iterative_reconf_rls2406.py
index 1beeb16d2c..e9b06812e3 100644
--- a/csit.infra.etl/iterative_reconf_rls2402.py
+++ b/csit.infra.etl/iterative_reconf_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-iterative-2402" in path]
+filtered_paths = [path for path in paths if "report-iterative-2406" in path]
out_sdf = process_json_to_dataframe("reconf", filtered_paths)
out_sdf.show(truncate=False)
@@ -153,19 +153,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/iterative_soak_rls2402.py b/csit.infra.etl/iterative_soak_rls2406.py
index 55c6eb494d..6b05e30308 100644
--- a/csit.infra.etl/iterative_soak_rls2402.py
+++ b/csit.infra.etl/iterative_soak_rls2406.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -141,7 +141,7 @@ paths = wr.s3.list_objects(
ignore_empty=True
)
-filtered_paths = [path for path in paths if "report-iterative-2402" in path]
+filtered_paths = [path for path in paths if "report-iterative-2406" in path]
out_sdf = process_json_to_dataframe("soak", filtered_paths)
out_sdf.printSchema()
@@ -152,19 +152,25 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
- path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2402",
+ path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/iterative_rls2406",
dataset=True,
partition_cols=["test_type", "year", "month", "day"],
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/stats.py b/csit.infra.etl/stats.py
index 5d44caa25d..08ce4a9d0d 100644
--- a/csit.infra.etl/stats.py
+++ b/csit.infra.etl/stats.py
@@ -28,8 +28,9 @@ from pyspark.context import SparkContext
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="suite.info.json.gz"
IGNORE_SUFFIX=[]
@@ -106,7 +107,6 @@ paths = wr.s3.list_objects(
for schema_name in ["sra"]:
out_sdf = process_json_to_dataframe(schema_name, paths)
- out_sdf.show(truncate=False)
out_sdf.printSchema()
out_sdf = out_sdf \
.withColumn("year", lit(datetime.now().year)) \
@@ -115,6 +115,16 @@ for schema_name in ["sra"]:
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+ except KeyError:
+ boto3_session = session.Session()
+
+ try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/stats",
@@ -123,11 +133,7 @@ for schema_name in ["sra"]:
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
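
In stats.py the write happens inside the for schema_name loop, so the same credential fallback sits one indentation level deeper than in the per-release scripts; a minimal sketch of the resulting shape, assuming the same imports:

    from os import environ
    from boto3 import session

    for schema_name in ["sra"]:
        # Same fallback as elsewhere, nested in the per-schema loop body.
        try:
            boto3_session = session.Session(
                aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
                aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
                region_name=environ["OUT_AWS_DEFAULT_REGION"]
            )
        except KeyError:
            boto3_session = session.Session()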
diff --git a/csit.infra.etl/trending_hoststack.py b/csit.infra.etl/trending_hoststack.py
index 85cab5a179..45cb5c9bf5 100644
--- a/csit.infra.etl/trending_hoststack.py
+++ b/csit.infra.etl/trending_hoststack.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -144,7 +144,6 @@ paths = wr.s3.list_objects(
filtered_paths = [path for path in paths if "daily" in path or "weekly" in path]
out_sdf = process_json_to_dataframe("hoststack", filtered_paths)
-out_sdf.show(truncate=False)
out_sdf.printSchema()
out_sdf = out_sdf \
.withColumn("year", lit(datetime.now().year)) \
@@ -153,6 +152,16 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/trending",
@@ -161,11 +170,7 @@ try:
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/trending_mrr.py b/csit.infra.etl/trending_mrr.py
index a00c5fb4e1..b42aacaf36 100644
--- a/csit.infra.etl/trending_mrr.py
+++ b/csit.infra.etl/trending_mrr.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -153,6 +153,16 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/trending",
@@ -161,11 +171,7 @@ try:
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/trending_ndrpdr.py b/csit.infra.etl/trending_ndrpdr.py
index e35d27b0bf..96582f5928 100644
--- a/csit.infra.etl/trending_ndrpdr.py
+++ b/csit.infra.etl/trending_ndrpdr.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -153,6 +153,16 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/trending",
@@ -161,11 +171,7 @@ try:
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/trending_reconf.py b/csit.infra.etl/trending_reconf.py
index 94e6199e89..08287a74cc 100644
--- a/csit.infra.etl/trending_reconf.py
+++ b/csit.infra.etl/trending_reconf.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -153,6 +153,16 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/trending",
@@ -161,11 +171,7 @@ try:
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
diff --git a/csit.infra.etl/trending_soak.py b/csit.infra.etl/trending_soak.py
index 40da521884..e6faf5be34 100644
--- a/csit.infra.etl/trending_soak.py
+++ b/csit.infra.etl/trending_soak.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
-# Copyright (c) 2023 Cisco and/or its affiliates.
+# Copyright (c) 2024 Cisco and/or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
@@ -29,8 +29,8 @@ from pyspark.sql.functions import col, lit, regexp_replace
from pyspark.sql.types import StructType
-S3_LOGS_BUCKET="fdio-logs-s3-cloudfront-index"
-S3_DOCS_BUCKET="fdio-docs-s3-cloudfront-index"
+S3_LOGS_BUCKET=environ.get("S3_LOGS_BUCKET", "fdio-logs-s3-cloudfront-index")
+S3_DOCS_BUCKET=environ.get("S3_DOCS_BUCKET", "fdio-docs-s3-cloudfront-index")
PATH=f"s3://{S3_LOGS_BUCKET}/vex-yul-rot-jenkins-1/csit-*-perf-*"
SUFFIX="info.json.gz"
IGNORE_SUFFIX=[
@@ -144,7 +144,6 @@ paths = wr.s3.list_objects(
filtered_paths = [path for path in paths if "daily" in path or "weekly" in path]
out_sdf = process_json_to_dataframe("soak", filtered_paths)
-out_sdf.show(truncate=False)
out_sdf.printSchema()
out_sdf = out_sdf \
.withColumn("year", lit(datetime.now().year)) \
@@ -153,6 +152,16 @@ out_sdf = out_sdf \
.repartition(1)
try:
+ boto3_session = session.Session(
+ aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
+ aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
+ region_name=environ["OUT_AWS_DEFAULT_REGION"]
+ )
+except KeyError:
+ boto3_session = session.Session()
+
+try:
wr.s3.to_parquet(
df=out_sdf.toPandas(),
path=f"s3://{S3_DOCS_BUCKET}/csit/parquet/trending",
@@ -161,11 +170,7 @@ try:
compression="snappy",
use_threads=True,
mode="overwrite_partitions",
- boto3_session=session.Session(
- aws_access_key_id=environ["OUT_AWS_ACCESS_KEY_ID"],
- aws_secret_access_key=environ["OUT_AWS_SECRET_ACCESS_KEY"],
- region_name=environ["OUT_AWS_DEFAULT_REGION"]
- )
+ boto3_session=boto3_session
)
except EmptyDataFrame:
pass
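
For a local or staging run, the new overrides might be exercised as follows (bucket names here are hypothetical, not values from the diff):

    from os import environ

    # Hypothetical overrides for a test run; leaving these unset keeps the
    # fdio-* defaults, and leaving OUT_AWS_* unset selects the default session.
    environ["S3_LOGS_BUCKET"] = "example-logs-bucket"
    environ["S3_DOCS_BUCKET"] = "example-docs-bucket"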