aboutsummaryrefslogtreecommitdiffstats
path: root/fdio.infra.terraform/1n_nmd/etl
diff options
context:
space:
mode:
authorPeter Mikus <pmikus@cisco.com>2022-02-22 11:00:47 +0100
committerPeter Mikus <pmikus@cisco.com>2022-02-24 08:01:53 +0000
commit54fd337e30acc97434b33a6d0d3c19e4aa3051ab (patch)
tree5d131a1d7c0846a209e85380b66dfc7770cbe32e /fdio.infra.terraform/1n_nmd/etl
parentb2cb835b34c7404b2aaee3ec30700c67537da66d (diff)
feat(uti): etl
Signed-off-by: Peter Mikus <pmikus@cisco.com> Change-Id: I7cdcdcbf1e4986664d5d48357688185319f67b0c
Diffstat (limited to 'fdio.infra.terraform/1n_nmd/etl')
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl318
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/fdio/main.tf23
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/fdio/providers.tf13
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/fdio/variables.tf47
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf17
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/main.tf33
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/variables.tf115
-rw-r--r--fdio.infra.terraform/1n_nmd/etl/versions.tf9
8 files changed, 575 insertions, 0 deletions
diff --git a/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl b/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
new file mode 100644
index 0000000000..c1d186f681
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
@@ -0,0 +1,318 @@
+job "${job_name}" {
+ # The "datacenters" parameter specifies the list of datacenters which should
+ # be considered when placing this task. This must be provided.
+ datacenters = "${datacenters}"
+
+ # The "type" parameter controls the type of job, which impacts the scheduler's
+ # decision on placement. For a full list of job types and their differences,
+ # please see the online documentation.
+ #
+ # https://www.nomadproject.io/docs/jobspec/schedulers
+ #
+ type = "${type}"
+
+ # The periodic stanza allows a job to run at fixed times, dates, or intervals.
+ # The easiest way to think about the periodic scheduler is "Nomad cron" or
+ # "distributed cron".
+ #
+ # https://www.nomadproject.io/docs/job-specification/periodic
+ #
+ periodic {
+ cron = "${cron}"
+ prohibit_overlap = "${prohibit_overlap}"
+ time_zone = "${time_zone}"
+ }
+
+ # The "group" stanza defines a series of tasks that should be co-located on
+ # the same Nomad client. Any task within a group will be placed on the same
+ # client.
+ #
+ # https://www.nomadproject.io/docs/job-specification/group
+ #
+ group "${job_name}-master" {
+ # The restart stanza configures a tasks's behavior on task failure. Restarts
+ # happen on the client that is running the task.
+ #
+ # https://www.nomadproject.io/docs/job-specification/restart
+ #
+ restart {
+ mode = "fail"
+ }
+
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # For more information and examples on the "volume" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
+
+ constraint {
+ attribute = "$${node.class}"
+ value = "builder"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # https://www.nomadproject.io/docs/job-specification/task.html
+ #
+ task "${job_name}-trending" {
+ # The artifact stanza instructs Nomad to fetch and unpack a remote
+ # resource, such as a file, tarball, or binary.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
+ destination = "local/etl"
+ }
+
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "${image}"
+ command = "gluesparksubmit"
+ args = [
+ "--driver-memory", "20g",
+ "--executor-memory", "20g",
+ "trending.py"
+ ]
+ work_dir = "/local/etl"
+ }
+
+ # The env stanza configures a list of environment variables to populate
+ # the task's environment before starting.
+ env {
+ AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
+ AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
+ AWS_DEFAULT_REGION = "${aws_default_region}"
+ OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
+ OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
+ OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
+ ${ envs }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${memory}
+ }
+ }
+ task "${job_name}-stats" {
+ # The artifact stanza instructs Nomad to fetch and unpack a remote
+ # resource, such as a file, tarball, or binary.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
+ destination = "local/etl"
+ }
+
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "${image}"
+ command = "gluesparksubmit"
+ args = [
+ "--driver-memory", "10g",
+ "--executor-memory", "10g",
+ "stats.py"
+ ]
+ work_dir = "/local/etl"
+ }
+
+ # The env stanza configures a list of environment variables to populate
+ # the task's environment before starting.
+ env {
+ AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
+ AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
+ AWS_DEFAULT_REGION = "${aws_default_region}"
+ OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
+ OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
+ OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
+ ${ envs }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${memory}
+ }
+ }
+ }
+ group "${job_name}-rls2202" {
+ # The restart stanza configures a tasks's behavior on task failure. Restarts
+ # happen on the client that is running the task.
+ #
+ # https://www.nomadproject.io/docs/job-specification/restart
+ #
+ restart {
+ mode = "fail"
+ }
+
+ # The constraint allows restricting the set of eligible nodes. Constraints
+ # may filter on attributes or client metadata.
+ #
+ # For more information and examples on the "volume" stanza, please see
+ # the online documentation at:
+ #
+ # https://www.nomadproject.io/docs/job-specification/constraint
+ #
+ constraint {
+ attribute = "$${attr.cpu.arch}"
+ operator = "!="
+ value = "arm64"
+ }
+
+ constraint {
+ attribute = "$${node.class}"
+ value = "builder"
+ }
+
+ # The "task" stanza creates an individual unit of work, such as a Docker
+ # container, web application, or batch processing.
+ #
+ # https://www.nomadproject.io/docs/job-specification/task.html
+ #
+ task "${job_name}-coverage" {
+ # The artifact stanza instructs Nomad to fetch and unpack a remote
+ # resource, such as a file, tarball, or binary.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
+ destination = "local/etl"
+ }
+
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "${image}"
+ command = "gluesparksubmit"
+ args = [
+ "--driver-memory", "20g",
+ "--executor-memory", "20g",
+ "coverage_rls2202.py"
+ ]
+ work_dir = "/local/etl"
+ }
+
+ # The env stanza configures a list of environment variables to populate
+ # the task's environment before starting.
+ env {
+ AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
+ AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
+ AWS_DEFAULT_REGION = "${aws_default_region}"
+ OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
+ OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
+ OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
+ ${ envs }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${memory}
+ }
+ }
+ task "${job_name}-iterative" {
+ # The artifact stanza instructs Nomad to fetch and unpack a remote
+ # resource, such as a file, tarball, or binary.
+ #
+ # https://www.nomadproject.io/docs/job-specification/artifact
+ #
+ artifact {
+ source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
+ destination = "local/etl"
+ }
+
+ # The "driver" parameter specifies the task driver that should be used to
+ # run the task.
+ driver = "docker"
+
+ # The "config" stanza specifies the driver configuration, which is passed
+ # directly to the driver to start the task. The details of configurations
+ # are specific to each driver, so please see specific driver
+ # documentation for more information.
+ config {
+ image = "${image}"
+ command = "gluesparksubmit"
+ args = [
+ "--driver-memory", "20g",
+ "--executor-memory", "20g",
+ "iterative_rls2202.py"
+ ]
+ work_dir = "/local/etl"
+ }
+
+ # The env stanza configures a list of environment variables to populate
+ # the task's environment before starting.
+ env {
+ AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
+ AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
+ AWS_DEFAULT_REGION = "${aws_default_region}"
+ OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
+ OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
+ OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
+ ${ envs }
+ }
+
+ # The "resources" stanza describes the requirements a task needs to
+ # execute. Resource requirements include memory, network, cpu, and more.
+ # This ensures the task will execute on a machine that contains enough
+ # resource capacity.
+ #
+ # https://www.nomadproject.io/docs/job-specification/resources
+ #
+ resources {
+ cpu = ${cpu}
+ memory = ${memory}
+ }
+ }
+ }
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/fdio/main.tf b/fdio.infra.terraform/1n_nmd/etl/fdio/main.tf
new file mode 100644
index 0000000000..3d2026f0f9
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/fdio/main.tf
@@ -0,0 +1,23 @@
+data "vault_generic_secret" "fdio_logs" {
+ path = "kv/secret/data/etl/fdio_logs"
+}
+
+data "vault_generic_secret" "fdio_docs" {
+ path = "kv/secret/data/etl/fdio_docs"
+}
+
+module "etl" {
+ providers = {
+ nomad = nomad.yul1
+ }
+ source = "../"
+
+ aws_access_key_id = data.vault_generic_secret.fdio_logs.data["access_key"]
+ aws_secret_access_key = data.vault_generic_secret.fdio_logs.data["secret_key"]
+ aws_default_region = data.vault_generic_secret.fdio_logs.data["region"]
+ out_aws_access_key_id = data.vault_generic_secret.fdio_docs.data["access_key"]
+ out_aws_secret_access_key = data.vault_generic_secret.fdio_docs.data["secret_key"]
+ out_aws_default_region = data.vault_generic_secret.fdio_docs.data["region"]
+ cron = "@daily"
+ datacenters = ["yul1"]
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/fdio/providers.tf b/fdio.infra.terraform/1n_nmd/etl/fdio/providers.tf
new file mode 100644
index 0000000000..c6617da02b
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/fdio/providers.tf
@@ -0,0 +1,13 @@
+provider "nomad" {
+ address = var.nomad_provider_address
+ alias = "yul1"
+ # ca_file = var.nomad_provider_ca_file
+ # cert_file = var.nomad_provider_cert_file
+ # key_file = var.nomad_provider_key_file
+}
+
+provider "vault" {
+ address = var.vault_provider_address
+ skip_tls_verify = var.vault_provider_skip_tls_verify
+ token = var.vault_provider_token
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/fdio/variables.tf b/fdio.infra.terraform/1n_nmd/etl/fdio/variables.tf
new file mode 100644
index 0000000000..0e0b3af622
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/fdio/variables.tf
@@ -0,0 +1,47 @@
+variable "nomad_acl" {
+ description = "Nomad ACLs enabled/disabled."
+ type = bool
+ default = false
+}
+
+variable "nomad_provider_address" {
+ description = "FD.io Nomad cluster address."
+ type = string
+ default = "http://10.32.8.14:4646"
+}
+
+variable "nomad_provider_ca_file" {
+ description = "A local file path to a PEM-encoded certificate authority."
+ type = string
+ default = "/etc/nomad.d/ssl/nomad-ca.pem"
+}
+
+variable "nomad_provider_cert_file" {
+ description = "A local file path to a PEM-encoded certificate."
+ type = string
+ default = "/etc/nomad.d/ssl/nomad-cli.pem"
+}
+
+variable "nomad_provider_key_file" {
+ description = "A local file path to a PEM-encoded private key."
+ type = string
+ default = "/etc/nomad.d/ssl/nomad-cli-key.pem"
+}
+
+variable "vault_provider_address" {
+ description = "Vault cluster address."
+ type = string
+ default = "http://10.30.51.28:8200"
+}
+
+variable "vault_provider_skip_tls_verify" {
+ description = "Verification of the Vault server's TLS certificate."
+ type = bool
+ default = false
+}
+
+variable "vault_provider_token" {
+ description = "Vault root token."
+ type = string
+ sensitive = true
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf b/fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf
new file mode 100644
index 0000000000..526e1d0df0
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/fdio/versions.tf
@@ -0,0 +1,17 @@
+terraform {
+ backend "consul" {
+ address = "10.32.8.14:8500"
+ scheme = "http"
+ path = "terraform/etl"
+ }
+ required_providers {
+ nomad = {
+ source = "hashicorp/nomad"
+ version = ">= 1.4.16"
+ }
+ vault = {
+ version = ">= 3.2.1"
+ }
+ }
+ required_version = ">= 1.1.4"
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/main.tf b/fdio.infra.terraform/1n_nmd/etl/main.tf
new file mode 100644
index 0000000000..c477da81a8
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/main.tf
@@ -0,0 +1,33 @@
+locals {
+ datacenters = join(",", var.datacenters)
+ envs = join("\n", concat([], var.envs))
+}
+
+resource "nomad_job" "nomad_job_etl" {
+ jobspec = templatefile(
+ "${path.module}/conf/nomad/etl.hcl.tftpl",
+ {
+ aws_access_key_id = var.aws_access_key_id,
+ aws_secret_access_key = var.aws_secret_access_key,
+ aws_default_region = var.aws_default_region
+ cpu = var.cpu,
+ cron = var.cron,
+ datacenters = local.datacenters,
+ envs = local.envs,
+ image = var.image,
+ job_name = var.job_name,
+ memory = var.memory,
+ out_aws_access_key_id = var.out_aws_access_key_id,
+ out_aws_secret_access_key = var.out_aws_secret_access_key,
+ out_aws_default_region = var.out_aws_default_region
+ prohibit_overlap = var.prohibit_overlap,
+ time_zone = var.time_zone,
+ type = var.type,
+ use_vault_provider = var.vault_secret.use_vault_provider,
+ vault_kv_policy_name = var.vault_secret.vault_kv_policy_name,
+ vault_kv_path = var.vault_secret.vault_kv_path,
+ vault_kv_field_access_key = var.vault_secret.vault_kv_field_access_key,
+ vault_kv_field_secret_key = var.vault_secret.vault_kv_field_secret_key
+ })
+ detach = false
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/variables.tf b/fdio.infra.terraform/1n_nmd/etl/variables.tf
new file mode 100644
index 0000000000..3c6c12a943
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/variables.tf
@@ -0,0 +1,115 @@
+# Nomad
+variable "datacenters" {
+ description = "Specifies the list of DCs to be considered placing this task."
+ type = list(string)
+ default = ["dc1"]
+}
+
+# ETL
+variable "aws_access_key_id" {
+ description = "AWS access key."
+ type = string
+ default = "aws"
+}
+
+variable "aws_secret_access_key" {
+ description = "AWS secret key"
+ type = string
+ default = "aws"
+}
+
+variable "aws_default_region" {
+ description = "AWS region"
+ type = string
+ default = "aws"
+}
+
+variable "cpu" {
+ description = "Specifies the CPU required to run this task in MHz."
+ type = number
+ default = 10000
+}
+
+variable "cron" {
+ description = "Specifies a cron expression configuring the interval to launch."
+ type = string
+ default = "@daily"
+}
+
+variable "envs" {
+ description = "Specifies ETL environment variables."
+ type = list(string)
+ default = []
+}
+
+variable "image" {
+ description = "Specifies the Docker image to run."
+ type = string
+ default = "pmikus/docker-ubuntu-focal-aws-glue:latest"
+}
+
+variable "job_name" {
+ description = "Specifies a name for the job."
+ type = string
+ default = "etl"
+}
+
+variable "memory" {
+ description = "Specifies the memory required in MB."
+ type = number
+ default = 20000
+}
+
+variable "out_aws_access_key_id" {
+ description = "AWS access key."
+ type = string
+ default = "aws"
+}
+
+variable "out_aws_secret_access_key" {
+ description = "AWS secret key"
+ type = string
+ default = "aws"
+}
+
+variable "out_aws_default_region" {
+ description = "AWS region"
+ type = string
+ default = "aws"
+}
+
+variable "prohibit_overlap" {
+ description = "Specifies if this job should wait until previous completed."
+ type = bool
+ default = true
+}
+
+variable "time_zone" {
+ description = "Specifies the time zone to evaluate the next launch interval."
+ type = string
+ default = "UTC"
+}
+
+variable "type" {
+ description = "Specifies the Nomad scheduler to use."
+ type = string
+ default = "batch"
+}
+
+variable "vault_secret" {
+ type = object({
+ use_vault_provider = bool,
+ vault_kv_policy_name = string,
+ vault_kv_path = string,
+ vault_kv_field_access_key = string,
+ vault_kv_field_secret_key = string
+ })
+ description = "Set of properties to be able to fetch secret from vault."
+ default = {
+ use_vault_provider = false
+ vault_kv_policy_name = "kv"
+ vault_kv_path = "secret/data/etl"
+ vault_kv_field_access_key = "access_key"
+ vault_kv_field_secret_key = "secret_key"
+ }
+}
diff --git a/fdio.infra.terraform/1n_nmd/etl/versions.tf b/fdio.infra.terraform/1n_nmd/etl/versions.tf
new file mode 100644
index 0000000000..a01708f28a
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/versions.tf
@@ -0,0 +1,9 @@
+terraform {
+ required_providers {
+ nomad = {
+ source = "hashicorp/nomad"
+ version = ">= 1.4.16"
+ }
+ }
+ required_version = ">= 1.1.4"
+}