From 54fd337e30acc97434b33a6d0d3c19e4aa3051ab Mon Sep 17 00:00:00 2001
From: Peter Mikus
Date: Tue, 22 Feb 2022 11:00:47 +0100
Subject: feat(uti): etl

Signed-off-by: Peter Mikus
Change-Id: I7cdcdcbf1e4986664d5d48357688185319f67b0c
---
 .../1n_nmd/etl/conf/nomad/etl.hcl.tftpl | 318 +++++++++++++++++++++
 1 file changed, 318 insertions(+)
 create mode 100644 fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl

diff --git a/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl b/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
new file mode 100644
index 0000000000..c1d186f681
--- /dev/null
+++ b/fdio.infra.terraform/1n_nmd/etl/conf/nomad/etl.hcl.tftpl
@@ -0,0 +1,318 @@
job "${job_name}" {
  # The "datacenters" parameter specifies the list of datacenters which should
  # be considered when placing this task. This must be provided.
  datacenters = "${datacenters}"

  # The "type" parameter controls the type of job, which impacts the scheduler's
  # decision on placement. For a full list of job types and their differences,
  # please see the online documentation.
  #
  # https://www.nomadproject.io/docs/jobspec/schedulers
  #
  type = "${type}"

  # The periodic stanza allows a job to run at fixed times, dates, or intervals.
  # The easiest way to think about the periodic scheduler is "Nomad cron" or
  # "distributed cron".
  #
  # https://www.nomadproject.io/docs/job-specification/periodic
  #
  periodic {
    cron = "${cron}"
    prohibit_overlap = "${prohibit_overlap}"
    time_zone = "${time_zone}"
  }

  # The "group" stanza defines a series of tasks that should be co-located on
  # the same Nomad client. Any task within a group will be placed on the same
  # client.
  #
  # https://www.nomadproject.io/docs/job-specification/group
  #
  group "${job_name}-master" {
    # The restart stanza configures a task's behavior on task failure. Restarts
    # happen on the client that is running the task.
    #
    # https://www.nomadproject.io/docs/job-specification/restart
    #
    restart {
      mode = "fail"
    }

    # The constraint allows restricting the set of eligible nodes. Constraints
    # may filter on attributes or client metadata.
    #
    # For more information and examples on the "constraint" stanza, please see
    # the online documentation at:
    #
    # https://www.nomadproject.io/docs/job-specification/constraint
    #
    constraint {
      attribute = "$${attr.cpu.arch}"
      operator = "!="
      value = "arm64"
    }

    constraint {
      attribute = "$${node.class}"
      value = "builder"
    }

    # The "task" stanza creates an individual unit of work, such as a Docker
    # container, web application, or batch processing.
    #
    # https://www.nomadproject.io/docs/job-specification/task.html
    #
    task "${job_name}-trending" {
      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
        destination = "local/etl"
      }

      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "docker"

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        image = "${image}"
        command = "gluesparksubmit"
        args = [
          "--driver-memory", "20g",
          "--executor-memory", "20g",
          "trending.py"
        ]
        work_dir = "/local/etl"
      }

      # The env stanza configures a list of environment variables to populate
      # the task's environment before starting.
      env {
        AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
        AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
        AWS_DEFAULT_REGION = "${aws_default_region}"
        OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
        OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
        ${ envs }
      }

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        cpu = ${cpu}
        memory = ${memory}
      }
    }
    task "${job_name}-stats" {
      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
        destination = "local/etl"
      }

      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "docker"

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        image = "${image}"
        command = "gluesparksubmit"
        args = [
          "--driver-memory", "10g",
          "--executor-memory", "10g",
          "stats.py"
        ]
        work_dir = "/local/etl"
      }

      # The env stanza configures a list of environment variables to populate
      # the task's environment before starting.
      env {
        AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
        AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
        AWS_DEFAULT_REGION = "${aws_default_region}"
        OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
        OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
        ${ envs }
      }

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        cpu = ${cpu}
        memory = ${memory}
      }
    }
  }
  group "${job_name}-rls2202" {
    # The restart stanza configures a task's behavior on task failure. Restarts
    # happen on the client that is running the task.
    #
    # https://www.nomadproject.io/docs/job-specification/restart
    #
    restart {
      mode = "fail"
    }

    # The constraint allows restricting the set of eligible nodes. Constraints
    # may filter on attributes or client metadata.
    #
    # For more information and examples on the "constraint" stanza, please see
    # the online documentation at:
    #
    # https://www.nomadproject.io/docs/job-specification/constraint
    #
    constraint {
      attribute = "$${attr.cpu.arch}"
      operator = "!="
      value = "arm64"
    }

    constraint {
      attribute = "$${node.class}"
      value = "builder"
    }

    # The "task" stanza creates an individual unit of work, such as a Docker
    # container, web application, or batch processing.
    #
    # https://www.nomadproject.io/docs/job-specification/task.html
    #
    task "${job_name}-coverage" {
      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
        destination = "local/etl"
      }

      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "docker"

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        image = "${image}"
        command = "gluesparksubmit"
        args = [
          "--driver-memory", "20g",
          "--executor-memory", "20g",
          "coverage_rls2202.py"
        ]
        work_dir = "/local/etl"
      }

      # The env stanza configures a list of environment variables to populate
      # the task's environment before starting.
      env {
        AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
        AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
        AWS_DEFAULT_REGION = "${aws_default_region}"
        OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
        OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
        ${ envs }
      }

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        cpu = ${cpu}
        memory = ${memory}
      }
    }
    task "${job_name}-iterative" {
      # The artifact stanza instructs Nomad to fetch and unpack a remote
      # resource, such as a file, tarball, or binary.
      #
      # https://www.nomadproject.io/docs/job-specification/artifact
      #
      artifact {
        source = "git::https://github.com/pmikus/glue-etl-pyspark.git"
        destination = "local/etl"
      }

      # The "driver" parameter specifies the task driver that should be used to
      # run the task.
      driver = "docker"

      # The "config" stanza specifies the driver configuration, which is passed
      # directly to the driver to start the task. The details of configurations
      # are specific to each driver, so please see specific driver
      # documentation for more information.
      config {
        image = "${image}"
        command = "gluesparksubmit"
        args = [
          "--driver-memory", "20g",
          "--executor-memory", "20g",
          "iterative_rls2202.py"
        ]
        work_dir = "/local/etl"
      }

      # The env stanza configures a list of environment variables to populate
      # the task's environment before starting.
      env {
        AWS_ACCESS_KEY_ID = "${aws_access_key_id}"
        AWS_SECRET_ACCESS_KEY = "${aws_secret_access_key}"
        AWS_DEFAULT_REGION = "${aws_default_region}"
        OUT_AWS_ACCESS_KEY_ID = "${out_aws_access_key_id}"
        OUT_AWS_SECRET_ACCESS_KEY = "${out_aws_secret_access_key}"
        OUT_AWS_DEFAULT_REGION = "${out_aws_default_region}"
        ${ envs }
      }

      # The "resources" stanza describes the requirements a task needs to
      # execute. Resource requirements include memory, network, cpu, and more.
      # This ensures the task will execute on a machine that contains enough
      # resource capacity.
      #
      # https://www.nomadproject.io/docs/job-specification/resources
      #
      resources {
        cpu = ${cpu}
        memory = ${memory}
      }
    }
  }
}
--
cgit 1.2.3-korg
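For reference, a minimal sketch of how the template above might be rendered and submitted from the calling Terraform module. The local and resource names, the concrete values, and the var.* inputs below are illustrative assumptions, not taken from this commit; the sketch relies only on Terraform's built-in templatefile() function and the Nomad provider's nomad_job resource.

# Illustrative sketch only: render etl.hcl.tftpl and submit the resulting
# jobspec to Nomad. All concrete values below are assumptions.
locals {
  nomad_jobspec_etl = templatefile("${path.module}/conf/nomad/etl.hcl.tftpl", {
    job_name                  = "etl"
    datacenters               = "dc1"      # the template interpolates this inside quotes
    type                      = "batch"
    cron                      = "@daily"
    prohibit_overlap          = true
    time_zone                 = "UTC"
    image                     = "example.org/glue-etl-pyspark:latest"  # hypothetical image
    cpu                       = 10000
    memory                    = 20000
    envs                      = ""         # extra KEY = "value" lines injected verbatim into env {}
    aws_access_key_id         = var.aws_access_key_id
    aws_secret_access_key     = var.aws_secret_access_key
    aws_default_region        = var.aws_default_region
    out_aws_access_key_id     = var.out_aws_access_key_id
    out_aws_secret_access_key = var.out_aws_secret_access_key
    out_aws_default_region    = var.out_aws_default_region
  })
}

# Submit the rendered jobspec through the Nomad Terraform provider.
resource "nomad_job" "etl" {
  jobspec = local.nomad_jobspec_etl
}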